Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
addressed comments v0.1
  • Loading branch information
brkyvz committed Apr 28, 2015
commit 2384266bb5d8307ca17700ad288bd58dac5e27a1
5 changes: 2 additions & 3 deletions core/src/main/scala/org/apache/spark/rdd/RDD.scala
Original file line number Diff line number Diff line change
Expand Up @@ -420,10 +420,9 @@ abstract class RDD[T: ClassTag](
* @return A random sub-sample of the RDD without replacement.
*/
private[spark] def randomSampleWithRange(lb: Double, ub: Double, seed: Long): RDD[T] = {
val random = new Random(seed)
this.mapPartitions { partition =>
this.mapPartitionsWithIndex { case (index, partition) =>
val sampler = new BernoulliCellSampler[T](lb, ub)
sampler.setSeed(random.nextLong)
sampler.setSeed(seed + index)
sampler.sample(partition)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -278,13 +278,6 @@ package object dsl {
/**
 * Filters `logicalPlan` using a single-argument Scala predicate.
 *
 * @param arg1 symbol whose name identifies the column fed to `udf`; it is
 *             referenced as an unresolved attribute
 * @param udf predicate wrapped in a `ScalaUdf` with a `BooleanType` result
 * @return a `Filter` over `logicalPlan` whose condition is the wrapped predicate
 */
def sfilter[T1](arg1: Symbol)(udf: (T1) => Boolean): LogicalPlan = {
  val column = UnresolvedAttribute(arg1.name)
  val predicate = ScalaUdf(udf, BooleanType, Seq(column))
  Filter(predicate, logicalPlan)
}

/**
 * Wraps `logicalPlan` in a `Sample` node.
 *
 * @param lb lower bound forwarded to `Sample`
 * @param ub upper bound forwarded to `Sample`
 * @param withReplacement forwarded to `Sample`; defaults to `true`
 * @param seed forwarded to `Sample`; when omitted, a pseudo-random integer in
 *             [0, 1000) is drawn on each call
 * @return the resulting `Sample` plan
 */
def sample(
    lb: Double,
    ub: Double,
    withReplacement: Boolean = true,
    seed: Int = (math.random * 1000).toInt): LogicalPlan = {
  Sample(lb, ub, withReplacement, seed, logicalPlan)
}

// TODO specify the output column names
def generate(
generator: Generator,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,19 @@ case class Subquery(alias: String, child: LogicalPlan) extends UnaryNode {
override def output: Seq[Attribute] = child.output.map(_.withQualifiers(alias :: Nil))
}

/**
* Sample the dataset.
*
* @param lowerBound Lower-bound of the sampling probability (usually 0.0)
* @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
* will be upperBound - lowerBound.
* @param withReplacement Whether to sample with replacement.
* @param seed the random seed
* @param child the LogicalPlan
*/
case class Sample(
lb: Double,
ub: Double,
lowerBound: Double,
upperBound: Double,
withReplacement: Boolean,
seed: Long,
child: LogicalPlan) extends UnaryNode {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,16 +64,17 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
/**
* :: DeveloperApi ::
* Sample the dataset.
* @param lb Lower-bound of the sampling probability (usually 0.0)
* @param ub Upper-bound of the sampling probability. The expected fraction sampled will be ub - lb.
* @param lowerBound Lower-bound of the sampling probability (usually 0.0)
* @param upperBound Upper-bound of the sampling probability. The expected fraction sampled
* will be upperBound - lowerBound.
* @param withReplacement Whether to sample with replacement.
* @param seed the random seed
* @param child the QueryPlan
*/
@DeveloperApi
case class Sample(
lb: Double,
ub: Double,
lowerBound: Double,
upperBound: Double,
withReplacement: Boolean,
seed: Long,
child: SparkPlan)
Expand Down