@@ -22,6 +22,7 @@ import java.io.{File, IOException, ObjectInputStream, ObjectOutputStream}
2222import scala .collection .JavaConverters ._
2323import scala .collection .mutable .{ArrayBuffer , HashMap }
2424import scala .reflect .ClassTag
25+ import scala .util .Random
2526
2627import com .esotericsoftware .kryo .KryoException
2728import org .apache .hadoop .io .{LongWritable , Text }
@@ -326,7 +327,10 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
326327 assert(repartitioned2.collect().toSet === (1 to 1000 ).toSet)
327328 }
328329
329- test(" repartitioned RDDs perform load balancing" ) {
330+ // Ignore this test case since round-robin partitioning can produce incorrect results in some
331+ // cases, as discussed in SPARK-23207 and SPARK-23243. Re-enable it once the issue has been
332+ // resolved.
333+ ignore(" repartitioned RDDs perform load balancing" ) {
330334 // Coalesce partitions
331335 val input = Array .fill(1000 )(1 )
332336 val initialPartitions = 10
@@ -361,6 +365,30 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
361365 testSplitPartitions(Array .fill(1000 )(1 ), 250 , 128 )
362366 }
363367
test("SPARK-23243: Make repartition() generate consistent output") {
  /**
   * Asserts that `repartition(111)` puts the same set of elements into each output partition
   * regardless of the order of the rows inside every input partition.
   *
   * The RDD is repartitioned twice: once after randomly shuffling the rows of each input
   * partition and once as-is. The two results must have the same number of partitions and,
   * partition by partition, the same set of elements.
   *
   * @param rdd input to repartition; it is persisted for the duration of the check and
   *            unpersisted afterwards.
   */
  def assertConsistency(rdd: RDD[Any]): Unit = {
    // Persist so that both pipelines below are intended to read the same materialized data.
    // This matters for the randomly generated input, which is not reproducible if recomputed.
    rdd.persist()
    try {
      val partitions1 = rdd.mapPartitions { iter =>
        Random.shuffle(iter)
      }.repartition(111).collectPartitions()
      val partitions2 = rdd.repartition(111).collectPartitions()
      assert(partitions1.size === partitions2.size)
      // Compare partition by partition so a failure reports which output partition diverged.
      partitions1.zip(partitions2).zipWithIndex.foreach { case ((shuffled, plain), index) =>
        assert(shuffled.toSet === plain.toSet,
          s"output partition $index differs between shuffled and unshuffled input")
      }
    } finally {
      // Release the cached blocks; the SparkContext is shared with the other tests of the suite.
      rdd.unpersist()
    }
  }

  // repartition() should generate consistent output.
  assertConsistency(sc.parallelize(1 to 10000, 10))

  // case when input contains duplicated values.
  assertConsistency(sc.parallelize(1 to 10000, 10).map(i => Random.nextInt(1000)))

  // case when input contains null values.
  assertConsistency(sc.parallelize(1 to 100, 10).map(i => if (i % 2 == 0) null else i))
}
391+
364392 test(" coalesced RDDs" ) {
365393 val data = sc.parallelize(1 to 10 , 10 )
366394
0 commit comments