Reynold's comments
sameeragarwal committed Jan 7, 2016
commit 633683219022a3d7fa512bdbc3ea7ae6349fc7e1
@@ -62,6 +62,32 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext {
    }
  }

test("randomSplit on reordered partitions") {
val n = 600
Contributor

Since the tests are run so frequently, I don't think you need to try this many times ... doing it once should be enough.

Member Author

This is just the size of the dataset. We do, however, test for 5 different seeds. Should I just test for 1?

Contributor

yea 1 is fine.

Contributor

also you can just run it twice and make sure the result is deterministic, i.e.

val a = df.randomSplit(...).toSeq.map(_.collect())
val b = df.randomSplit(...).toSeq.map(_.collect())
assert(a == b)

As long as these are Scala collections, I think they will work.
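One subtlety worth noting: collect() returns an Array[Row], and Scala arrays compare by reference, so the equality check is only structural once the collected arrays are converted to Seq (which is what the test below ends up doing). A minimal sketch of the determinism check, with df and the split weights as placeholders:

// Hypothetical sketch: run the same split twice and compare the contents.
// collect() yields Array[Row]; .toSeq makes == compare elements rather than array references.
val a = df.randomSplit(Array[Double](1, 2, 3), seed = 1).toSeq.map(_.collect().toSeq)
val b = df.randomSplit(Array[Double](1, 2, 3), seed = 1).toSeq.map(_.collect().toSeq)
assert(a == b, "randomSplit should be deterministic for a fixed seed")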

Member Author

done

Member Author

Sure, but to be fair, this new test does exercise a new code path (that of inserting a sampling operator after a shuffle).
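For context, the removed HiveSparkSubmitSuite object further down exercises exactly that shape, i.e. a repartition (a shuffle) followed by randomSplit:

val data = sqlContext.range(600).toDF("id").repartition(200, col("id"))  // shuffle
val splits = data.randomSplit(Array[Double](1, 2, 3), seed = 1)          // sampling on top of the shuffle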

Contributor

isn't that the same code path?

Member Author

Once we implement sample pushdown in Catalyst, it shouldn't be: http://research.microsoft.com/pubs/76565/sig99sam.pdf :)

Contributor

What do you mean? The shuffle happens outside of Catalyst, so the optimizer can't push the sample beneath it.

Contributor

To be clear, I'm suggesting removing everything the previous test case already tests, and keeping only:

// Verify that the results are deterministic across multiple runs
val data = sparkContext.parallelize(1 to n, 2).mapPartitions(scala.util.Random.shuffle(_)).toDF("id")
val splits = data.randomSplit(Array[Double](1, 2, 3), seed = 1)
val firstRun = splits.toSeq.map(_.collect().toSeq)
val secondRun = data.randomSplit(Array[Double](1, 2, 3), seed = 1).toSeq.map(_.collect().toSeq)
assert(firstRun == secondRun)

    // This test ensures that randomSplit does not create overlapping splits even when the
    // underlying dataframe (such as the one below) doesn't guarantee a deterministic ordering of
    // rows in each partition.
    val data =
      sparkContext.parallelize(1 to n, 2).mapPartitions(scala.util.Random.shuffle(_)).toDF("id")
    for (seed <- 1 to 5) {
      val splits = data.randomSplit(Array[Double](1, 2, 3), seed)
      assert(splits.length == 3, "wrong number of splits")

      assert(splits.reduce((a, b) => a.unionAll(b)).sort("id").collect().toList ==
        data.sort($"id").collect().toList, "incomplete or wrong split")

      for (id <- splits.indices) {
        assert(splits(id).intersect(splits((id + 1) % splits.length)).collect().isEmpty,
          s"split $id overlaps with split ${(id + 1) % splits.length}")
      }

      val s = splits.map(_.count())
      assert(math.abs(s(0) - 100) < 50) // std = 9.13
      assert(math.abs(s(1) - 200) < 50) // std = 11.55
      assert(math.abs(s(2) - 300) < 50) // std = 12.25
    }
  }
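A quick way to sanity-check the standard deviations quoted in the size assertions: for a fixed seed, each of the n rows falls into split i independently with probability w_i / (1 + 2 + 3), so each split size is binomially distributed with mean n * p and std sqrt(n * p * (1 - p)). In plain Scala (no Spark needed; the weights mirror the ones used in the test):

// Split sizes are Binomial(n, p): mean n * p, std sqrt(n * p * (1 - p)).
val n = 600
val weights = Seq(1.0, 2.0, 3.0)
val total = weights.sum
weights.foreach { w =>
  val p = w / total
  println(f"mean = ${n * p}%.0f, std = ${math.sqrt(n * p * (1 - p))}%.2f")
}
// prints: mean = 100, std = 9.13; mean = 200, std = 11.55; mean = 300, std = 12.25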

test("pearson correlation") {
val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c")
val corr1 = df.stat.corr("a", "b", "pearson")
@@ -129,19 +129,6 @@ class HiveSparkSubmitSuite
    runSparkSubmit(args)
  }

  test("SPARK-12662 fix DataFrame.randomSplit to avoid creating overlapping splits") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)
    val args = Seq(
      "--class", SPARK_12662.getClass.getName.stripSuffix("$"),
      "--name", "SparkSQLConfTest",
      "--master", "local-cluster[2,1,1024]",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--driver-java-options", "-Dderby.system.durability=test",
      unusedJar.toString)
    runSparkSubmit(args)
  }

  // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly.
  // This is copied from org.apache.spark.deploy.SparkSubmitSuite
  private def runSparkSubmit(args: Seq[String]): Unit = {
@@ -385,48 +372,3 @@ object SPARK_11009 extends QueryTest {
    }
  }
}

/**
 * This object is used to test SPARK-12662: https://issues.apache.org/jira/browse/SPARK-12662.
 * This test ensures that [[org.apache.spark.sql.DataFrame.randomSplit]] does not create overlapping
 * splits even when the underlying dataframe doesn't guarantee a deterministic ordering of rows in
 * each partition.
 */
object SPARK_12662 extends QueryTest {
  import org.apache.spark.sql.functions._

  protected var sqlContext: SQLContext = _

  def main(args: Array[String]): Unit = {
    Utils.configTestLog4j("INFO")

    val sparkContext = new SparkContext(
      new SparkConf()
        .set("spark.sql.shuffle.partitions", "100"))

    val hiveContext = new TestHiveContext(sparkContext)
    sqlContext = hiveContext

    try {
      val n = 600
      val data = sqlContext.range(n).toDF("id").repartition(200, col("id"))
      val splits = data.randomSplit(Array[Double](1, 2, 3), seed = 1)
      assert(splits.length == 3, "wrong number of splits")

      assert(splits.reduce((a, b) => a.unionAll(b)).sort("id").collect().toList ==
        data.sort(col("id")).collect().toList, "incomplete or wrong split")

      for (id <- splits.indices) {
        assert(splits(id).intersect(splits((id + 1) % splits.length)).collect().isEmpty,
          s"split $id overlaps with split ${(id + 1) % splits.length}")
      }

      val s = splits.map(_.count())
      assert(math.abs(s(0) - 100) < 50) // std = 9.13
      assert(math.abs(s(1) - 200) < 50) // std = 11.55
      assert(math.abs(s(2) - 300) < 50) // std = 12.25
    } finally {
      sparkContext.stop()
    }
  }
}