
Commit 5c27b0d

viirya authored and cloud-fan committed
[SPARK-19355][SQL][FOLLOWUP] Remove the child.outputOrdering check in global limit
## What changes were proposed in this pull request?

This is based on the discussion at https://github.com/apache/spark/pull/16677/files#r212805327. As the SQL standard doesn't mandate that a nested ORDER BY followed by a LIMIT has to respect that ordering clause, this patch removes the `child.outputOrdering` check.

## How was this patch tested?

Unit tests.

Closes apache#22239 from viirya/improve-global-limit-parallelism-followup.

Authored-by: Liang-Chi Hsieh <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 5cdb8a2 commit 5c27b0d

File tree

2 files changed: +15 −5 lines


sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala

Lines changed: 5 additions & 5 deletions
```diff
@@ -122,11 +122,11 @@ case class GlobalLimitExec(limit: Int, child: SparkPlan) extends UnaryExecNode {
       Nil
     }
 
-    // During global limit, try to evenly distribute limited rows across data
-    // partitions. If disabled, scanning data partitions sequentially until reaching limit number.
-    // Besides, if child output has certain ordering, we can't evenly pick up rows from
-    // each parititon.
-    val flatGlobalLimit = sqlContext.conf.limitFlatGlobalLimit && child.outputOrdering == Nil
+    // This is an optimization to evenly distribute limited rows across all partitions.
+    // When enabled, Spark goes to take rows at each partition repeatedly until reaching
+    // limit number. When disabled, Spark takes all rows at first partition, then rows
+    // at second partition ..., until reaching limit number.
+    val flatGlobalLimit = sqlContext.conf.limitFlatGlobalLimit
 
     val shuffled = new ShuffledRowRDD(shuffleDependency)
```
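The two row-picking strategies described in the new comment can be illustrated with a small pure-Scala sketch. This is not the Spark implementation (which operates on `ShuffledRowRDD` partitions); it is a hypothetical model where partitions are plain `Seq`s, written only to contrast the enabled and disabled behavior of the flat global limit.

```scala
// Hypothetical sketch (not Spark API): partitions modeled as Seq[Seq[A]].
object FlatLimitSketch {
  // Flat global limit enabled: take rows from each partition in round-robin
  // passes until `limit` rows are collected, spreading the work evenly.
  def flatLimit[A](partitions: Seq[Seq[A]], limit: Int): Seq[A] = {
    val iters = partitions.map(_.iterator)
    val out = scala.collection.mutable.ArrayBuffer.empty[A]
    var progress = true
    while (out.size < limit && progress) {
      progress = false
      for (it <- iters if out.size < limit && it.hasNext) {
        out += it.next()
        progress = true
      }
    }
    out.toSeq
  }

  // Flat global limit disabled: exhaust the first partition, then the
  // second, ... until `limit` rows are collected.
  def sequentialLimit[A](partitions: Seq[Seq[A]], limit: Int): Seq[A] =
    partitions.flatten.take(limit)
}
```

With partitions `Seq(Seq(1,2,3), Seq(4,5,6), Seq(7,8,9))` and limit 4, `flatLimit` picks one row per partition per pass and returns `Seq(1, 4, 7, 2)`, while `sequentialLimit` drains the first partition first and returns `Seq(1, 2, 3, 4)` — which is why, before this patch, the flat path was guarded against ordered child output.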
sql/core/src/test/scala/org/apache/spark/sql/execution/TakeOrderedAndProjectSuite.scala

Lines changed: 10 additions & 0 deletions
```diff
@@ -22,6 +22,7 @@ import scala.util.Random
 import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.sql.catalyst.dsl.expressions._
 import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
 
@@ -31,10 +32,19 @@ class TakeOrderedAndProjectSuite extends SparkPlanTest with SharedSQLContext {
   private var rand: Random = _
   private var seed: Long = 0
 
+  private val originalLimitFlatGlobalLimit = SQLConf.get.getConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT)
+
   protected override def beforeAll(): Unit = {
     super.beforeAll()
     seed = System.currentTimeMillis()
     rand = new Random(seed)
+
+    // Disable the optimization to make Sort-Limit match `TakeOrderedAndProject` semantics.
+    SQLConf.get.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, false)
+  }
+
+  protected override def afterAll() = {
+    SQLConf.get.setConf(SQLConf.LIMIT_FLAT_GLOBAL_LIMIT, originalLimitFlatGlobalLimit)
   }
 
   private def generateRandomInputData(): DataFrame = {
```
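The test change follows a common save-and-restore pattern: capture the original config value before the suite runs, override it in `beforeAll`, and restore it in `afterAll` so the override can't leak into other suites. A minimal sketch of the same idea, using a toy mutable config in place of the real `SQLConf` (the names here are hypothetical, not Spark API):

```scala
// Toy stand-in for a global config store like SQLConf (hypothetical, not Spark API).
object ToyConf {
  private val settings =
    scala.collection.mutable.Map[String, Boolean]("limitFlatGlobalLimit" -> true)
  def getConf(key: String): Boolean = settings(key)
  def setConf(key: String, value: Boolean): Unit = settings(key) = value
}

object ConfUtil {
  // Save-and-restore: override `key` for the duration of `body`, then put
  // the original value back even if `body` throws.
  def withConf[T](key: String, value: Boolean)(body: => T): T = {
    val original = ToyConf.getConf(key) // save the current value
    ToyConf.setConf(key, value)         // apply the override
    try body
    finally ToyConf.setConf(key, original) // always restore
  }
}
```

The suite in this commit spreads the same logic across `beforeAll`/`afterAll` instead of a `withConf`-style helper because the override must hold for every test in the suite, not just one block.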
