Fix limit codegen.

apache · viirya · Sep 22, 2018 · Sep 24, 2018 · Sep 25, 2018 · Sep 27, 2018
commit a09e60f1e026504657f3de7669eb79cc0b4c2c8c
diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java b/sql/core/src/main/java/org/apache/spark/sql/execution/BufferedRowIterator.java
@@ -38,6 +38,11 @@ public abstract class BufferedRowIterator {
 
   protected int partitionIndex = -1;
 
+  // This indicates whether the query execution should be stopped even if the input rows are still
+  // available. This is used by the limit operator. When it reaches the given number of rows to
+  // limit, this flag is set and the execution should be stopped.
+  protected boolean isStopEarly = false;
+
   public boolean hasNext() throws IOException {
     if (currentRows.isEmpty()) {
       processNext();
@@ -73,14 +78,21 @@ public void append(InternalRow row) {
     currentRows.add(row);
   }
 
+  /**
+   * Sets the flag that tells the query execution to stop early.
+   */
+  public void setStopEarly(boolean value) {
+    isStopEarly = value;
+  }
+
   /**
    * Returns whether this iterator should stop fetching next row from [[CodegenSupport#inputRDDs]].
    *
    * If it returns true, the caller should exit the loop that [[InputAdapter]] generates.
    * This interface is mainly used to limit the number of input rows.
    */
   public boolean stopEarly() {
-    return false;
+    return isStopEarly;
   }
 
   /**

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
@@ -243,7 +243,7 @@ case class HashAggregateExec(
     val aggTime = metricTerm(ctx, "aggTime")
     val beforeAgg = ctx.freshName("beforeAgg")
     s"""
-       | while (!$initAgg && !stopEarly()) {
+       | while (!$initAgg) {
        |   $initAgg = true;
        |   long $beforeAgg = System.nanoTime();
        |   $doAggFuncName();
@@ -723,6 +723,9 @@ case class HashAggregateExec(
        long $beforeAgg = System.nanoTime();
        $doAggFuncName();
        $aggTime.add((System.nanoTime() - $beforeAgg) / 1000000);
+
+       // Reset stop early flag set by previous limit operator
+       setStopEarly(false);
      }
 
      // output the result

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala
@@ -71,21 +71,12 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport {
   }
 
   override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
-    val stopEarly =
-      ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "stopEarly") // init as stopEarly = false
-
-    ctx.addNewFunction("stopEarly", s"""
-      @Override
-      protected boolean stopEarly() {
-        return $stopEarly;
-      }
-    """, inlineToOuterClass = true)
     val countTerm = ctx.addMutableState(CodeGenerator.JAVA_INT, "count") // init as count = 0
     s"""
        | if ($countTerm < $limit) {
        |   $countTerm += 1;
        |   if ($countTerm == $limit) {
-       |     $stopEarly = true;
+       |     setStopEarly(true);
        |   }
        |   ${consume(ctx, input)}
        | }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -556,7 +556,7 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext {
       Seq(Row(1, 2, Seq("a", "b")), Row(3, 2, Seq("c", "c", "d"))))
   }
 
-  test("SPARK-18004 limit + aggregates") {
+  test("SPARK-18528 limit + aggregates") {
     val df = Seq(("a", 1), ("b", 2), ("c", 1), ("d", 5)).toDF("id", "value")
     val limit2Df = df.limit(2)
     checkAnswer(

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2865,6 +2865,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     // The second hash aggregate before local limit outputs 1 record.
     assert(aggNumRecords == 101)
 
+    val aggNoGroupingDF = spark.range(0, 100, 1, 1)
+      .groupBy()
+      .count().limit(1).filter('count > 0)
+    aggNoGroupingDF.collect()
+    val aggNoGroupingNumRecords = aggNoGroupingDF.queryExecution.sparkPlan.collect {
+      case h: HashAggregateExec => h
+    }.map { hashNode =>
+      hashNode.metrics("numOutputRows").value
+    }.sum
+    assert(aggNoGroupingNumRecords == 2)
+
     val filterDF = spark.range(0, 100, 1, 1).filter('id >= 0)
       .selectExpr("id + 1 as id2").limit(1).filter('id > 50)
     filterDF.collect()
@@ -2875,6 +2886,20 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
     }.head
     // RangeNode and FilterNode both output 1 record.
     assert(filterNumRecords == Tuple2(1, 1))
+
+    val twoLimitsDF = spark.range(0, 100, 1, 1)
+      .limit(1)
+      .filter('id >= 0)
+      .selectExpr("id + 1 as id2")
+      .limit(2)
+      .filter('id > 50)
+    twoLimitsDF.collect()
+    val twoLimitsDFNumRecords = twoLimitsDF.queryExecution.sparkPlan.collect {
+      case r: RangeExec => r
+    }.map { rangeNode =>
+      rangeNode.metrics("numOutputRows").value
+    }.head
+    assert(twoLimitsDFNumRecords == 1)
   }
 }