[SPARK-25497][SQL] Limit operation within whole stage codegen should not consume all the inputs #22630
Changes from 9 commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -345,6 +345,61 @@ trait CodegenSupport extends SparkPlan { | |
| * don't require shouldStop() in the loop of producing rows. | ||
| */ | ||
| def needStopCheck: Boolean = parent.needStopCheck | ||
|
|
||
| /** | ||
| * A sequence of checks which evaluate to true if the downstream Limit operators have not yet | ||
| * received enough records to reach the limit. If the current node is a data producing node, it can | ||
| * leverage this information to stop producing data and complete the data flow earlier. Common data | ||
| * producing nodes are leaf nodes like Range and Scan, and blocking nodes like Sort and Aggregate. | ||
| * These checks should be put into the loop condition of the data producing loop. | ||
| */ | ||
| def limitNotReachedChecks: Seq[String] = parent.limitNotReachedChecks | ||
|
|
||
| /** | ||
| * A helper method to generate the data producing loop condition according to the | ||
| * limit-not-reached checks. | ||
| */ | ||
| final def limitNotReachedCond: String = { | ||
| // InputAdapter is also a leaf node. | ||
| val isLeafNode = children.isEmpty || this.isInstanceOf[InputAdapter] | ||
| if (!isLeafNode && !this.isInstanceOf[BlockingOperatorWithCodegen]) { | ||
| val errMsg = "Only leaf nodes and blocking nodes need to call 'limitNotReachedCond' " + | ||
| "in its data producing loop." | ||
| if (Utils.isTesting) { | ||
| throw new IllegalStateException(errMsg) | ||
| } else { | ||
| logWarning(s"[BUG] $errMsg Please open a JIRA ticket to report it.") | ||
| } | ||
| } | ||
| if (parent.limitNotReachedChecks.isEmpty) { | ||
| "" | ||
| } else { | ||
| parent.limitNotReachedChecks.mkString("", " && ", " &&") | ||
|
Member
nit: I am a bit afraid about 64KB Java bytecode overflow by using
||
| } | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * A special kind of operators which support whole stage codegen. Blocking means these operators | ||
| * will consume all the inputs first, before producing output. Typical blocking operators are | ||
| * sort and aggregate. | ||
| */ | ||
| trait BlockingOperatorWithCodegen extends CodegenSupport { | ||
|
|
||
| // Blocking operators usually have some kind of buffer to keep the data before producing them, so | ||
| // they don't need to copy their result even if their child does. | ||
| override def needCopyResult: Boolean = false | ||
|
|
||
| // Blocking operators always consume all the input first, so their upstream operators don't need a | ||
| // stop check. | ||
| override def needStopCheck: Boolean = false | ||
|
|
||
| // Blocking operators need to consume all the inputs before producing any output. This means, a | ||
| // Limit operator after this blocking operator will never reach its limit during the execution of | ||
| // this blocking operator's upstream operators. Here we override this method to return Nil, so | ||
| // that upstream operators will not generate useless conditions (which always evaluate to | ||
| // true) for the Limit operators after this blocking operator. | ||
| override def limitNotReachedChecks: Seq[String] = Nil | ||
| } | ||
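For context, the limit operators themselves are not shown in this diff. A codegen'd limit operator would typically plug into this protocol by prepending its own check to whatever its parent requires. A minimal sketch, assuming it compiles inside Spark's sql/core module where `CodegenSupport` lives; the `countTerm` name and wiring are illustrative assumptions, not taken from this PR:

```scala
import org.apache.spark.sql.execution.CodegenSupport

// Sketch only: a simplified limit operator hooking into limitNotReachedChecks.
// `countTerm` stands for the generated Java counter this operator would register
// in doConsume(); the name is an assumption for illustration.
trait SketchLimitExec extends CodegenSupport {
  def limit: Int

  // Name of the generated mutable variable counting rows emitted so far (assumed).
  protected def countTerm: String

  // Prepend our own "limit not reached yet" condition to the parent's checks, so the
  // data producing node at the bottom of the stage can stop as soon as we have enough rows.
  override def limitNotReachedChecks: Seq[String] =
    s"$countTerm < $limit" +: super.limitNotReachedChecks
}
```

Because `super.limitNotReachedChecks` delegates to the parent, checks from several stacked limits accumulate naturally, and a `BlockingOperatorWithCodegen` in between resets the list to `Nil`.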
|
|
||
|
|
||
|
|
@@ -381,7 +436,7 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp | |
| forceInline = true) | ||
| val row = ctx.freshName("row") | ||
| s""" | ||
| | while ($input.hasNext() && !stopEarly()) { | ||
| | while ($limitNotReachedCond $input.hasNext()) { | ||
| | InternalRow $row = (InternalRow) $input.next(); | ||
| | ${consume(ctx, null, row).trim} | ||
| | if (shouldStop()) return; | ||
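Why `limitNotReachedCond` keeps a trailing `" &&"`: the template above simply concatenates it with `$input.hasNext()`, so the loop header stays valid whether or not any checks exist. A small stand-alone illustration (the variable names are examples, not the actual generated ones):

```scala
// Stand-alone illustration of how limitNotReachedCond composes with the loop header.
// It mirrors the mkString("", " && ", " &&") trick from the diff; names are made up.
object LimitCondComposition {
  private def limitNotReachedCond(checks: Seq[String]): String =
    if (checks.isEmpty) "" else checks.mkString("", " && ", " &&")

  def main(args: Array[String]): Unit = {
    val input = "inputadapter_input_0"

    // With one downstream limit check the loop header becomes:
    println(s"while (${limitNotReachedCond(Seq("count_1 < 5"))} $input.hasNext())")
    // -> while (count_1 < 5 && inputadapter_input_0.hasNext())

    // With no checks (e.g. below a blocking operator) it stays a plain hasNext() loop:
    println(s"while (${limitNotReachedCond(Nil)} $input.hasNext())")
    // -> while ( inputadapter_input_0.hasNext())
  }
}
```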
|
|
@@ -677,6 +732,8 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) | |
|
|
||
| override def needStopCheck: Boolean = true | ||
|
|
||
| override def limitNotReachedChecks: Seq[String] = Nil | ||
|
|
||
| override protected def otherCopyArgs: Seq[AnyRef] = Seq(codegenStageId.asInstanceOf[Integer]) | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,7 +45,7 @@ case class HashAggregateExec( | |
| initialInputBufferOffset: Int, | ||
| resultExpressions: Seq[NamedExpression], | ||
| child: SparkPlan) | ||
| extends UnaryExecNode with CodegenSupport { | ||
| extends UnaryExecNode with BlockingOperatorWithCodegen { | ||
|
|
||
| private[this] val aggregateBufferAttributes = { | ||
| aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) | ||
|
|
@@ -151,14 +151,6 @@ case class HashAggregateExec( | |
| child.asInstanceOf[CodegenSupport].inputRDDs() | ||
| } | ||
|
|
||
| // The result rows come from the aggregate buffer, or a single row(no grouping keys), so this | ||
| // operator doesn't need to copy its result even if its child does. | ||
| override def needCopyResult: Boolean = false | ||
|
|
||
| // Aggregate operator always consumes all the input rows before outputting any result, so we | ||
| // don't need a stop check before aggregating. | ||
| override def needStopCheck: Boolean = false | ||
|
|
||
| protected override def doProduce(ctx: CodegenContext): String = { | ||
| if (groupingExpressions.isEmpty) { | ||
| doProduceWithoutKeys(ctx) | ||
|
|
@@ -705,13 +697,16 @@ case class HashAggregateExec( | |
|
|
||
| def outputFromRegularHashMap: String = { | ||
| s""" | ||
| |while ($iterTerm.next()) { | ||
| |while ($limitNotReachedCond $iterTerm.next()) { | ||
| | UnsafeRow $keyTerm = (UnsafeRow) $iterTerm.getKey(); | ||
| | UnsafeRow $bufferTerm = (UnsafeRow) $iterTerm.getValue(); | ||
| | $outputFunc($keyTerm, $bufferTerm); | ||
| | | ||
| | if (shouldStop()) return; | ||
| |} | ||
| |$iterTerm.close(); | ||
|
Contributor
This is an unrelated change, right? It changes nothing in the generated code, right? Just want to double-check I am not missing something (what changes is that before we were not doing the cleanup in the case of the limit operator, whereas now we do; I see this).
Contributor (Author)
Yes, it's unrelated and is a no-op.
||
| |if ($sorterTerm == null) { | ||
| | $hashMapTerm.free(); | ||
| |} | ||
| """.stripMargin | ||
| } | ||
|
|
||
|
|
@@ -728,11 +723,6 @@ case class HashAggregateExec( | |
| // output the result | ||
| $outputFromFastHashMap | ||
| $outputFromRegularHashMap | ||
|
|
||
| $iterTerm.close(); | ||
| if ($sorterTerm == null) { | ||
| $hashMapTerm.free(); | ||
| } | ||
| """ | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -378,7 +378,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | |
| val numOutput = metricTerm(ctx, "numOutputRows") | ||
|
|
||
| val initTerm = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initRange") | ||
| val number = ctx.addMutableState(CodeGenerator.JAVA_LONG, "number") | ||
| val nextIndex = ctx.addMutableState(CodeGenerator.JAVA_LONG, "nextIndex") | ||
|
|
||
| val value = ctx.freshName("value") | ||
| val ev = ExprCode.forNonNullValue(JavaCode.variable(value, LongType)) | ||
|
|
@@ -397,7 +397,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | |
| // within a batch, while the code in the outer loop is setting batch parameters and updating | ||
| // the metrics. | ||
|
|
||
| // Once number == batchEnd, it's time to progress to the next batch. | ||
| // Once nextIndex == batchEnd, it's time to progress to the next batch. | ||
| val batchEnd = ctx.addMutableState(CodeGenerator.JAVA_LONG, "batchEnd") | ||
|
|
||
| // How many values should still be generated by this range operator. | ||
|
|
@@ -421,13 +421,13 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | |
| | | ||
| | $BigInt st = index.multiply(numElement).divide(numSlice).multiply(step).add(start); | ||
| | if (st.compareTo($BigInt.valueOf(Long.MAX_VALUE)) > 0) { | ||
| | $number = Long.MAX_VALUE; | ||
| | $nextIndex = Long.MAX_VALUE; | ||
| | } else if (st.compareTo($BigInt.valueOf(Long.MIN_VALUE)) < 0) { | ||
| | $number = Long.MIN_VALUE; | ||
| | $nextIndex = Long.MIN_VALUE; | ||
| | } else { | ||
| | $number = st.longValue(); | ||
| | $nextIndex = st.longValue(); | ||
| | } | ||
| | $batchEnd = $number; | ||
| | $batchEnd = $nextIndex; | ||
| | | ||
| | $BigInt end = index.add($BigInt.ONE).multiply(numElement).divide(numSlice) | ||
| | .multiply(step).add(start); | ||
|
|
@@ -440,7 +440,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | |
| | } | ||
| | | ||
| | $BigInt startToEnd = $BigInt.valueOf(partitionEnd).subtract( | ||
| | $BigInt.valueOf($number)); | ||
| | $BigInt.valueOf($nextIndex)); | ||
| | $numElementsTodo = startToEnd.divide(step).longValue(); | ||
| | if ($numElementsTodo < 0) { | ||
| | $numElementsTodo = 0; | ||
|
|
@@ -452,46 +452,73 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | |
|
|
||
| val localIdx = ctx.freshName("localIdx") | ||
| val localEnd = ctx.freshName("localEnd") | ||
| val range = ctx.freshName("range") | ||
| val shouldStop = if (parent.needStopCheck) { | ||
| s"if (shouldStop()) { $number = $value + ${step}L; return; }" | ||
| s"if (shouldStop()) { $nextIndex = $value + ${step}L; return; }" | ||
|
Contributor
In this case we are not very accurate in the metrics, right? I mean, we always say that we are returning a full batch, even though we have consumed fewer rows than a batch. What about updating the metrics before returning? Something like
Contributor (Author)
You are right about the problem, but I'm not going to touch this part in this PR. Note that this PR focuses on limit in whole-stage codegen. Personally I feel it's OK to make the metrics a little inaccurate for better performance; we can discuss it later in other PRs.
Contributor (Author)
BTW I do have a local branch that fixed this problem; I just haven't had time to benchmark it yet. I'll send it out later and let's move the discussion there.
Contributor
I am not sure why you need a benchmark for this (unless you did something different from what I suggested in the earlier comment). In that case it is a single metric update which happens only when stopping; it shouldn't introduce any significant overhead. Am I missing something? Anyway, let's move the discussion to the next PR then, thanks.
Contributor (Author)
Anyway, JVM JIT is mysterious and we need to be super careful when updating this kind of hot loop. That said, I'm not confident about any change to the hot loop without a benchmark.
Contributor
OK, let's get back to this eventually; this is anyway not worse than before.
Member
Sorry for the late comment. It would be good to discuss the details in another PR. First, I agree that benchmarking is necessary. Here are my thoughts.
|
||
| } else { | ||
| "// shouldStop check is eliminated" | ||
| } | ||
| val loopCondition = if (limitNotReachedChecks.isEmpty) { | ||
| "true" | ||
| } else { | ||
| limitNotReachedChecks.mkString(" && ") | ||
|
Member
nit: I am a bit afraid about 64KB Java bytecode overflow by using
Contributor (Author)
This is whole-stage codegen. If bytecode overflow happens, we will fall back
||
| } | ||
|
|
||
| // An overview of the Range processing. | ||
| // | ||
| // For each partition, the Range task needs to produce records from partition start (inclusive) | ||
| // to end (exclusive). For better performance, we separate the partition range into batches, and | ||
| // use 2 loops to produce data. The outer while loop is used to iterate batches, and the inner | ||
| // for loop is used to iterate records inside a batch. | ||
| // | ||
| // `nextIndex` tracks the index of the next record that is going to be consumed, initialized | ||
| // with partition start. `batchEnd` tracks the end index of the current batch, initialized | ||
| // with `nextIndex`. In the outer loop, we first check if `nextIndex == batchEnd`. If it's true, | ||
| // it means the current batch is fully consumed, and we will update `batchEnd` to process the | ||
| // next batch. If `batchEnd` reaches partition end, exit the outer loop. Finally we enter the | ||
| // inner loop. Note that, when we enter inner loop, `nextIndex` must be different from | ||
| // `batchEnd`, otherwise we already exit the outer loop. | ||
| // | ||
| // The inner loop iterates from 0 to `localEnd`, which is calculated by | ||
| // `(batchEnd - nextIndex) / step`. Since `batchEnd` is increased by `nextBatchTodo * step` in | ||
| // the outer loop, and initialized with `nextIndex`, `batchEnd - nextIndex` is always | ||
| // divisible by `step`. The `nextIndex` is increased by `step` during each iteration, and ends | ||
| // up being equal to `batchEnd` when the inner loop finishes. | ||
| // | ||
| // The inner loop can be interrupted if the query has produced at least one result row, so that | ||
| // we don't buffer too many result rows and waste memory. It's ok to interrupt the inner loop, | ||
| // because `nextIndex` will be updated before interrupting. | ||
|
|
||
| s""" | ||
| | // initialize Range | ||
| | if (!$initTerm) { | ||
| | $initTerm = true; | ||
| | $initRangeFuncName(partitionIndex); | ||
| | } | ||
| | | ||
| | while (true) { | ||
| | long $range = $batchEnd - $number; | ||
| | if ($range != 0L) { | ||
| | int $localEnd = (int)($range / ${step}L); | ||
| | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { | ||
| | long $value = ((long)$localIdx * ${step}L) + $number; | ||
| | ${consume(ctx, Seq(ev))} | ||
| | $shouldStop | ||
| | while ($loopCondition) { | ||
| | if ($nextIndex == $batchEnd) { | ||
| | long $nextBatchTodo; | ||
| | if ($numElementsTodo > ${batchSize}L) { | ||
| | $nextBatchTodo = ${batchSize}L; | ||
| | $numElementsTodo -= ${batchSize}L; | ||
| | } else { | ||
| | $nextBatchTodo = $numElementsTodo; | ||
| | $numElementsTodo = 0; | ||
| | if ($nextBatchTodo == 0) break; | ||
| | } | ||
| | $number = $batchEnd; | ||
| | $numOutput.add($nextBatchTodo); | ||
| | $inputMetrics.incRecordsRead($nextBatchTodo); | ||
| | $batchEnd += $nextBatchTodo * ${step}L; | ||
| | } | ||
| | | ||
| | $taskContext.killTaskIfInterrupted(); | ||
| | | ||
| | long $nextBatchTodo; | ||
| | if ($numElementsTodo > ${batchSize}L) { | ||
| | $nextBatchTodo = ${batchSize}L; | ||
| | $numElementsTodo -= ${batchSize}L; | ||
| | } else { | ||
| | $nextBatchTodo = $numElementsTodo; | ||
| | $numElementsTodo = 0; | ||
| | if ($nextBatchTodo == 0) break; | ||
| | int $localEnd = (int)(($batchEnd - $nextIndex) / ${step}L); | ||
|
||
| | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { | ||
| | long $value = ((long)$localIdx * ${step}L) + $nextIndex; | ||
| | ${consume(ctx, Seq(ev))} | ||
| | $shouldStop | ||
| | } | ||
| | $numOutput.add($nextBatchTodo); | ||
| | $inputMetrics.incRecordsRead($nextBatchTodo); | ||
| | | ||
| | $batchEnd += $nextBatchTodo * ${step}L; | ||
| | $nextIndex = $batchEnd; | ||
| | $taskContext.killTaskIfInterrupted(); | ||
| | } | ||
| """.stripMargin | ||
| } | ||
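To make the overview comment above concrete, here is a tiny stand-alone simulation of the batching bookkeeping, with illustrative numbers only (the real operator uses a much larger batch size and emits Java source instead of running Scala):

```scala
// Self-contained trace of the outer/inner loop bookkeeping described above:
// start = 0, step = 2, 10 elements in the partition, batch size of 4 (illustrative only).
object RangeBatchTrace {
  def main(args: Array[String]): Unit = {
    val step = 2L
    val batchSize = 4L
    var numElementsTodo = 10L
    var nextIndex = 0L          // next value to produce, starts at the partition start
    var batchEnd = nextIndex    // end (exclusive) of the current batch
    val produced = scala.collection.mutable.ArrayBuffer.empty[Long]

    var done = false
    while (!done) {
      if (nextIndex == batchEnd) {          // current batch fully consumed: set up the next one
        val nextBatchTodo =
          if (numElementsTodo > batchSize) { numElementsTodo -= batchSize; batchSize }
          else { val last = numElementsTodo; numElementsTodo = 0L; last }
        if (nextBatchTodo == 0L) done = true
        else batchEnd += nextBatchTodo * step
      }
      if (!done) {
        val localEnd = ((batchEnd - nextIndex) / step).toInt
        for (localIdx <- 0 until localEnd) {  // the inner "hot" loop
          produced += localIdx * step + nextIndex
        }
        nextIndex = batchEnd                  // batch finished
      }
    }
    // prints: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18 (batches of 4, 4 and 2 elements)
    println(produced.mkString(", "))
  }
}
```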
|
|
||
Just one thought: since we propagate (correctly) the limitNotReachedChecks to all the children, shall we also enforce that we are calling this on a node which will not propagate the limitNotReachedChecks anymore? We may use the blocking flag proposed in the other comment, maybe. The reason I'd like to do this is to ensure that we are not introducing the same limit condition check more than once, in more than one operator, which would be useless and may cause a (small) perf issue. WDYT?
It's not very useful to enforce that. The consequence is so minor that I don't think it's worth the complexity. I want to have a simple and robust framework for the limit optimization first.
Yes, I agree 100%; that's why I'd like to detect early all the situations that we think are impossible but may happen in corner cases we haven't considered. What I am suggesting here is to enforce and fail for testing only, of course; in production we shouldn't do anything similar.
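Finally, an end-to-end illustration of the behavior this patch targets, as a hypothetical local session (not part of this PR): a limit sitting on top of Range inside one codegen stage should stop the generated loop early instead of producing the whole range.

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo: with this change, the generated Range loop condition includes the
// downstream limit check, so each task should stop after a handful of rows instead of
// iterating over the full billion-row range before the limit kicks in.
object RangeLimitDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("range-limit-demo")
      .getOrCreate()

    // Keeping the limit below an aggregate means it should be planned as a
    // LocalLimit/GlobalLimit pair, with the LocalLimit and Range in one codegen stage.
    spark.range(0, 1000000000L).limit(5).selectExpr("sum(id)").show()

    spark.stop()
  }
}
```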