-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25497][SQL] Limit operation within whole stage codegen should not consume all the inputs #22630
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-25497][SQL] Limit operation within whole stage codegen should not consume all the inputs #22630
Changes from 1 commit
13d882a
0a6c79a
51ce7be
2188b27
e0bc621
dc2dfa5
e61078b
9114107
eac31b2
4fc4301
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -132,11 +132,12 @@ case class SortExec( | |
| // a stop check before sorting. | ||
| override def needStopCheck: Boolean = false | ||
|
|
||
| // Sort is a blocking operator. It needs to consume all the inputs before producing any | ||
| // output. This means, Limit after Sort has no effect to Sort's upstream operators. | ||
| // Here we override this method to return Nil, so that upstream operators will not generate | ||
| // unnecessary conditions (which is always evaluated to false) for the Limit after Sort. | ||
| override def conditionsOfKeepProducingData: Seq[String] = Nil | ||
| // Sort is a blocking operator. It needs to consume all the inputs before producing any output. | ||
| // This means, Limit operator after Sort will never reach its limit during the execution of Sort's | ||
| // upstream operators. Here we override this method to return Nil, so that upstream operators will | ||
| // not generate useless conditions (which are always evaluated to false) for the Limit operators | ||
| // after Sort. | ||
| override def limitNotReachedChecks: Seq[String] = Nil | ||
|
||
|
|
||
| override protected def doProduce(ctx: CodegenContext): String = { | ||
| val needToSort = | ||
|
|
@@ -178,7 +179,7 @@ case class SortExec( | |
| | $needToSort = false; | ||
| | } | ||
| | | ||
| | while ($sortedIterator.hasNext()$keepProducingDataCond) { | ||
| | while ($sortedIterator.hasNext()$limitNotReachedCond) { | ||
| | UnsafeRow $outputRow = (UnsafeRow)$sortedIterator.next(); | ||
| | ${consume(ctx, null, outputRow)} | ||
| | if (shouldStop()) return; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -346,13 +346,24 @@ trait CodegenSupport extends SparkPlan { | |
| */ | ||
| def needStopCheck: Boolean = parent.needStopCheck | ||
|
|
||
| def conditionsOfKeepProducingData: Seq[String] = parent.conditionsOfKeepProducingData | ||
| /** | ||
| * A sequence of checks which evaluate to true if the downstream Limit operators have not yet | |
| * received enough records to reach the limit. If the current node is a data producing node, it can leverage | |
| * this information to stop producing data and complete the data flow earlier. Common data | ||
| * producing nodes are leaf nodes like Range and Scan, and blocking nodes like Sort and Aggregate. | ||
| * These checks should be put into the loop condition of the data producing loop. | ||
| */ | ||
| def limitNotReachedChecks: Seq[String] = parent.limitNotReachedChecks | ||
|
|
||
| final protected def keepProducingDataCond: String = { | ||
| if (parent.conditionsOfKeepProducingData.isEmpty) { | ||
| /** | ||
| * A helper method to generate the data producing loop condition according to the | ||
| * limit-not-reached checks. | ||
| */ | ||
| final def limitNotReachedCond: String = { | ||
| if (parent.limitNotReachedChecks.isEmpty) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just one thought: since we propagate (correctly) the `limitNotReachedChecks` [… the remainder of this sentence, including the concrete proposal, was lost in extraction …] The reason I'd like to do this is to enforce that we are not introducing the same limit condition check more than once, in more than one operator, which would be useless and may cause a (small) perf issue. WDYT?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not very useful to enforce that. The consequence is so minor and I don't think it's worth the complexity. I want to have a simple and robust framework for the limit optimization first.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, I agree 100%; that's why I'd like to detect early all the situations that we don't think are possible but that may happen in corner cases we are not considering. What I am suggesting here is to enforce this, and fail on violation, for testing only, of course; in production we shouldn't do anything similar. |
||
| "" | ||
| } else { | ||
| parent.conditionsOfKeepProducingData.mkString(" && ", " && ", "") | ||
| parent.limitNotReachedChecks.mkString(" && ", " && ", "") | ||
|
||
| } | ||
| } | ||
| } | ||
|
|
@@ -391,7 +402,7 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp | |
| forceInline = true) | ||
| val row = ctx.freshName("row") | ||
| s""" | ||
| | while ($input.hasNext()$keepProducingDataCond) { | ||
| | while ($input.hasNext()$limitNotReachedCond) { | ||
| | InternalRow $row = (InternalRow) $input.next(); | ||
| | ${consume(ctx, null, row).trim} | ||
| | if (shouldStop()) return; | ||
|
|
@@ -687,7 +698,7 @@ case class WholeStageCodegenExec(child: SparkPlan)(val codegenStageId: Int) | |
|
|
||
| override def needStopCheck: Boolean = true | ||
|
|
||
| override def conditionsOfKeepProducingData: Seq[String] = Nil | ||
| override def limitNotReachedChecks: Seq[String] = Nil | ||
|
|
||
| override protected def otherCopyArgs: Seq[AnyRef] = Seq(codegenStageId.asInstanceOf[Integer]) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can put
`limitNotReachedCond` as the first condition to avoid possible buffering of a row.