[SPARK-13306] [SQL] uncorrelated scalar subquery #11190
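For context, an uncorrelated scalar subquery references no columns of the outer query and must produce at most one row with exactly one column, so it can be evaluated once up front and treated as a constant. A minimal sketch of the kind of query this PR enables; the table names `t1`, `t2` and the `sqlContext` handle are illustrative, not taken from this diff:

```scala
// The inner query does not depend on the outer rows, so it can run once in
// the background and its single value is substituted into every output row.
val df = sqlContext.sql(
  """SELECT key, (SELECT max(value) FROM t2) AS max_v
    |FROM t1""".stripMargin)
df.show()
```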
Changes from 1 commit
@@ -115,44 +115,59 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable {

  final def execute(): RDD[InternalRow] = {
    RDDOperationScope.withScope(sparkContext, nodeName, false, true) {
      prepare()
+      waitForSubqueries()
      doExecute()
    }
  }

+  // All the subqueries and their Futures of results.
+  @transient private val queryResults = ArrayBuffer[(ScalarSubquery, Future[Array[InternalRow]])]()
|
|
+  /**
+   * Collects all the subqueries and creates a Future that takes the first two rows of each.
+   */
+  protected def prepareSubqueries(): Unit = {
+    val allSubqueries = expressions.flatMap(_.collect { case e: ScalarSubquery => e })
|
Contributor: We could move this into

Author: It's a little bit different from that; I'd like to duplicate it here.
+    allSubqueries.foreach { e =>
+      val futureResult = Future {
+        // We only need the first row; try to take two rows so we can throw an exception
+        // if more than one row is returned.
+        e.executedPlan.executeTake(2)
+      }(SparkPlan.subqueryExecutionContext)
+      queryResults += e -> futureResult
+    }
+  }
|
|
+  /**
+   * Waits for all the subqueries to finish and updates the results.
+   */
+  protected def waitForSubqueries(): Unit = {
+    // fill in the result of subqueries
+    queryResults.foreach {
+      case (e, futureResult) =>
+        val rows = Await.result(futureResult, Duration.Inf)
+        if (rows.length > 1) {
+          sys.error(s"more than one row returned by a subquery used as an expression:\n${e.plan}")
+        }
+        if (rows.length == 1) {
+          assert(rows(0).numFields == 1, "Analyzer should make sure this only returns one column")
|
Contributor: The analyzer checks this, right?

Contributor: Never mind.
+          e.updateResult(rows(0).get(0, e.dataType))
|
Contributor: Why don't we replace the

Author: The ScalarSubquery could be a class member of the current plan, and that field could be immutable, so we could not replace it.
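The thread above concerns mutating the subquery expression in place instead of rewriting the (immutable) plan tree around it. A minimal sketch of that design with hypothetical names; the real `ScalarSubquery` is a full Spark expression, and this stand-in only shows the mutable-result idea:

```scala
// Hypothetical stand-in: the expression sits inside an immutable plan tree,
// so rather than replacing the node we fill in a mutable field once the
// background job finishes, and evaluation reads it afterwards.
class ScalarSubqueryStandIn {
  @volatile private var result: Any = null
  @volatile private var updated: Boolean = false

  def updateResult(v: Any): Unit = {
    result = v
    updated = true
  }

  def eval(): Any = {
    require(updated, "subquery result was not set before evaluation")
    result
  }
}
```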
+        } else {
+          // No rows were returned; the result should be null.
+          e.updateResult(null)
+        }
+    }
+    queryResults.clear()
+  }
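The three-way semantics above (fail on more than one row, take the single value, or fall back to null on zero rows) can be exercised in isolation. A self-contained sketch of the same submit-then-await pattern using plain Scala futures; every name here is hypothetical, not a Spark API:

```scala
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object SubquerySemanticsSketch {
  // Stand-in for a subquery expression whose result is filled in later.
  final class Scalar {
    var result: Any = null
    def updateResult(v: Any): Unit = result = v
  }

  def main(args: Array[String]): Unit = {
    implicit val ec: ExecutionContext = ExecutionContext.global

    // Each "plan" yields some rows; take at most two, mirroring executeTake(2).
    val plans = Seq(Seq(42), Seq.empty[Int])
    val queryResults = ArrayBuffer[(Scalar, Future[Seq[Int]])]()

    // Submit everything up front so the subqueries run concurrently.
    plans.foreach { p => queryResults += new Scalar -> Future(p.take(2)) }

    // Then block on each result and apply the three-way semantics.
    queryResults.foreach { case (e, f) =>
      val rows = Await.result(f, Duration.Inf)
      if (rows.length > 1) sys.error("more than one row returned by a subquery")
      else if (rows.length == 1) e.updateResult(rows.head)
      else e.updateResult(null) // zero rows: the scalar value is NULL
    }

    queryResults.foreach { case (e, _) => println(e.result) } // 42, then null
  }
}
```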
|
|
  /**
   * Prepare a SparkPlan for execution. It's idempotent.
   */
  final def prepare(): Unit = {
    if (prepareCalled.compareAndSet(false, true)) {
      doPrepare()

-      // collect all the subqueries and submit jobs to execute them in background
-      val queryResults = ArrayBuffer[(ScalarSubquery, Future[Array[InternalRow]])]()
-      val allSubqueries = expressions.flatMap(_.collect { case e: ScalarSubquery => e })
-      allSubqueries.foreach { e =>
-        val futureResult = Future {
-          e.plan.executeTake(2)
-        }(SparkPlan.subqueryExecutionContext)
-        queryResults += e -> futureResult
-      }
|
|
+      prepareSubqueries()
      children.foreach(_.prepare())
-      // fill in the result of subqueries
-      queryResults.foreach {
-        case (e, futureResult) =>
-          val rows = Await.result(futureResult, Duration.Inf)
-          if (rows.length > 1) {
-            sys.error(s"more than one row returned by a subquery used as an expression:\n${e.plan}")
-          }
-          if (rows.length == 1) {
-            assert(rows(0).numFields == 1, "Analyzer should make sure this only returns one column")
-            e.updateResult(rows(0).get(0, e.dataType))
-          } else {
-            // There is no rows returned, the result should be null.
-            e.updateResult(null)
-          }
-      }
    }
  }
|
|
|
|
@@ -73,9 +73,10 @@ trait CodegenSupport extends SparkPlan {
  /**
   * Returns Java source code to process the rows from upstream.
   */
-  def produce(ctx: CodegenContext, parent: CodegenSupport): String = {
+  final def produce(ctx: CodegenContext, parent: CodegenSupport): String = {
    this.parent = parent
    ctx.freshNamePrefix = variablePrefix
+    waitForSubqueries()
|
Contributor: Why is this needed? Shouldn't SparkPlan.execute already call waitForSubqueries?

Author: This is needed for whole-stage codegen; those operators will not call execute().

Contributor: OK, got it. This is fairly hacky ...
    doProduce(ctx)
  }
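Inside a whole-stage-codegen region, a parent operator drives its children through produce/consume rather than calling execute() on them, which is why the wait has to be hooked into produce as well. A toy model of that calling protocol; all names are hypothetical and unrelated to the real CodegenSupport API:

```scala
// Toy produce/consume chain: the parent asks the child to produce code, and
// the child pushes each row's code fragment back up via the parent's consume.
trait ToyCodegen {
  protected var parent: ToyCodegen = _

  final def produce(parent: ToyCodegen): String = {
    this.parent = parent
    // The single entry point for the operator in generated code, so
    // per-operator setup (like waiting on subquery results) belongs here.
    doProduce()
  }

  protected def doProduce(): String
  def consume(rowCode: String): String
}

class ToyScan extends ToyCodegen {
  protected def doProduce(): String = parent.consume("row")
  def consume(rowCode: String): String = rowCode // a leaf never consumes
}

class ToyFilter extends ToyCodegen {
  private val child = new ToyScan
  protected def doProduce(): String = child.produce(this)
  def consume(rowCode: String): String = s"if (predicate($rowCode)) { ... }"
}

// new ToyFilter().produce(null) returns "if (predicate(row)) { ... }"
```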
|
|
@@ -101,7 +102,7 @@ trait CodegenSupport extends SparkPlan {
  /**
   * Consume the columns generated from the current SparkPlan, and call its parent.
   */
-  def consume(ctx: CodegenContext, input: Seq[ExprCode], row: String = null): String = {
+  final def consume(ctx: CodegenContext, input: Seq[ExprCode], row: String = null): String = {
    if (input != null) {
      assert(input.length == output.length)
    }
|
|
Contributor: Nit: subqueries