-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-13404] [SQL] Create variables for input row when it's actually used #11274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
c92e457
e58c8a6
f6139e6
4faf5f9
4fb0bc8
ef9e8f3
705da3f
ca8fe0f
853502e
1a1452e
62682b2
76ca6c6
ffc9d8c
f431170
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -83,7 +83,10 @@ trait CodegenSupport extends SparkPlan { | |
| this.parent = parent | ||
| ctx.freshNamePrefix = variablePrefix | ||
| waitForSubqueries() | ||
| doProduce(ctx) | ||
| s""" | ||
| |/*** PRODUCE: ${toCommentSafeString(this.simpleString)} */ | ||
| |${doProduce(ctx)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -115,6 +118,39 @@ trait CodegenSupport extends SparkPlan { | |
| parent.consumeChild(ctx, this, input, row) | ||
| } | ||
|
|
||
| /** | ||
| * Returns source code to evaluate all the variables, and clear the code of them, to prevent | ||
| * them to be evaluated twice. | ||
| */ | ||
| protected def evaluateVariables(variables: Seq[ExprCode]): String = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you update the comment for ExprCode.code to specify what it means when it is empty.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| val evaluate = variables.filter(_.code != "").map(_.code.trim).mkString("\n") | ||
| variables.foreach(_.code = "") | ||
| evaluate | ||
| } | ||
|
|
||
| /** | ||
| * Returns source code to evaluate the variables for required attributes, and clear the code | ||
| * of evaluated variables, to prevent them to be evaluated twice.. | ||
| */ | ||
| protected def evaluateRequiredVariables( | ||
| attributes: Seq[Attribute], | ||
| variables: Seq[ExprCode], | ||
| required: AttributeSet): String = { | ||
| var evaluateVars = "" | ||
| variables.zipWithIndex.foreach { case (ev, i) => | ||
| if (ev.code != "" && required.contains(attributes(i))) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @davies I was just reviewing build warnings, and it flags this line. |
||
| evaluateVars += ev.code.trim + "\n" | ||
| ev.code = "" | ||
| } | ||
| } | ||
| evaluateVars | ||
| } | ||
|
|
||
| /** | ||
| * The subset of inputSet those should be evaluated before this plan. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is a good place to document how this whole thing works in a couple of sentences. Something describing that we defer attribute access in the generated function. We access all the attributes needed by the operator at the beginning if it was not already referenced earlier in the pipeline. Might also update the commit message with this since this is what most of the patch is about.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| */ | ||
| def usedInputs: AttributeSet = references | ||
|
|
||
| /** | ||
| * Consume the columns generated from it's child, call doConsume() or emit the rows. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you comment what the semantics are if row == null vs not null? |
||
| */ | ||
|
|
@@ -124,19 +160,22 @@ trait CodegenSupport extends SparkPlan { | |
| input: Seq[ExprCode], | ||
| row: String = null): String = { | ||
| ctx.freshNamePrefix = variablePrefix | ||
| if (row != null) { | ||
| ctx.currentVars = null | ||
| ctx.INPUT_ROW = row | ||
| val evals = child.output.zipWithIndex.map { case (attr, i) => | ||
| BoundReference(i, attr.dataType, attr.nullable).gen(ctx) | ||
| val inputVars = | ||
| if (row != null) { | ||
| ctx.currentVars = null | ||
| ctx.INPUT_ROW = row | ||
| child.output.zipWithIndex.map { case (attr, i) => | ||
| BoundReference(i, attr.dataType, attr.nullable).gen(ctx) | ||
| } | ||
| } else { | ||
| input | ||
| } | ||
| s""" | ||
| | ${evals.map(_.code).mkString("\n")} | ||
| | ${doConsume(ctx, evals)} | ||
| """.stripMargin | ||
| } else { | ||
| doConsume(ctx, input) | ||
| } | ||
| s""" | ||
| | | ||
| |/*** CONSUME: ${toCommentSafeString(this.simpleString)} */ | ||
| |${evaluateRequiredVariables(child.output, inputVars, usedInputs)} | ||
| |${doConsume(ctx, inputVars)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -198,13 +237,9 @@ case class InputAdapter(child: SparkPlan) extends LeafNode with CodegenSupport { | |
| ctx.currentVars = null | ||
| val columns = exprs.map(_.gen(ctx)) | ||
| s""" | ||
| | while ($input.hasNext()) { | ||
| | while (!shouldStop() && $input.hasNext()) { | ||
| | InternalRow $row = (InternalRow) $input.next(); | ||
| | ${columns.map(_.code).mkString("\n").trim} | ||
| | ${consume(ctx, columns).trim} | ||
| | if (shouldStop()) { | ||
| | return; | ||
| | } | ||
| | } | ||
| """.stripMargin | ||
| } | ||
|
|
@@ -345,10 +380,12 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan]) | |
| val colExprs = output.zipWithIndex.map { case (attr, i) => | ||
| BoundReference(i, attr.dataType, attr.nullable) | ||
| } | ||
| val evaluateInputs = evaluateVariables(input) | ||
| // generate the code to create a UnsafeRow | ||
| ctx.currentVars = input | ||
| val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false) | ||
| s""" | ||
| |$evaluateInputs | ||
| |${code.code.trim} | ||
| |append(${code.value}.copy()); | ||
| """.stripMargin | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -39,15 +39,26 @@ case class Project(projectList: Seq[NamedExpression], child: SparkPlan) | |
| child.asInstanceOf[CodegenSupport].produce(ctx, this) | ||
| } | ||
|
|
||
| override def usedInputs: AttributeSet = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? I think we need to have stronger semantics so that this is not necessary. I think if each operator just always ensured the referenced attributes were populated at the start of consume, we don't need this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed offline, this is need to handle the cases that an Attribute could appear twice in |
||
| // only the attributes those are used at least twice should be evaluated before this plan, | ||
| // otherwise we could defer the evaluation until output attribute is actually used. | ||
| val usedExprIds = projectList.flatMap(_.collect { | ||
| case a: Attribute => a.exprId | ||
| }) | ||
| val usedMoreThanOnce = usedExprIds.groupBy(id => id).filter(_._2.size > 1).keySet | ||
| references.filter(a => usedMoreThanOnce.contains(a.exprId)) | ||
| } | ||
|
|
||
| override def doConsume(ctx: CodegenContext, input: Seq[ExprCode]): String = { | ||
| val exprs = projectList.map(x => | ||
| ExpressionCanonicalizer.execute(BindReferences.bindReference(x, child.output))) | ||
| ctx.currentVars = input | ||
| val output = exprs.map(_.gen(ctx)) | ||
| val resultVars = exprs.map(_.gen(ctx)) | ||
| // Evaluation of non-deterministic expressions can't be deferred. | ||
| val nonDeterministicAttrs = projectList.zip(output).filter(!_._1.deterministic).unzip._2 | ||
| s""" | ||
| | ${output.map(_.code).mkString("\n")} | ||
| | | ||
| | ${consume(ctx, output)} | ||
| |${evaluateRequiredVariables(output, resultVars, AttributeSet(nonDeterministicAttrs))} | ||
| |${consume(ctx, resultVars)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
|
|
@@ -89,11 +100,10 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode wit | |
| s"" | ||
| } | ||
| s""" | ||
| | ${eval.code} | ||
| | if ($nullCheck ${eval.value}) { | ||
| | $numOutput.add(1); | ||
| | ${consume(ctx, ctx.currentVars)} | ||
| | } | ||
| |${eval.code} | ||
| |if (!($nullCheck ${eval.value})) continue; | ||
| |$numOutput.add(1); | ||
| |${consume(ctx, ctx.currentVars)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
|
|
@@ -228,15 +238,13 @@ case class Range( | |
| | } | ||
| | } | ||
| | | ||
| | while (!$overflow && $checkEnd) { | ||
| | while (!$overflow && $checkEnd && !shouldStop()) { | ||
| | long $value = $number; | ||
| | $number += ${step}L; | ||
| | if ($number < $value ^ ${step}L < 0) { | ||
| | $overflow = true; | ||
| | } | ||
| | ${consume(ctx, Seq(ev))} | ||
| | | ||
| | if (shouldStop()) return; | ||
| | } | ||
| """.stripMargin | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a high level comment that describes the overall framework? I think the important things to include are:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was imagining something like:
evaluateAttributes(Seq[Expression]) which evaluates all the attribute refernces in the tree that haven't been. This is kind of similar to what you have below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some variables could be generated in the middle of the plan, for example, aggregate, and join, so we can't always use the references of current plan to determine which expression is used or not. So I have two different functions here, we could pass in the used references to the function below.