-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-13404] [SQL] Create variables for input row when it's actually used #11274
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
c92e457
e58c8a6
f6139e6
4faf5f9
4fb0bc8
ef9e8f3
705da3f
ca8fe0f
853502e
1a1452e
62682b2
76ca6c6
ffc9d8c
f431170
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -76,7 +76,10 @@ trait CodegenSupport extends SparkPlan { | |
| def produce(ctx: CodegenContext, parent: CodegenSupport): String = { | ||
| this.parent = parent | ||
| ctx.freshNamePrefix = variablePrefix | ||
| doProduce(ctx) | ||
| s""" | ||
| |/*** PRODUCE: ${commentSafe(this.simpleString)} */ | ||
| |${doProduce(ctx)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -108,6 +111,38 @@ trait CodegenSupport extends SparkPlan { | |
| parent.consumeChild(ctx, this, input, row) | ||
| } | ||
|
|
||
| /** | ||
| * Returns source code to evaluate all the variables, and clear the code of them, to prevent | ||
| * them to be evaluated twice. | ||
| */ | ||
| protected def evaluateVariables(variables: Seq[ExprCode]): String = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you update the comment for ExprCode.code to specify what it means when it is empty.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| val evaluate = variables.filter(_.code != "").map(_.code.trim).mkString("\n") | ||
| variables.foreach(_.code = "") | ||
| evaluate | ||
| } | ||
|
|
||
| /** | ||
| * Returns source code to evaluate the variables for required attributes, and clear the code | ||
| * of evaluated variables, to prevent them to be evaluated twice.. | ||
| */ | ||
| protected def evaluateRequiredVariables( | ||
| attributes: Seq[Attribute], | ||
| variables: Seq[ExprCode], | ||
| required: AttributeSet): String = { | ||
| var evaluateVars = "" | ||
| variables.zipWithIndex.foreach { case (ev, i) => | ||
| if (ev.code != "" && required.contains(attributes(i))) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @davies I was just reviewing build warnings, and it flags this line. |
||
| evaluateVars += ev.code.trim + "\n" | ||
| ev.code = "" | ||
| } | ||
| } | ||
| evaluateVars | ||
| } | ||
|
|
||
| protected def commentSafe(s: String): String = { | ||
| s.replace("*/", "\\*\\/").replace("\\u", "\\\\u") | ||
| } | ||
|
|
||
| /** | ||
| * Consume the columns generated from it's child, call doConsume() or emit the rows. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you comment what the semantics are if row == null vs not null? |
||
| */ | ||
|
|
@@ -117,19 +152,22 @@ trait CodegenSupport extends SparkPlan { | |
| input: Seq[ExprCode], | ||
| row: String = null): String = { | ||
| ctx.freshNamePrefix = variablePrefix | ||
| if (row != null) { | ||
| ctx.currentVars = null | ||
| ctx.INPUT_ROW = row | ||
| val evals = child.output.zipWithIndex.map { case (attr, i) => | ||
| BoundReference(i, attr.dataType, attr.nullable).gen(ctx) | ||
| val inputVars = | ||
| if (row != null) { | ||
| ctx.currentVars = null | ||
| ctx.INPUT_ROW = row | ||
| child.output.zipWithIndex.map { case (attr, i) => | ||
| BoundReference(i, attr.dataType, attr.nullable).gen(ctx) | ||
| } | ||
| } else { | ||
| input | ||
| } | ||
| s""" | ||
| | ${evals.map(_.code).mkString("\n")} | ||
| | ${doConsume(ctx, evals)} | ||
| """.stripMargin | ||
| } else { | ||
| doConsume(ctx, input) | ||
| } | ||
| s""" | ||
| | | ||
| |/*** CONSUME: ${commentSafe(this.simpleString)} */ | ||
| |${evaluateRequiredVariables(child.output, inputVars, references)} | ||
| |${doConsume(ctx, inputVars)} | ||
| """.stripMargin | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -183,13 +221,9 @@ case class InputAdapter(child: SparkPlan) extends LeafNode with CodegenSupport { | |
| ctx.currentVars = null | ||
| val columns = exprs.map(_.gen(ctx)) | ||
| s""" | ||
| | while (input.hasNext()) { | ||
| | while (!shouldStop() && input.hasNext()) { | ||
| | InternalRow $row = (InternalRow) input.next(); | ||
| | ${columns.map(_.code).mkString("\n").trim} | ||
| | ${consume(ctx, columns).trim} | ||
| | if (shouldStop()) { | ||
| | return; | ||
| | } | ||
| | } | ||
| """.stripMargin | ||
| } | ||
|
|
@@ -251,7 +285,7 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan]) | |
| } | ||
|
|
||
| /** Codegened pipeline for: | ||
| * ${plan.treeString.trim} | ||
| * ${commentSafe(plan.treeString.trim)} | ||
| */ | ||
| class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator { | ||
|
|
||
|
|
@@ -305,7 +339,7 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan]) | |
| if (row != null) { | ||
| // There is an UnsafeRow already | ||
| s""" | ||
| | currentRows.add($row.copy()); | ||
| |currentRows.add($row.copy()); | ||
| """.stripMargin | ||
| } else { | ||
| assert(input != null) | ||
|
|
@@ -317,13 +351,14 @@ case class WholeStageCodegen(plan: CodegenSupport, children: Seq[SparkPlan]) | |
| ctx.currentVars = input | ||
| val code = GenerateUnsafeProjection.createCode(ctx, colExprs, false) | ||
| s""" | ||
| | ${code.code.trim} | ||
| | currentRows.add(${code.value}.copy()); | ||
| |${evaluateVariables(input)} | ||
| |${code.code.trim} | ||
| |currentRows.add(${code.value}.copy()); | ||
| """.stripMargin | ||
| } else { | ||
| // There is no columns | ||
| s""" | ||
| | currentRows.add(unsafeRow); | ||
| |currentRows.add(unsafeRow); | ||
| """.stripMargin | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a high level comment that describes the overall framework? I think the important things to include are:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I was imagining something like:
evaluateAttributes(Seq[Expression]) which evaluates all the attribute refernces in the tree that haven't been. This is kind of similar to what you have below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some variables could be generated in the middle of the plan, for example, aggregate, and join, so we can't always use the references of current plan to determine which expression is used or not. So I have two different functions here, we could pass in the used references to the function below.