Changes from 1 commit
reuse the FixNullability rule
cloud-fan committed Jan 10, 2019
commit feb57c00508560a08100a9f78c1245cfa80474de
Analyzer.scala
@@ -197,8 +197,8 @@ class Analyzer(
      PullOutNondeterministic),
    Batch("UDF", Once,
      HandleNullInputsForUDF),
-   Batch("FixNullability", Once,
-     FixNullability),
+   Batch("UpdateNullability", Once,
+     UpdateNullability),
    Batch("Subquery", Once,
      UpdateOuterReferences),
    Batch("Cleanup", fixedPoint,
@@ -1821,40 +1821,6 @@ class Analyzer(
}
}

-  /**
-   * Fixes nullability of Attributes in a resolved LogicalPlan by using the nullability of
-   * corresponding Attributes of its children output Attributes. This step is needed because
-   * users can use a resolved AttributeReference in the Dataset API and outer joins
-   * can change the nullability of an AttribtueReference. Without the fix, a nullable column's
-   * nullable field can be actually set as non-nullable, which cause illegal optimization
-   * (e.g., NULL propagation) and wrong answers.
-   * See SPARK-13484 and SPARK-13801 for the concrete queries of this case.
-   */
-  object FixNullability extends Rule[LogicalPlan] {
cloud-fan (Contributor, Author): Move this rule out of the Analyzer, so that it can be used in other places.

Member: +1

-    def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
-      case p if !p.resolved => p // Skip unresolved nodes.
-      case p: LogicalPlan if p.resolved =>
-        val childrenOutput = p.children.flatMap(c => c.output).groupBy(_.exprId).flatMap {
-          case (exprId, attributes) =>
-            // If there are multiple Attributes having the same ExprId, we need to resolve
-            // the conflict of nullable field. We do not really expect this happen.
-            val nullable = attributes.exists(_.nullable)
-            attributes.map(attr => attr.withNullability(nullable))
-        }.toSeq
-        // At here, we create an AttributeMap that only compare the exprId for the lookup
-        // operation. So, we can find the corresponding input attribute's nullability.
-        val attributeMap = AttributeMap[Attribute](childrenOutput.map(attr => attr -> attr))
-        // For an Attribute used by the current LogicalPlan, if it is from its children,
-        // we fix the nullable field by using the nullability setting of the corresponding
-        // output Attribute from the children.
-        p.transformExpressions {
-          case attr: Attribute if attributeMap.contains(attr) =>
-            attr.withNullability(attributeMap(attr).nullable)
-        }
-    }
-  }

  /**
   * Extracts [[WindowExpression]]s from the projectList of a [[Project]] operator and
   * aggregateExpressions of an [[Aggregate]] operator and creates individual [[Window]]
@@ -2848,3 +2814,43 @@ object UpdateOuterReferences extends Rule[LogicalPlan] {
}
}
}

+/**
+ * Updates nullability of Attributes in a resolved LogicalPlan by using the nullability of
+ * the corresponding Attributes in its children's output. This step is needed because
+ * users can use a resolved AttributeReference in the Dataset API, and outer joins
+ * can change the nullability of an AttributeReference. Without this rule, a nullable column's
+ * nullable field can actually be set as non-nullable, which causes illegal optimizations
+ * (e.g., NULL propagation) and wrong answers.
+ * See SPARK-13484 and SPARK-13801 for the concrete queries of this case.
+ *
+ * This rule should be executed again at the end of the optimization phase, as the optimizer
+ * may change some expressions and their nullability as well. See SPARK-21351 for more details.
+ */
+object UpdateNullability extends Rule[LogicalPlan] {
Member: Since Analyzer.scala is too big, let's make this a new file, please.

Member: How about UpdateNullability -> UpdateAttributeNullability?

+  def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
Member: Based on the impl of resolveOperatorsUp, if the plan is analyzed, this rule will not take any effect in the optimizer stage. Right?

cloud-fan (Contributor, Author): "if the plan is analyzed": more precisely, the _analyzed flag is true. This flag will be reset to false if the plan changed (a plan copy happened). If it's true, then the plan has not changed since the last analysis and we don't need to update the nullability.
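A rough sketch of the skip logic under discussion (simplified; the real implementation lives in AnalysisHelper, and the exact shape here is an assumption, not the Spark source):

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Simplified model: resolveOperatorsUp bails out once a plan is flagged as
// analyzed, so re-running an analyzer rule on an unchanged plan is a no-op.
def resolveOperatorsUpSketch(
    plan: LogicalPlan)(rule: PartialFunction[LogicalPlan, LogicalPlan]): LogicalPlan = {
  if (plan.analyzed) {
    plan  // unchanged since the last analysis: return it untouched
  } else {
    // visit children bottom-up, then apply the rule to this node
    val withNewChildren = plan.withNewChildren(
      plan.children.map(c => resolveOperatorsUpSketch(c)(rule)))
    rule.applyOrElse(withNewChildren, identity[LogicalPlan])
  }
}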

+    // Skip unresolved nodes.
+    case p if !p.resolved => p
+    // Skip leaf nodes: they have no children, so there is no nullability to update.
+    case p: LeafNode => p
cloud-fan (Contributor, Author), Dec 27, 2018: This is from UpdateNullabilityInAttributeReferences. Leaf nodes don't have children and no nullability will be updated, so the case below is a no-op for them.
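A quick illustrative check of that point (a sketch, assuming the catalyst dsl imports this PR's test file already uses; LocalRelation is a LeafNode, so the rule returns it unchanged):

import org.apache.spark.sql.catalyst.analysis.UpdateNullability
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

val leaf = LocalRelation('a.int)          // a leaf plan: no children at all
assert(UpdateNullability(leaf) == leaf)   // short-circuits on the LeafNode case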

+    case p: LogicalPlan =>
+      val childrenOutput = p.children.flatMap(c => c.output).groupBy(_.exprId).flatMap {
+        case (exprId, attributes) =>
+          // If there are multiple Attributes having the same ExprId, we need to resolve
+          // the conflict on the nullable field. We do not really expect this to happen.
+          val nullable = attributes.exists(_.nullable)
+          attributes.map(attr => attr.withNullability(nullable))
+      }.toSeq
+      // Here we create an AttributeMap that only compares the exprId for the lookup
+      // operation, so we can find the corresponding input attribute's nullability.
+      val attributeMap = AttributeMap[Attribute](childrenOutput.map(attr => attr -> attr))
+      // For an Attribute used by the current LogicalPlan, if it comes from its children,
+      // fix the nullable field by using the nullability setting of the corresponding
+      // output Attribute from the children.
+      p.transformExpressions {
+        case attr: Attribute if attributeMap.contains(attr) =>
+          attr.withNullability(attributeMap(attr).nullable)
+      }
+  }
+}
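As context for the doc comment above, a minimal sketch of the Dataset pattern this rule guards against (illustrative only; the concrete queries are in SPARK-13484 and SPARK-13801, and the data here is made up):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[1]").appName("nullability-demo").getOrCreate()
import spark.implicits._

val left = Seq(1, 2).toDF("lid")
val right = Seq(1).toDF("rid")

// `rid` is an Int column, hence non-nullable when resolved against `right`.
val ridCol = right("rid")

// After the left outer join there is no match for lid = 2, so `rid` is NULL on
// that row and its nullability must flip to true. If the stale non-nullable
// flag were kept on `ridCol`, NULL propagation could fold `ridCol.isNull` to
// false and return a wrong answer; UpdateNullability re-derives the flag from
// the children's output instead.
left.join(right, left("lid") === right("rid"), "left_outer")
  .select(ridCol, ridCol.isNull)
  .show()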
Optimizer.scala
@@ -179,8 +179,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
      ColumnPruning,
      CollapseProject,
      RemoveNoopOperators) :+
-   Batch("UpdateAttributeReferences", Once,
-     UpdateNullabilityInAttributeReferences) :+
+   Batch("UpdateNullability", Once, UpdateNullability) :+
    // This batch must be executed after the `RewriteSubquery` batch, which creates joins.
    Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers)
  }
@@ -1647,18 +1646,3 @@ object RemoveRepetitionFromGroupExpressions extends Rule[LogicalPlan] {
}
}
}

-/**
- * Updates nullability in [[AttributeReference]]s if nullability is different between
- * non-leaf plan's expressions and the children output.
- */
-object UpdateNullabilityInAttributeReferences extends Rule[LogicalPlan] {
-  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
-    case p if !p.isInstanceOf[LeafNode] =>
-      val nullabilityMap = AttributeMap(p.children.flatMap(_.output).map { x => x -> x.nullable })
-      p transformExpressions {
-        case ar: AttributeReference if nullabilityMap.contains(ar) =>
-          ar.withNullability(nullabilityMap(ar))
-      }
-  }
-}
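One behavioral difference between this removed rule and the unified UpdateNullability: when two children expose the same ExprId with different nullability, the removed rule's AttributeMap simply keeps a single entry, while the unified rule reconciles the conflict first. A toy sketch of that reconciliation step (plain Scala, not the Spark API):

// Toy model: reconcile duplicated ExprIds so that a nullable occurrence in
// any child wins, mirroring the groupBy/exists logic in UpdateNullability.
case class Attr(exprId: Long, nullable: Boolean)

def reconcile(childrenOutput: Seq[Attr]): Seq[Attr] =
  childrenOutput.groupBy(_.exprId).values.flatMap { attrs =>
    val nullable = attrs.exists(_.nullable)  // any nullable sibling wins
    attrs.map(_.copy(nullable = nullable))
  }.toSeq

// reconcile(Seq(Attr(1, nullable = false), Attr(1, nullable = true)))
// returns both occurrences with nullable = true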
UpdateNullabilityInAttributeReferencesSuite.scala
@@ -17,6 +17,7 @@

package org.apache.spark.sql.catalyst.optimizer

+import org.apache.spark.sql.catalyst.analysis.UpdateNullability
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{CreateArray, GetArrayItem}
@@ -25,7 +26,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor


-class UpdateNullabilityInAttributeReferencesSuite extends PlanTest {
+class UpdateNullabilityInOptimizerSuite extends PlanTest {

object Optimizer extends RuleExecutor[LogicalPlan] {
val batches =
@@ -36,8 +37,8 @@ class UpdateNullabilityInAttributeReferencesSuite extends PlanTest {
        SimplifyConditionals,
        SimplifyBinaryComparison,
        SimplifyExtractValueOps) ::
-     Batch("UpdateAttributeReferences", Once,
-       UpdateNullabilityInAttributeReferences) :: Nil
+     Batch("UpdateNullability", Once,
+       UpdateNullability) :: Nil
}

test("update nullability in AttributeReference") {
Member: nit: plz update the comment inside this test: UpdateNullabilityInAttributeReferences -> UpdateNullability

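The test body is elided above; as a hedged sketch of the pattern such a test uses (the exact expressions here are assumptions, while Optimizer.execute and PlanTest.comparePlans come from the surrounding code): optimize an analyzed plan whose expressions get simplified, then compare against the expected plan, attribute nullability included.

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{CreateArray, GetArrayItem}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation

val rel = LocalRelation('a.long.notNull)
// array(a, a)[0] is nullable as written, but SimplifyExtractValueOps rewrites
// it to the non-nullable 'a; the UpdateNullability batch then has to update
// the stale nullable flag on the corresponding attribute.
val original = rel.select(GetArrayItem(CreateArray(Seq('a, 'a)), 0) as "b")
val expected = rel.select('a as "b").analyze
comparePlans(Optimizer.execute(original.analyze), expected)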