[SPARK-34527][SQL] Resolve duplicated common columns from USING/NATURAL JOIN #31666
Changes to `Analyzer` (the `AddMetadataColumns` doc comment):

```diff
@@ -979,7 +979,7 @@ class Analyzer(override val catalogManager: CatalogManager)
    *
    * References to metadata columns are resolved using columns from [[LogicalPlan.metadataOutput]],
    * but the relation's output does not include the metadata columns until the relation is replaced
-   * using [[DataSourceV2Relation.withMetadataColumns()]]. Unless this rule adds metadata to the
-   * relation's output, the analyzer will detect that nothing produces the columns.
+   * with a copy adding them to the output. Unless this rule adds metadata to the relation's output,
+   * the analyzer will detect that nothing produces the columns.
    *
    * This rule only adds metadata columns when a node is resolved but is missing input from its
```
```diff
@@ -988,31 +988,43 @@ class Analyzer(override val catalogManager: CatalogManager)
    * columns are not accidentally selected by *.
    */
   object AddMetadataColumns extends Rule[LogicalPlan] {
-    import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
+    import org.apache.spark.sql.catalyst.util._
+
+    private def getMetadataAttributes(plan: LogicalPlan): Seq[Attribute] = {
+      lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput)
+      plan.expressions.flatMap(_.collect {
+        case a: Attribute if a.isMetadataCol => a
+        case a: Attribute if childMetadataOutput.exists(_.exprId == a.exprId) =>
+          childMetadataOutput.find(_.exprId == a.exprId).get
+      })
+    }
 
     private def hasMetadataCol(plan: LogicalPlan): Boolean = {
+      lazy val childMetadataOutput = plan.children.flatMap(_.metadataOutput)
       plan.expressions.exists(_.find {
-        case a: Attribute => a.isMetadataCol
+        case a: Attribute =>
+          a.isMetadataCol || childMetadataOutput.exists(_.exprId == a.exprId)
         case _ => false
       }.isDefined)
     }
 
     private def addMetadataCol(plan: LogicalPlan): LogicalPlan = plan match {
       case r: DataSourceV2Relation => r.withMetadataColumns()
+      case p: Project => p.copy(
+        projectList = p.metadataOutput ++ p.projectList,
+        child = addMetadataCol(p.child))
       case _ => plan.withNewChildren(plan.children.map(addMetadataCol))
     }
 
     def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperatorsUp {
       case node if node.children.nonEmpty && node.resolved && hasMetadataCol(node) =>
         val inputAttrs = AttributeSet(node.children.flatMap(_.output))
-        val metaCols = node.expressions.flatMap(_.collect {
-          case a: Attribute if a.isMetadataCol && !inputAttrs.contains(a) => a
-        })
+        val metaCols = getMetadataAttributes(node).filterNot(inputAttrs.contains)
         if (metaCols.isEmpty) {
           node
         } else {
           val newNode = addMetadataCol(node)
-          // We should not change the output schema of the plan. We should project away the extr
+          // We should not change the output schema of the plan. We should project away the extra
           // metadata columns if necessary.
           if (newNode.sameOutput(node)) {
             newNode
```
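Taken together, `getMetadataAttributes` and the new `Project` case in `addMetadataCol` let the analyzer resolve a reference to a join key that a USING/NATURAL join de-duplicated, pull it out of a child's `metadataOutput`, and splice it back into the plan without changing the visible schema. A minimal sketch of the resulting behavior (a hypothetical local session and temp views, illustrative only and not a test from this PR):

```scala
import org.apache.spark.sql.SparkSession

object HiddenJoinKeyDemo extends App {
  val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()
  import spark.implicits._

  Seq((1, "a"), (2, "b")).toDF("key", "v1").createOrReplaceTempView("t1")
  Seq((1, "x"), (3, "y")).toDF("key", "v2").createOrReplaceTempView("t2")

  // USING de-duplicates the join key: SELECT * yields (key, v1, v2).
  spark.sql("SELECT * FROM t1 JOIN t2 USING (key)").printSchema()

  // With this change, the qualified keys stay resolvable as hidden columns
  // even though only one `key` appears in the join's output; previously this
  // failed with an unresolved-column error.
  spark.sql("SELECT t1.key, t2.key FROM t1 JOIN t2 USING (key)").show()

  spark.stop()
}
```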
```diff
@@ -3283,6 +3295,59 @@ class Analyzer(override val catalogManager: CatalogManager)
    * Then apply a Project on a normal Join to eliminate natural or using join.
    */
   object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] {
+    private def commonNaturalJoinProcessing(
+        left: LogicalPlan,
+        right: LogicalPlan,
+        joinType: JoinType,
+        joinNames: Seq[String],
+        condition: Option[Expression],
+        hint: JoinHint): LogicalPlan = {
+      import org.apache.spark.sql.catalyst.util._
+
+      val leftKeys = joinNames.map { keyName =>
+        left.output.find(attr => resolver(attr.name, keyName)).getOrElse {
+          throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left")
+        }
+      }
+      val rightKeys = joinNames.map { keyName =>
+        right.output.find(attr => resolver(attr.name, keyName)).getOrElse {
+          throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right")
+        }
+      }
+      val joinPairs = leftKeys.zip(rightKeys)
+
+      val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And)
+
+      // columns not in joinPairs
+      val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att))
+      val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att))
+
+      // the output list looks like: join keys, columns from left, columns from right
+      val (projectList, hiddenList) = joinType match {
+        case LeftOuter =>
+          (leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true)), rightKeys)
+        case LeftExistence(_) =>
+          (leftKeys ++ lUniqueOutput, Seq.empty)
+        case RightOuter =>
+          (rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput, leftKeys)
+        case FullOuter =>
+          // in full outer join, joinCols should be non-null if there is.
+          val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() }
+          (joinedCols ++
+            lUniqueOutput.map(_.withNullability(true)) ++
+            rUniqueOutput.map(_.withNullability(true)),
+            leftKeys ++ rightKeys)
+        case _ : InnerLike =>
+          (leftKeys ++ lUniqueOutput ++ rUniqueOutput, rightKeys)
+        case _ =>
+          sys.error("Unsupported natural join type " + joinType)
+      }
+      // use Project to hide duplicated common keys
+      val project = Project(projectList, Join(left, right, joinType, newCondition, hint))
+      project.setTagValue(project.hiddenOutputTag, hiddenList.map(_.asHiddenCol()))
+      project
+    }
+
     override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp {
       case j @ Join(left, right, UsingJoin(joinType, usingCols), _, hint)
         if left.resolved && right.resolved && j.duplicateResolved =>
```
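In the `FullOuter` branch, both original keys are hidden and the visible key becomes `Coalesce(leftKey, rightKey)`, so it is non-null whenever either side produced a row. Continuing the sketch above (reusing the hypothetical `spark`, `t1`, and `t2`):

```scala
// key = 2 exists only in t1 and key = 3 only in t2, yet the coalesced join
// key is non-null on every row: (1, a, x), (2, b, null), (3, null, y).
spark.sql("SELECT key, v1, v2 FROM t1 FULL OUTER JOIN t2 USING (key)").show()

// The original left/right keys stay reachable as hidden columns; t2.key is
// NULL on the row that exists only in t1.
spark.sql("SELECT key, t1.key, t2.key FROM t1 FULL OUTER JOIN t2 USING (key)").show()
```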
```diff
@@ -3370,54 +3435,6 @@ class Analyzer(override val catalogManager: CatalogManager)
     }
   }
 
-  private def commonNaturalJoinProcessing(
-      left: LogicalPlan,
-      right: LogicalPlan,
-      joinType: JoinType,
-      joinNames: Seq[String],
-      condition: Option[Expression],
-      hint: JoinHint) = {
-    val leftKeys = joinNames.map { keyName =>
-      left.output.find(attr => resolver(attr.name, keyName)).getOrElse {
-        throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, left, "left")
-      }
-    }
-    val rightKeys = joinNames.map { keyName =>
-      right.output.find(attr => resolver(attr.name, keyName)).getOrElse {
-        throw QueryCompilationErrors.unresolvedUsingColForJoinError(keyName, right, "right")
-      }
-    }
-    val joinPairs = leftKeys.zip(rightKeys)
-
-    val newCondition = (condition ++ joinPairs.map(EqualTo.tupled)).reduceOption(And)
-
-    // columns not in joinPairs
-    val lUniqueOutput = left.output.filterNot(att => leftKeys.contains(att))
-    val rUniqueOutput = right.output.filterNot(att => rightKeys.contains(att))
-
-    // the output list looks like: join keys, columns from left, columns from right
-    val projectList = joinType match {
-      case LeftOuter =>
-        leftKeys ++ lUniqueOutput ++ rUniqueOutput.map(_.withNullability(true))
-      case LeftExistence(_) =>
-        leftKeys ++ lUniqueOutput
-      case RightOuter =>
-        rightKeys ++ lUniqueOutput.map(_.withNullability(true)) ++ rUniqueOutput
-      case FullOuter =>
-        // in full outer join, joinCols should be non-null if there is.
-        val joinedCols = joinPairs.map { case (l, r) => Alias(Coalesce(Seq(l, r)), l.name)() }
-        joinedCols ++
-          lUniqueOutput.map(_.withNullability(true)) ++
-          rUniqueOutput.map(_.withNullability(true))
-      case _ : InnerLike =>
-        leftKeys ++ lUniqueOutput ++ rUniqueOutput
-      case _ =>
-        sys.error("Unsupported natural join type " + joinType)
-    }
-    // use Project to trim unnecessary fields
-    Project(projectList, Join(left, right, joinType, newCondition, hint))
-  }
-
   /**
    * Replaces [[UnresolvedDeserializer]] with the deserialization expression that has been resolved
    * to the given input attributes.
```

Review thread on the removed (moved) `commonNaturalJoinProcessing`:

> **Contributor (reviewer):** why do we move this method? It creates a lot of code diff and makes it harder to review.
>
> **Contributor (author):** I can move it back - I just wasn't sure why it lived outside of this class, given that it's not shared.
Changes to `AnalysisHelper`:

```diff
@@ -85,7 +85,7 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan =>
     if (!analyzed) {
       AnalysisHelper.allowInvokingTransformsInAnalyzer {
         val afterRuleOnChildren = mapChildren(_.resolveOperatorsUp(rule))
-        if (self fastEquals afterRuleOnChildren) {
+        val newNode = if (self fastEquals afterRuleOnChildren) {
           CurrentOrigin.withOrigin(origin) {
             rule.applyOrElse(self, identity[LogicalPlan])
           }
@@ -94,6 +94,8 @@ trait AnalysisHelper extends QueryPlan[LogicalPlan] { self: LogicalPlan =>
             rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan])
           }
         }
+        newNode.copyTagsFrom(this)
+        newNode
       }
     } else {
       self
```
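The `hiddenOutputTag` attached by `ResolveNaturalAndUsingJoin` lives outside the node's constructor arguments, so a rule that returns a rewritten node would otherwise drop it silently; `resolveOperatorsUp` now copies tags onto the result. A toy sketch of that failure mode, using hypothetical `Tag`/`Node` stand-ins rather than Spark's actual `TreeNodeTag`/`TreeNode`:

```scala
import scala.collection.mutable

final case class Tag[T](name: String)

final class Node(val name: String) {
  private val tags = mutable.Map.empty[Tag[_], Any]
  def setTagValue[T](tag: Tag[T], value: T): Unit = tags(tag) = value
  def getTagValue[T](tag: Tag[T]): Option[T] = tags.get(tag).map(_.asInstanceOf[T])
  def copyTagsFrom(other: Node): Unit = tags ++= other.tags
}

object TagDemo extends App {
  val hiddenOutput = Tag[Seq[String]]("hiddenOutput")

  val project = new Node("Project")
  project.setTagValue(hiddenOutput, Seq("t2.key"))

  // A rule rewrites the node by constructing a fresh copy...
  val rewritten = new Node("Project")
  assert(rewritten.getTagValue(hiddenOutput).isEmpty) // tag lost!

  // ...so the resolve loop must re-attach tags to the rule's result.
  rewritten.copyTagsFrom(project)
  assert(rewritten.getTagValue(hiddenOutput).contains(Seq("t2.key")))
}
```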
Changes to the logical operators `Project` and `SubqueryAlias`:

```diff
@@ -25,7 +25,8 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning, RoundRobinPartitioning}
-import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.catalyst.trees.TreeNodeTag
+import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.util.random.RandomSampler
@@ -76,6 +77,13 @@ case class Project(projectList: Seq[NamedExpression], child: LogicalPlan)
 
   override lazy val validConstraints: ExpressionSet =
     getAllValidConstraints(projectList)
+
+  val hiddenOutputTag: TreeNodeTag[Seq[Attribute]] = TreeNodeTag[Seq[Attribute]]("hiddenOutput")
+
+  override def metadataOutput: Seq[Attribute] = {
+    child.metadataOutput ++
+      getTagValue(hiddenOutputTag).getOrElse(Seq.empty[Attribute])
+  }
 }
 
 /**
@@ -950,7 +958,7 @@ case class SubqueryAlias(
 
   override def metadataOutput: Seq[Attribute] = {
     val qualifierList = identifier.qualifier :+ alias
-    child.metadataOutput.map(_.withQualifier(qualifierList))
+    child.metadataOutput.filterNot(_.isHiddenCol).map(_.withQualifier(qualifierList))
   }
 
   override def doCanonicalize(): LogicalPlan = child.canonicalized
```
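With these two overrides, a `Project` surfaces its hidden join keys to parent operators through `metadataOutput`, while a `SubqueryAlias` deliberately stops the propagation. Continuing the earlier sketch (same hypothetical views):

```scala
// The hidden right-side key is resolvable directly above the join...
spark.sql("SELECT t2.key FROM t1 JOIN t2 USING (key)").show()

// ...but SubqueryAlias filters hidden columns out of metadataOutput, so the
// duplicated key stops at the alias boundary; only s.key resolves here.
spark.sql("SELECT s.key FROM (SELECT * FROM t1 JOIN t2 USING (key)) AS s").show()
```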
Changes to the catalyst `package object util`:

```diff
@@ -25,7 +25,7 @@ import java.util.concurrent.atomic.AtomicBoolean
 
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.types.{NumericType, StringType}
+import org.apache.spark.sql.types.{MetadataBuilder, NumericType, StringType}
 import org.apache.spark.unsafe.types.UTF8String
 import org.apache.spark.util.Utils
 
@@ -193,4 +193,29 @@ package object util extends Logging {
   def truncatedString[T](seq: Seq[T], sep: String, maxFields: Int): String = {
     truncatedString(seq, "", sep, "", maxFields)
   }
+
+  val METADATA_COL_ATTR_KEY = "__metadata_col"
+
+  implicit class MetadataColumnHelper(attr: Attribute) {
+    def isMetadataCol: Boolean = attr.metadata.contains(METADATA_COL_ATTR_KEY) &&
+      attr.metadata.getBoolean(METADATA_COL_ATTR_KEY)
+  }
+
+  /**
+   * Hidden columns are a type of metadata column that are not propagated through subquery
+   * aliases, and are candidates during qualified star expansions.
+   */
+  val HIDDEN_COL_ATTR_KEY = "__hidden_col"
+
+  implicit class HiddenColumnHelper(attr: Attribute) {
+    def isHiddenCol: Boolean = attr.isMetadataCol &&
+      attr.metadata.contains(HIDDEN_COL_ATTR_KEY) &&
+      attr.metadata.getBoolean(HIDDEN_COL_ATTR_KEY)
+
+    def asHiddenCol(): Attribute = attr.withMetadata(
+      new MetadataBuilder()
+        .withMetadata(attr.metadata)
+        .putBoolean(METADATA_COL_ATTR_KEY, true)
+        .putBoolean(HIDDEN_COL_ATTR_KEY, true)
+        .build()
+    )
+  }
 }
```
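A quick round-trip of the new helpers, as a sketch against catalyst-internal APIs (it only runs inside a Spark build or `spark-shell`; `AttributeReference` and `Attribute.withMetadata` are existing catalyst classes):

```scala
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types.IntegerType

val attr = AttributeReference("key", IntegerType)()
val hidden = attr.asHiddenCol()

// asHiddenCol sets both flags, so every hidden column is also a metadata column.
assert(hidden.isMetadataCol && hidden.isHiddenCol)

// Attributes are immutable: withMetadata returns a copy, the original is unmarked.
assert(!attr.isMetadataCol && !attr.isHiddenCol)
```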