-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-40382][SQL] Group distinct aggregate expressions by semantically equivalent children in RewriteDistinctAggregates
#37825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
a5a6fc0
0a109d9
38f1f6a
3fa3588
4a40f91
165f558
27dcffe
484ca8e
208fe82
882cdaa
f53136d
9938252
f7d29df
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,10 +213,13 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { | |
| case a: Aggregate if mayNeedtoRewrite(a) => rewrite(a) | ||
| } | ||
|
|
||
| def rewrite(aRaw: Aggregate): Aggregate = { | ||
| def rewrite(aOrig: Aggregate): Aggregate = { | ||
| // make children of distinct aggregations the same if they are different | ||
| // only because of superficial differences. | ||
| val a = getSanitizedAggregate(aRaw) | ||
| // only because of superficial reasons, e.g.: | ||
| // "1 + col1" vs "col1 + 1", both become "1 + col1" | ||
| // or | ||
| // "col1" vs "Col1", both become "col1" | ||
| val a = reduceDistinctAggregateGroups(aOrig) | ||
|
|
||
| val aggExpressions = collectAggregateExprs(a) | ||
| val distinctAggs = aggExpressions.filter(_.isDistinct) | ||
|
|
@@ -248,6 +251,13 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { | |
| } | ||
| val groupByAttrs = groupByMap.map(_._2) | ||
|
|
||
| def patchAggregateFunctionChildren( | ||
| af: AggregateFunction)( | ||
| attrs: Expression => Option[Expression]): AggregateFunction = { | ||
| val newChildren = af.children.map(c => attrs(c).getOrElse(c)) | ||
| af.withNewChildren(newChildren).asInstanceOf[AggregateFunction] | ||
| } | ||
|
|
||
| // Setup unique distinct aggregate children. | ||
| val distinctAggChildren = distinctAggGroups.keySet.flatten.toSeq.distinct | ||
|
||
| val distinctAggChildAttrMap = distinctAggChildren.map(expressionAttributePair) | ||
|
|
@@ -409,14 +419,7 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { | |
| }} | ||
| } | ||
|
|
||
| private def patchAggregateFunctionChildren( | ||
| af: AggregateFunction)( | ||
| attrs: Expression => Option[Expression]): AggregateFunction = { | ||
| val newChildren = af.children.map(c => attrs(c).getOrElse(c)) | ||
| af.withNewChildren(newChildren).asInstanceOf[AggregateFunction] | ||
| } | ||
|
|
||
| private def getSanitizedAggregate(a: Aggregate): Aggregate = { | ||
| private def reduceDistinctAggregateGroups(a: Aggregate): Aggregate = { | ||
| val aggExpressions = collectAggregateExprs(a) | ||
| val distinctAggs = aggExpressions.filter(_.isDistinct) | ||
|
|
||
|
|
@@ -436,6 +439,14 @@ object RewriteDistinctAggregates extends Rule[LogicalPlan] { | |
| (e, funcChildren.find(fc => e.semanticEquals(fc)).getOrElse(e)) | ||
|
||
| }.toMap | ||
|
|
||
| val funcChildrenPatched = funcChildren.map { e => | ||
| funcChildrenLookup.getOrElse(e, e) | ||
| } | ||
|
|
||
| if (funcChildren.distinct.size == funcChildrenPatched.distinct.size) { | ||
| return a; | ||
| } | ||
|
|
||
| val patchedAggExpressions = a.aggregateExpressions.map { e => | ||
| e.transformDown { | ||
| case e: Expression => | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shall we just canonicalize the function inputs when grouping by them? e.g. change
`e.aggregateFunction.children.filter(!_.foldable).toSet` to `ExpressionSet(e.aggregateFunction.children.filter(!_.foldable))`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! I am working on it, just working through some small complications.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I made the change to use `ExpressionSet` and also commented on some of the issues.

I still prefer 'sanitizing' each original function child to use the first semantically equivalent child, in essence creating a new set of "original" children, as it bypasses some complexities (in particular the one where we may lose some of the original children as keys when we group by `ExpressionSet`).