From 3deef6740bfde03f0063ee29b4e61e425c6d0b6c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 14 Nov 2023 15:41:36 +0800 Subject: [PATCH 1/3] group by ordinal should be idempotent --- .../sql/catalyst/analysis/Analyzer.scala | 13 ++++++++++- .../SubstituteUnresolvedOrdinalsSuite.scala | 23 +++++++++++++++++-- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 3a5f60eb376c..570e7f0288a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1999,7 +1999,18 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor throw QueryCompilationErrors.groupByPositionRefersToAggregateFunctionError( index, ordinalExpr) } else { - ordinalExpr + trimAliases(ordinalExpr) match { + // HACK ALERT: If the ordinal expression is also an integer literal, don't use it + // but still keep the ordinal literal. The reason is we may repeatedly + // analyze the plan. Using a different integer literal may lead to + // a repeat GROUP BY ordinal resolution which is wrong. GROUP BY + // constant is meaningless so whatever value does not matter here. + // TODO: GROUP BY ordinal should pull out grouping expressions to a Project, then + // the resolved ordinal expression is always `AttributeReference`. + case Literal(_: Int, IntegerType) => + Literal(index) + case _ => ordinalExpr + } } } else { throw QueryCompilationErrors.groupByPositionRangeError(index, aggs.size) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index b0d7ace646e2..3c29bfc992c8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -17,10 +17,11 @@ package org.apache.spark.sql.catalyst.analysis -import org.apache.spark.sql.catalyst.analysis.TestRelations.testRelation2 +import org.apache.spark.sql.catalyst.analysis.TestRelations.{testRelation, testRelation2} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.internal.SQLConf class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { @@ -67,4 +68,22 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelation2.groupBy(Literal(1), Literal(2))($"a", $"b")) } } + + test("group by ordinal repeated analysis") { + val plan = testRelation.groupBy(Literal(1))(Literal(100).as("a")).analyze + comparePlans( + plan, + testRelation.groupBy(Literal(1))(Literal(100).as("a")) + ) + + val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any)))) + // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. + val copiedPlan = plan.transform { + case _: LocalRelation => testRelationWithData + } + comparePlans( + copiedPlan.analyze, // repeated analysis + testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) + ) + } } From bf34e26d5ea11d3576b709c6203af0a536e62853 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 15 Nov 2023 12:55:57 +0800 Subject: [PATCH 2/3] address comments --- .../org/apache/spark/sql/catalyst/analysis/Analyzer.scala | 5 +++-- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 570e7f0288a2..95b35220dc1a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -2005,8 +2005,9 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor // analyze the plan. Using a different integer literal may lead to // a repeat GROUP BY ordinal resolution which is wrong. GROUP BY // constant is meaningless so whatever value does not matter here. - // TODO: GROUP BY ordinal should pull out grouping expressions to a Project, then - // the resolved ordinal expression is always `AttributeReference`. + // TODO: (SPARK-45932) GROUP BY ordinal should pull out grouping expressions to + // a Project, then the resolved ordinal expression is always + // `AttributeReference`. case Literal(_: Int, IntegerType) => Literal(index) case _ => ordinalExpr diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 3c29bfc992c8..953b2c8bb101 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -69,7 +69,7 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { } } - test("group by ordinal repeated analysis") { + test("SPARK-45920: group by ordinal repeated analysis") { val plan = testRelation.groupBy(Literal(1))(Literal(100).as("a")).analyze comparePlans( plan, From 457d14e90cec34ca894b58d9a87697d2b298b0e0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 15 Nov 2023 14:36:33 +0800 Subject: [PATCH 3/3] fix golden --- .../sql-tests/analyzer-results/group-by-ordinal.sql.out | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out index c8c34a856d49..1bcde5bd367f 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-ordinal.sql.out @@ -61,7 +61,7 @@ Aggregate [a#x, a#x], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] -- !query select a, 1, sum(b) from data group by 1, 2 -- !query analysis -Aggregate [a#x, 1], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] +Aggregate [a#x, 2], [a#x, 1 AS 1#x, sum(b#x) AS sum(b)#xL] +- SubqueryAlias data +- View (`data`, [a#x,b#x]) +- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]