@@ -1016,7 +1016,7 @@ class Analyzer(override val catalogManager: CatalogManager)
      case s: ExposesMetadataColumns => s.withMetadataColumns()
      case p: Project =>
        val newProj = p.copy(
-         projectList = p.metadataOutput ++ p.projectList,
+         projectList = p.projectList ++ p.metadataOutput,

@dongjoon-hyun (Member) commented on Dec 16, 2022:
Does this cause any user-facing query result change? I guess it doesn't; I just want to get your confirmation.

@gengliangwang (Member, Author) commented on Dec 16, 2022:
I don't think so. The analyzer adds the metadata columns so that parent operators can access them. If the metadata columns are not used, they will be pruned.

          child = addMetadataCol(p.child))
        newProj.copyTagsFrom(p)
        newProj
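
As the reply above notes, the columns the analyzer injects here only survive if something actually references them. Below is a rough spark-shell sketch of that user-facing behavior; it is not part of the patch, the table name t_md and its data are made up, and it relies on the hidden _metadata column that file-source tables expose (the same column that appears as _metadata#221 in the plans quoted further down).

// Rough spark-shell sketch (not from the PR); t_md and its contents are invented.
// SELECT * does not return the hidden _metadata column of a file-source table,
// so metadata columns appended to an intermediate Project cannot change a
// user-facing result; if nothing references them, column pruning removes them.
spark.sql("CREATE TABLE t_md (c1 INT) USING parquet")
spark.sql("INSERT INTO t_md VALUES (1)")
spark.sql("SELECT * FROM t_md").printSchema()   // schema should contain only c1
spark.sql("SELECT _metadata.file_path, c1 FROM t_md").printSchema()
// _metadata should appear only because the query references it explicitly.
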
39 changes: 39 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -4588,6 +4588,45 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
sql("SELECT /*+ hash(t2) */ * FROM t1 join t2 on c1 = c2")
}
}

test("SPARK-41538: Metadata column should be appended at the end of project") {
val tableName = "table_1"
val viewName = "view_1"
withTable(tableName) {
withView(viewName) {
sql(s"CREATE TABLE $tableName (a ARRAY<STRING>, s STRUCT<id: STRING>) USING parquet")
val id = "id1"
sql(s"INSERT INTO $tableName values(ARRAY('a'), named_struct('id', '$id'))")
sql(
s"""
|CREATE VIEW $viewName (id)
|AS WITH source AS (
| SELECT * FROM $tableName
|),
|renamed AS (
| SELECT s.id FROM source
|)
|SELECT id FROM renamed
|""".stripMargin)
val query =
s"""
|with foo AS (
| SELECT '$id' as id
|),
|bar AS (
| SELECT '$id' as id
|)
|SELECT
| 1
|FROM foo
|FULL OUTER JOIN bar USING(id)

@gengliangwang (Member, Author) commented on the line above:
The full outer join will trigger the rule AddMetadataColumns to add metadata columns across the whole plan.
Before the rule AddMetadataColumns:

!Filter isnotnull(id#223)
+- Project [coalesce(id#227, id#225) AS id#228]
   +- Join FullOuter, (id#227 = id#225)
      :- Project [coalesce(id#223, id#224) AS id#227]
      :  +- Join FullOuter, (id#223 = id#224)
      :     :- SubqueryAlias foo
      :     :  +- CTERelationRef 2, true, [id#223]
      :     +- SubqueryAlias bar
      :        +- CTERelationRef 3, true, [id#224]
      +- SubqueryAlias spark_catalog.default.view_1
         +- View (`spark_catalog`.`default`.`view_1`, [id#225])
            +- Project [cast(id#226 as string) AS id#225]
               +- WithCTE
                  :- CTERelationDef 4, false
                  :  +- SubqueryAlias source
                  :     +- Project [a#218, s#219]
                  :        +- SubqueryAlias spark_catalog.default.table_1
                  :           +- Relation spark_catalog.default.table_1[a#218,s#219] parquet
                  :- CTERelationDef 5, false
                  :  +- SubqueryAlias renamed
                  :     +- Project [s#219.id AS id#226]
                  :        +- SubqueryAlias source
                  :           +- CTERelationRef 4, true, [a#218, s#219]
                  +- Project [id#226]
                     +- SubqueryAlias renamed
                        +- CTERelationRef 5, true, [id#226]

After the rule:

Filter isnotnull(id#223)
+- Project [id#227, id#225, id#223, id#224, coalesce(id#227, id#225) AS id#228]
   +- Join FullOuter, (id#227 = id#225)
      :- Project [id#223, id#224, coalesce(id#223, id#224) AS id#227]
      :  +- Join FullOuter, (id#223 = id#224)
      :     :- SubqueryAlias foo
      :     :  +- CTERelationRef 2, true, [id#223]
      :     +- SubqueryAlias bar
      :        +- CTERelationRef 3, true, [id#224]
      +- SubqueryAlias spark_catalog.default.view_1
         +- View (`spark_catalog`.`default`.`view_1`, [id#225])
            +- Project [cast(id#226 as string) AS id#225]
               +- WithCTE
                  :- CTERelationDef 4, false
                  :  +- SubqueryAlias source
                  :     +- Project [_metadata#221, a#218, s#219]
                  :        +- SubqueryAlias spark_catalog.default.table_1
                  :           +- Relation spark_catalog.default.table_1[a#218,s#219,_metadata#221] parquet
                  :- CTERelationDef 5, false
                  :  +- SubqueryAlias renamed
                  :     +- Project [s#219.id AS id#226]
                  :        +- SubqueryAlias source
                  :           +- CTERelationRef 4, true, [a#218, s#219]
                  +- Project [id#226]
                     +- SubqueryAlias renamed
                        +- CTERelationRef 5, true, [id#226]

             |FULL OUTER JOIN $viewName USING(id)
             |WHERE foo.id IS NOT NULL
             |""".stripMargin
        checkAnswer(sql(query), Row(1))
      }
    }
  }
}

case class Foo(bar: Option[String])
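
To see where the metadata columns land after this change, one can print the analyzed plan of a query shaped like the one in the test, the same way the before/after plans in the review comment were obtained. The sketch below is illustrative only; it assumes a spark-shell session on a build containing this patch in which view_1 from the test above still exists.

// Sketch only: assumes view_1 from the test above exists in the current catalog.
// queryExecution.analyzed is the plan produced by the analyzer (including the
// AddMetadataColumns rule), so with this change the rewritten Project nodes
// should list the original project list first and any added metadata columns
// (such as _metadata) at the end.
val df = spark.sql(
  """
    |WITH foo AS (SELECT 'id1' AS id),
    |     bar AS (SELECT 'id1' AS id)
    |SELECT 1
    |FROM foo
    |FULL OUTER JOIN bar USING (id)
    |FULL OUTER JOIN view_1 USING (id)
    |WHERE foo.id IS NOT NULL
    |""".stripMargin)
println(df.queryExecution.analyzed.treeString)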