[SPARK-20460][SQL] Make it more consistent to handle column name duplication #17758
DataSource.scala

@@ -188,6 +188,9 @@ case class DataSource(
       (dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema",
       sparkSession.sessionState.conf.caseSensitiveAnalysis)
 
+    SchemaUtils.checkSchemaColumnNameDuplication(
+      dataSchema, "datasource", sparkSession.sessionState.conf.caseSensitiveAnalysis)
+
     (dataSchema, partitionSchema)
   }

@@ -334,6 +337,9 @@ case class DataSource(
       "It must be specified manually")
   }
 
+  SchemaUtils.checkSchemaColumnNameDuplication(
+    dataSchema, "datasource", sparkSession.sessionState.conf.caseSensitiveAnalysis)
+
   HadoopFsRelation(
     fileCatalog,
     partitionSchema = fileCatalog.partitionSchema,
JsonDataSource.scala

@@ -59,9 +59,7 @@ abstract class JsonDataSource extends Serializable {
       inputPaths: Seq[FileStatus],
       parsedOptions: JSONOptions): Option[StructType] = {
     if (inputPaths.nonEmpty) {
-      val jsonSchema = infer(sparkSession, inputPaths, parsedOptions)
-      checkConstraints(jsonSchema)
-      Some(jsonSchema)
+      Some(infer(sparkSession, inputPaths, parsedOptions))
     } else {
       None
     }

@@ -71,17 +69,6 @@ abstract class JsonDataSource extends Serializable {
       sparkSession: SparkSession,
       inputPaths: Seq[FileStatus],
       parsedOptions: JSONOptions): StructType
-
-  /** Constraints to be imposed on schema to be stored. */
-  private def checkConstraints(schema: StructType): Unit = {
-    if (schema.fieldNames.length != schema.fieldNames.distinct.length) {
-      val duplicateColumns = schema.fieldNames.groupBy(identity).collect {
-        case (x, ys) if ys.length > 1 => "\"" + x + "\""
-      }.mkString(", ")
-      throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
-        s"cannot save to JSON format")
-    }
-  }
 }
 
 object JsonDataSource {
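The format-specific checkConstraints guard removed above is superseded by the shared SchemaUtils check that DataSource now applies to every file-based source. As a self-contained illustration of the detection idiom being centralized (the object and method names here are made up for the sketch, not part of the patch):

// Standalone sketch of the groupBy-based duplicate detection that moves
// from JsonDataSource.checkConstraints into SchemaUtils.
object DuplicateNameSketch {
  // Returns the column names that occur more than once,
  // optionally folding case first.
  def findDuplicates(names: Seq[String], caseSensitive: Boolean): Seq[String] = {
    val normalized = if (caseSensitive) names else names.map(_.toLowerCase)
    normalized.groupBy(identity).collect {
      case (name, occurrences) if occurrences.length > 1 => name
    }.toSeq
  }

  def main(args: Array[String]): Unit = {
    println(findDuplicates(Seq("a", "A", "b"), caseSensitive = true))  // no duplicates
    println(findDuplicates(Seq("a", "A", "b"), caseSensitive = false)) // "a"
  }
}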
SchemaUtils.scala (new file)

@@ -0,0 +1,51 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types.StructType


/**
 * Utils for handling schemas.
 *
 * TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]].
 */
private[spark] object SchemaUtils {

  def checkSchemaColumnNameDuplication(
      schema: StructType, colType: String, caseSensitiveAnalysis: Boolean = false): Unit = {
    checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis)
  }

  def checkColumnNameDuplication(
      names: Seq[String], colType: String, caseSensitiveAnalysis: Boolean = false): Unit = {
    val colNames = if (caseSensitiveAnalysis) {
      names
    } else {
      names.map(_.toLowerCase)
    }
    if (colNames.distinct.length != colNames.length) {
      val duplicateColumns = colNames.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => "\"" + x + "\""
      }
      throw new AnalysisException(s"Found duplicate column(s) in $colType: " +
        duplicateColumns.mkString(", "))
    }
  }
}
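A minimal REPL-style sketch of how the new helper behaves, based only on the code above (illustrative: the object is private[spark], so real callers sit inside Spark itself):

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.util.SchemaUtils

// Under the default case-insensitive analysis, "a" and "A" collide...
val schema = StructType(
  StructField("a", StringType) ::
  StructField("A", IntegerType) :: Nil)

try {
  SchemaUtils.checkSchemaColumnNameDuplication(schema, "datasource")
} catch {
  // Prints: Found duplicate column(s) in datasource: "a"
  case e: AnalysisException => println(e.getMessage)
}

// ...but the same schema passes when analysis is case sensitive.
SchemaUtils.checkSchemaColumnNameDuplication(
  schema, "datasource", caseSensitiveAnalysis = true)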
SQL test input (new file)

@@ -0,0 +1,5 @@
-- Check name duplication in a regular case
CREATE TABLE t (c STRING, c INT) USING parquet;

-- Check multiple name duplication
CREATE TABLE t (c0 STRING, c1 INT, c1 DOUBLE, c0 INT) USING parquet;
SQL test golden output (new file)

@@ -0,0 +1,20 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 2


-- !query 0
CREATE TABLE t (c STRING, c INT) USING parquet
-- !query 0 schema
struct<>
-- !query 0 output
org.apache.spark.sql.AnalysisException
Found duplicate column(s) in table definition of `t`: "c";


-- !query 1
CREATE TABLE t (c0 STRING, c1 INT, c1 DOUBLE, c0 INT) USING parquet
-- !query 1 schema
struct<>
-- !query 1 output
org.apache.spark.sql.AnalysisException
Found duplicate column(s) in table definition of `t`: "c1", "c0";
DataFrameReaderWriterSuite.scala

@@ -687,4 +687,46 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
     testRead(spark.read.schema(userSchemaString).text(dir, dir), data ++ data, userSchema)
     testRead(spark.read.schema(userSchemaString).text(Seq(dir, dir): _*), data ++ data, userSchema)
   }
+
+  test("SPARK-20460 Check name duplication in schema") {
+    withTempDir { src =>
+      val columnDuplicateSchema = StructType(
+        StructField("a", IntegerType) ::
+        StructField("a", IntegerType) ::
+        Nil)
+
+      // Check CSV format
+      Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(src.toString)
+      val e1 = intercept[AnalysisException] {
+        spark.read.format("csv").schema(columnDuplicateSchema).option("header", false)
+          .load(src.toString)
+      }
+      assert(e1.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))
+
+      // If `inferSchema` is true, the CSV format is duplicate-safe (see SPARK-16896)
+      val df = spark.read.format("csv").option("inferSchema", true).option("header", true)
+        .load(src.toString)
+      checkAnswer(df, Row(1, 1))
+
+      // Check JSON format
+      Seq("""{"a":1, "a":1}""").toDF().coalesce(1).write.mode("overwrite").text(src.toString)
+      val e2 = intercept[AnalysisException] {
+        spark.read.format("json").schema(columnDuplicateSchema).option("header", false)
+          .load(src.toString)
+      }
+      assert(e2.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))
+
+      val e3 = intercept[AnalysisException] {
+        spark.read.format("json").option("inferSchema", true).load(src.toString)
+      }
+      assert(e3.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))
+
+      // Check Parquet format
+      Seq((1, 1)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet(src.toString)
+      val e4 = intercept[AnalysisException] {
+        spark.read.format("parquet").schema(columnDuplicateSchema).load(src.toString)
+      }
+      assert(e4.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))
+    }
+  }
 }
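The test above pins down the user-visible behavior. As a hypothetical spark-shell session (the path is made up, assumed to hold some JSON data):

import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// A user-specified schema with a duplicated column name.
val dup = StructType(
  StructField("a", IntegerType) ::
  StructField("a", IntegerType) :: Nil)

// Any file-based reader now fails with the unified message:
spark.read.schema(dup).json("/tmp/spark-20460-example").show()
// org.apache.spark.sql.AnalysisException:
//   Found duplicate column(s) in datasource: "a";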
HiveMetastoreCatalog.scala

@@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode._
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.SchemaUtils
 
 /**
  * Legacy catalog for interacting with the Hive metastore.

@@ -248,6 +249,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
           if (inferenceMode == INFER_AND_SAVE) {
             updateCatalogSchema(relation.tableMeta.identifier, schema)
           }
+
+          SchemaUtils.checkSchemaColumnNameDuplication(
+            schema, "hive serde table", sparkSession.sessionState.conf.caseSensitiveAnalysis)
+
           (schema, relation.tableMeta.copy(schema = schema))
         case None =>
           logWarning(s"Unable to infer schema for table $tableName from file format " +
Reviewer (on the HiveMetastoreCatalog change): What are we checking here? Both the user-specified schema and the inferred schema should already have been checked.
Author: Yea, I'll remove it (I wrongly reverted this entry, too...).