Changes from 1 commit. Commits (27):
1e647ee - Share code to check column name duplication (maropu, Apr 25, 2017)
4467077 - Apply reviews (maropu, Jun 13, 2017)
33ab217 - Make code more consistent (maropu, Jun 13, 2017)
d8efb9d - Apply review comments (maropu, Jun 16, 2017)
11d1818 - Apply xiao's reviews (maropu, Jun 16, 2017)
22e1e4f - Apply more xiao's reviews (maropu, Jun 17, 2017)
743a069 - Replace map with foreach (maropu, Jun 20, 2017)
f6eab2d - Add tests for data schema + partition schema (maropu, Jun 20, 2017)
09da8d6 - Drop name duplication checks in HiveMetastoreCatalog.scala (maropu, Jun 20, 2017)
6d03f31 - Modify exception messages (maropu, Jun 20, 2017)
a0b9b05 - Revert logic to check name duplication (maropu, Jun 20, 2017)
91b6424 - Add tests for write paths (maropu, Jun 21, 2017)
37ad3f3 - Add tests for stream sink paths (maropu, Jun 21, 2017)
d0d9d3e - Brush up code and add more tests (maropu, Jun 25, 2017)
cbe9c71 - Apply reviews (maropu, Jun 26, 2017)
c69270f - Apply more comments (maropu, Jun 27, 2017)
af959f6 - Add more tests in create.sql (maropu, Jun 27, 2017)
8d3e10a - Move duplication checks in constructor (maropu, Jun 29, 2017)
9b386d5 - Brush up code (maropu, Jun 30, 2017)
a878510 - [WIP] Add DataSourceValidator trait to validate schema in write path (maropu, Jul 3, 2017)
be20127 - Revert "Brush up code" (maropu, Jul 3, 2017)
f41bf80 - Fix more issues (maropu, Jul 4, 2017)
0526391 - Revert DataSourceValidator (maropu, Jul 4, 2017)
9e199bc - Add the check for external relation providers (maropu, Jul 4, 2017)
1ae132d - [WIP] Handle DataSource name duplication in one place (maropu, Jul 5, 2017)
5c29a75 - Fix more (maropu, Jul 6, 2017)
5ed2c0d - Move some tests to DDLSuite (maropu, Jul 7, 2017)
Commit 1e647ee9c779d5d60fcadfe70a3dac05d48d8ccc: Share code to check column name duplication
maropu committed Jul 6, 2017
@@ -188,6 +188,9 @@ case class DataSource(
(dataSchema ++ partitionSchema).map(_.name), "in the data schema and the partition schema",
sparkSession.sessionState.conf.caseSensitiveAnalysis)

SchemaUtils.checkSchemaColumnNameDuplication(
dataSchema, "datasource", sparkSession.sessionState.conf.caseSensitiveAnalysis)

(dataSchema, partitionSchema)
}

@@ -334,6 +337,9 @@ case class DataSource(
"It must be specified manually")
}

SchemaUtils.checkSchemaColumnNameDuplication(
  dataSchema, "datasource", sparkSession.sessionState.conf.caseSensitiveAnalysis)

Contributor: what are we checking here? both user-specified schema and inferred schema should have been checked
Member Author (maropu): yea, I'll remove it (I wrongly reverted this entry, too...)
Member: The same here.

HadoopFsRelation(
fileCatalog,
partitionSchema = fileCatalog.partitionSchema,
@@ -59,9 +59,7 @@ abstract class JsonDataSource extends Serializable {
inputPaths: Seq[FileStatus],
parsedOptions: JSONOptions): Option[StructType] = {
if (inputPaths.nonEmpty) {
val jsonSchema = infer(sparkSession, inputPaths, parsedOptions)
checkConstraints(jsonSchema)
Some(jsonSchema)
Some(infer(sparkSession, inputPaths, parsedOptions))

Contributor: don't need to check duplication here?
Member Author (maropu): I think we do not need to check the duplication in each datasource because it will be checked in DataSource: https://github.com/apache/spark/pull/17758/files#diff-7a6cb188d2ae31eb3347b5629a679cecR187

} else {
None
}
@@ -71,17 +69,6 @@
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
parsedOptions: JSONOptions): StructType

/** Constraints to be imposed on schema to be stored. */
private def checkConstraints(schema: StructType): Unit = {
if (schema.fieldNames.length != schema.fieldNames.distinct.length) {
val duplicateColumns = schema.fieldNames.groupBy(identity).collect {
case (x, ys) if ys.length > 1 => "\"" + x + "\""
}.mkString(", ")
throw new AnalysisException(s"Duplicate column(s) : $duplicateColumns found, " +
s"cannot save to JSON format")
}
}
}

object JsonDataSource {
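To make the author's point concrete, the following sketch (not part of the PR; the session setup and temp path are hypothetical) shows that a duplicate user-specified schema on a JSON read is still rejected even though checkConstraints is gone from JsonDataSource - the shared check in DataSource now raises the error, with the message the tests later in this PR assert on:

import java.nio.file.Files
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

object DuplicateJsonSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()

    // A user-specified schema with two columns named "a".
    val dupSchema = StructType(
      StructField("a", IntegerType) ::
      StructField("a", IntegerType) :: Nil)

    // Write one small JSON file so the read path actually resolves.
    val dir = Files.createTempDirectory("dup-json")
    Files.write(dir.resolve("data.json"), """{"a": 1}""".getBytes("UTF-8"))

    try {
      // JsonDataSource no longer checks duplication itself; the shared
      // check in DataSource rejects the schema during relation resolution.
      spark.read.schema(dupSchema).json(dir.toString)
    } catch {
      case e: AnalysisException =>
        println(e.getMessage) // Found duplicate column(s) in datasource: "a"
    }
  }
}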
@@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.InsertableRelation
import org.apache.spark.sql.types.{AtomicType, StructType}
import org.apache.spark.sql.util.SchemaUtils

/**
* Tries to replace [[UnresolvedRelation]]s if the plan is for a direct query on files.
@@ -222,12 +223,10 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi
}

private def normalizeCatalogTable(schema: StructType, table: CatalogTable): CatalogTable = {
val columnNames = if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
schema.map(_.name)
} else {
schema.map(_.name.toLowerCase)
}
checkDuplication(columnNames, "table definition of " + table.identifier)
SchemaUtils.checkSchemaColumnNameDuplication(
schema,
"table definition of " + table.identifier,
sparkSession.sessionState.conf.caseSensitiveAnalysis)

val normalizedPartCols = normalizePartitionColumns(schema, table)
val normalizedBucketSpec = normalizeBucketSpec(schema, table)
@@ -253,7 +252,10 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi
partCols = table.partitionColumnNames,
resolver = sparkSession.sessionState.conf.resolver)

checkDuplication(normalizedPartitionCols, "partition")
SchemaUtils.checkColumnNameDuplication(
normalizedPartitionCols,
"partition",
sparkSession.sessionState.conf.caseSensitiveAnalysis)

if (schema.nonEmpty && normalizedPartitionCols.length == schema.length) {
if (DDLUtils.isHiveTable(table)) {
@@ -283,8 +285,16 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi
tableCols = schema.map(_.name),
bucketSpec = bucketSpec,
resolver = sparkSession.sessionState.conf.resolver)
checkDuplication(normalizedBucketSpec.bucketColumnNames, "bucket")
checkDuplication(normalizedBucketSpec.sortColumnNames, "sort")

val caseSensitiveAnalysis = sparkSession.sessionState.conf.caseSensitiveAnalysis
SchemaUtils.checkColumnNameDuplication(
normalizedBucketSpec.bucketColumnNames,
"bucket",
caseSensitiveAnalysis)
SchemaUtils.checkColumnNameDuplication(
normalizedBucketSpec.sortColumnNames,
"sort",
caseSensitiveAnalysis)

normalizedBucketSpec.sortColumnNames.map(schema(_)).map(_.dataType).foreach {
case dt if RowOrdering.isOrderable(dt) => // OK
@@ -297,15 +307,6 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi
}
}

private def checkDuplication(colNames: Seq[String], colType: String): Unit = {
if (colNames.distinct.length != colNames.length) {
val duplicateColumns = colNames.groupBy(identity).collect {
case (x, ys) if ys.length > 1 => x
}
failAnalysis(s"Found duplicate column(s) in $colType: ${duplicateColumns.mkString(", ")}")
}
}

private def failAnalysis(msg: String) = throw new AnalysisException(msg)
}

51 changes: 51 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/util/SchemaUtil.scala
@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types.StructType


/**
* Utils for handling schemas.
*
* TODO: Merge this file with [[org.apache.spark.ml.util.SchemaUtils]].
*/
private[spark] object SchemaUtils {

Contributor: Rename the file (SchemaUtil) also as SchemaUtils?

def checkSchemaColumnNameDuplication(
schema: StructType, colType: String, caseSensitiveAnalysis: Boolean = false): Unit = {
checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis)
}

def checkColumnNameDuplication(
names: Seq[String], colType: String, caseSensitiveAnalysis: Boolean = false): Unit = {
val colNames = if (caseSensitiveAnalysis) {

Contributor (@wzhfy, Jun 11, 2017): We also need to add tests for case sensitivity.

names
} else {
names.map(_.toLowerCase)
}
if (colNames.distinct.length != colNames.length) {
val duplicateColumns = colNames.groupBy(identity).collect {
case (x, ys) if ys.length > 1 => "\"" + x + "\""
}
throw new AnalysisException(s"Found duplicate column(s) in $colType: " +
duplicateColumns.mkString(", "))
}
}
}
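A quick illustration of the helper's contract (a hypothetical snippet, grounded only in the two methods above; note that SchemaUtils is private[spark], so code like this would live inside Spark itself, e.g. in a test suite):

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.util.SchemaUtils

// Default (case-insensitive) mode: "a" and "A" collide after lowercasing.
try {
  SchemaUtils.checkColumnNameDuplication(Seq("a", "b", "A"), "test data")
} catch {
  case e: AnalysisException =>
    println(e.getMessage) // Found duplicate column(s) in test data: "a"
}

// Case-sensitive mode: "a" and "A" are distinct names, so no exception.
SchemaUtils.checkColumnNameDuplication(
  Seq("a", "b", "A"), "test data", caseSensitiveAnalysis = true)

// The StructType overload simply delegates on the field names:
val schema = StructType(
  StructField("c", IntegerType) :: StructField("c", IntegerType) :: Nil)
// would throw: Found duplicate column(s) in table definition: "c"
// SchemaUtils.checkSchemaColumnNameDuplication(schema, "table definition")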
5 changes: 5 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/create.sql
@@ -0,0 +1,5 @@
-- Check name duplication in a regular case
CREATE TABLE t (c STRING, c INT) USING parquet;

Contributor (@wzhfy, Jun 11, 2017): In addition to end-to-end tests, we can also add a suite for SchemaUtils and put the case sensitivity cases in it.
Member Author (maropu): Added SchemaUtilsSuite.

-- Check multiple name duplication
CREATE TABLE t (c0 STRING, c1 INT, c1 DOUBLE, c0 INT) USING parquet;
20 changes: 20 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/create.sql.out
@@ -0,0 +1,20 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 2


-- !query 0
CREATE TABLE t (c STRING, c INT) USING parquet
-- !query 0 schema
struct<>
-- !query 0 output
org.apache.spark.sql.AnalysisException
Found duplicate column(s) in table definition of `t`: "c";


-- !query 1
CREATE TABLE t (c0 STRING, c1 INT, c1 DOUBLE, c0 INT) USING parquet
-- !query 1 schema
struct<>
-- !query 1 output
org.apache.spark.sql.AnalysisException
Found duplicate column(s) in table definition of `t`: "c1", "c0";
@@ -439,13 +439,13 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int, a string) USING json")
}
assert(e.message == "Found duplicate column(s) in table definition of `tbl`: a")
assert(e.message == """Found duplicate column(s) in table definition of `tbl`: "a"""")

withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
val e2 = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int, A string) USING json")
}
assert(e2.message == "Found duplicate column(s) in table definition of `tbl`: a")
assert(e2.message == """Found duplicate column(s) in table definition of `tbl`: "a"""")
}
}

@@ -469,14 +469,14 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int) USING json PARTITIONED BY (a, a)")
}
assert(e.message == "Found duplicate column(s) in partition: a")
assert(e.message == """Found duplicate column(s) in partition: "a"""")
}

test("create table - column repeated in bucket columns") {
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int) USING json CLUSTERED BY (a, a) INTO 4 BUCKETS")
}
assert(e.message == "Found duplicate column(s) in bucket: a")
assert(e.message == """Found duplicate column(s) in bucket: "a"""")
}

test("Refresh table after changing the data source table partitioning") {
@@ -687,4 +687,46 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
testRead(spark.read.schema(userSchemaString).text(dir, dir), data ++ data, userSchema)
testRead(spark.read.schema(userSchemaString).text(Seq(dir, dir): _*), data ++ data, userSchema)
}

test("SPARK-20460 Check name duplication in schema") {
withTempDir { src =>
val columnDuplicateSchema = StructType(
StructField("a", IntegerType) ::
StructField("a", IntegerType) ::
Nil)

// Check CSV format
Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(src.toString)
val e1 = intercept[AnalysisException] {
spark.read.format("csv").schema(columnDuplicateSchema).option("header", false)
.load(src.toString)
}

Contributor: Is header option necessary?
assert(e1.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))

// If `inferSchema` is true, the CSV format is duplicate-safe (see SPARK-16896)
val df = spark.read.format("csv").option("inferSchema", true).option("header", true)
.load(src.toString)
checkAnswer(df, Row(1, 1))

// Check JSON format
Seq("""{"a":1, "a":1}"""""").toDF().coalesce(1).write.mode("overwrite").text(src.toString)

Contributor: nit: duplicate """

val e2 = intercept[AnalysisException] {
spark.read.format("json").schema(columnDuplicateSchema).option("header", false)
.load(src.toString)
}
assert(e2.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))

val e3 = intercept[AnalysisException] {
spark.read.format("json").option("inferSchema", true).load(src.toString)
}
assert(e3.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))

// Check Parquet format
Seq((1, 1)).toDF("a", "b").coalesce(1).write.mode("overwrite").parquet(src.toString)
val e4 = intercept[AnalysisException] {
spark.read.format("parquet").schema(columnDuplicateSchema).load(src.toString)
}
assert(e4.getMessage.contains("""Found duplicate column(s) in datasource: "a";"""))
}
}
}
@@ -31,6 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.internal.SQLConf.HiveCaseSensitiveInferenceMode._
import org.apache.spark.sql.types._
import org.apache.spark.sql.util.SchemaUtils

/**
* Legacy catalog for interacting with the Hive metastore.
@@ -248,6 +249,10 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
if (inferenceMode == INFER_AND_SAVE) {
updateCatalogSchema(relation.tableMeta.identifier, schema)
}

SchemaUtils.checkSchemaColumnNameDuplication(
  schema, "hive serde table", sparkSession.sessionState.conf.caseSensitiveAnalysis)

Member: Could you please double check whether Hive allows duplicate column names?
Member Author (maropu): ok
Member Author (maropu): I checked in hive-v2.0.1:

hive> CREATE TABLE t(a INT, b INT, a INT);
FAILED: SemanticException [Error 10036]: Duplicate column name: a
hive> CREATE TABLE t(a INT, b INT, A INT);
FAILED: SemanticException [Error 10036]: Duplicate column name: a

Also, the Hive doc says "Table names and column names are case insensitive but SerDe and property names are case sensitive."
https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable

Member: Do we have a test case to cover this check?
Member Author (maropu): I checked the code again and it seems no column duplication can happen here as long as the Hive catalog itself has no name duplication: mergeWithMetastoreSchema merges a catalog schema with an inferred one on the catalog names, so IIUC this does not lead to duplication. So, we could safely remove this check?

(schema, relation.tableMeta.copy(schema = schema))
case None =>
logWarning(s"Unable to infer schema for table $tableName from file format " +
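For a Spark-side analogue of the hive-shell experiment above (an illustrative sketch, assuming a Hive-enabled SparkSession; with this PR the duplicate is caught during table-creation preprocessing, consistent with the HiveDDLSuite expectation below):

import org.apache.spark.sql.{AnalysisException, SparkSession}

object HiveDuplicateColumnSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical local session; requires Hive support on the classpath.
    val spark = SparkSession.builder()
      .master("local[1]")
      .enableHiveSupport()
      .getOrCreate()

    try {
      // With case-insensitive analysis (the default), `a` and `A` collide,
      // mirroring Hive's own SemanticException shown in the thread above.
      spark.sql("CREATE TABLE t(a INT, b INT, A INT) STORED AS parquet")
    } catch {
      case e: AnalysisException => println(e.getMessage)
    }
  }
}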
@@ -345,7 +345,7 @@ class HiveDDLSuite
val e = intercept[AnalysisException] {
sql("CREATE TABLE tbl(a int) PARTITIONED BY (a string)")
}
assert(e.message == "Found duplicate column(s) in table definition of `default`.`tbl`: a")
assert(e.message == """Found duplicate column(s) in table definition of `default`.`tbl`: "a"""")
}

test("add/drop partition with location - managed table") {