Add a check for duplicate column names in ALTER TABLE ADD COLUMNS

apache · xwu0226 · Nov 21, 2016 · Dec 1, 2016 · Dec 1, 2016 · Dec 5, 2016
commit 5bf7360834e257a2e0083a5f92f24da73416780d
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala
@@ -454,11 +454,11 @@ abstract class SessionCatalogSuite extends PlanTest {
   test("alter table add columns") {
     val externalCatalog = newBasicCatalog()
     val sessionCatalog = new SessionCatalog(externalCatalog)
-    sessionCatalog.createTable(newTable("alter_add", "default"), ignoreIfExists = false)
-    val oldTab = externalCatalog.getTable("default", "alter_add")
-    sessionCatalog.alterTableSchema(TableIdentifier("alter_add", Some("default")),
+    sessionCatalog.createTable(newTable("t1", "default"), ignoreIfExists = false)
+    val oldTab = externalCatalog.getTable("default", "t1")
+    sessionCatalog.alterTableSchema(TableIdentifier("t1", Some("default")),
       oldTab.schema.add("c3", IntegerType))
-    val newTab = externalCatalog.getTable("default", "alter_add")
+    val newTab = externalCatalog.getTable("default", "t1")
     assert(newTab.schema.equals(oldTab.schema.add("c3", IntegerType)))
   }
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -192,28 +192,37 @@ case class AlterTableAddColumnsCommand(
     val catalog = sparkSession.sessionState.catalog
     val catalogTable = verifyAlterTableAddColumn(catalog, table)
 
-    // If an exception is thrown here we can just assume the table is uncached;
-    // this can happen with Hive tables when the underlying catalog is in-memory.
-    val wasCached = Try(sparkSession.catalog.isCached(table.unquotedString)).getOrElse(false)
-    if (wasCached) {
-      try {
-        sparkSession.catalog.uncacheTable(table.unquotedString)
-      } catch {
-        case NonFatal(e) => log.warn(e.toString, e)
-      }
+    try {
+      sparkSession.catalog.uncacheTable(table.unquotedString)
+    } catch {
+      case NonFatal(e) =>
+        log.warn(s"Exception when attempting to uncache table ${table.unquotedString}", e)
     }
+
     // Invalidate the table last, otherwise uncaching the table would load the logical plan
     // back into the hive metastore cache
     catalog.refreshTable(table)
     val partitionFields = catalogTable.schema.takeRight(catalogTable.partitionColumnNames.length)
-    val dataSchema = catalogTable.schema
-      .take(catalogTable.schema.length - catalogTable.partitionColumnNames.length)
+    val newSchemaFields = catalogTable.schema
+      .take(catalogTable.schema.length - catalogTable.partitionColumnNames.length) ++
+      columns ++ partitionFields
+    checkDuplication(newSchemaFields.map(_.name))
     catalog.alterTableSchema(table, newSchema =
-      catalogTable.schema.copy(fields = (dataSchema ++ columns ++ partitionFields).toArray))
+      catalogTable.schema.copy(fields = newSchemaFields.toArray))
 
     Seq.empty[Row]
   }
 
+  /** Throws an AnalysisException naming every column name that occurs more than once. */
+  private def checkDuplication(colNames: Seq[String]): Unit = {
+    val repeated = colNames.groupBy(identity).collect {
+      case (name, hits) if hits.length > 1 => name
+    }
+    if (repeated.nonEmpty) {
+      throw new AnalysisException(s"Found duplicate column(s): ${repeated.mkString(", ")}")
+    }
+  }
+
   /**
    * ALTER TABLE ADD COLUMNS command does not support temporary view/table,
    * view, or datasource table with text, orc formats or external provider.

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -2180,22 +2180,22 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
 
   Seq("parquet", "json", "csv").foreach { provider =>
     test(s"alter datasource table add columns - $provider") {
-      withTable("alter_add_ds") {
-        sql(s"CREATE TABLE alter_add_ds (c1 int) USING $provider")
-        sql("INSERT INTO alter_add_ds VALUES (1)")
-        sql("ALTER TABLE alter_add_ds ADD COLUMNS (c2 int)")
+      withTable("t1") {
+        sql(s"CREATE TABLE t1 (c1 int) USING $provider")
+        sql("INSERT INTO t1 VALUES (1)")
+        sql("ALTER TABLE t1 ADD COLUMNS (c2 int)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds"),
+          sql("SELECT * FROM t1"),
           Seq(Row(1, null))
         )
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds WHERE c2 is null"),
+          sql("SELECT * FROM t1 WHERE c2 is null"),
           Seq(Row(1, null))
         )
 
-        sql("INSERT INTO alter_add_ds VALUES (3, 2)")
+        sql("INSERT INTO t1 VALUES (3, 2)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds WHERE c2 = 2"),
+          sql("SELECT * FROM t1 WHERE c2 = 2"),
           Seq(Row(3, 2))
         )
       }
@@ -2204,36 +2204,36 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
 
   Seq("parquet", "json", "csv").foreach { provider =>
     test(s"alter datasource table add columns - partitioned - $provider") {
-      withTable("alter_add_ds") {
-        sql(s"CREATE TABLE alter_add_ds (c1 int, c2 int) USING $provider PARTITIONED BY (c2)")
-        sql("INSERT INTO alter_add_ds PARTITION(c2 = 2) VALUES (1)")
-        sql("ALTER TABLE alter_add_ds ADD COLUMNS (c3 int)")
+      withTable("t1") {
+        sql(s"CREATE TABLE t1 (c1 int, c2 int) USING $provider PARTITIONED BY (c2)")
+        sql("INSERT INTO t1 PARTITION(c2 = 2) VALUES (1)")
+        sql("ALTER TABLE t1 ADD COLUMNS (c3 int)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds"),
+          sql("SELECT * FROM t1"),
           Seq(Row(1, null, 2))
         )
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds WHERE c3 is null"),
+          sql("SELECT * FROM t1 WHERE c3 is null"),
           Seq(Row(1, null, 2))
         )
-        sql("INSERT INTO alter_add_ds PARTITION(c2 =1) VALUES (2, 3)")
+        sql("INSERT INTO t1 PARTITION(c2 =1) VALUES (2, 3)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds WHERE c3 = 3"),
+          sql("SELECT * FROM t1 WHERE c3 = 3"),
           Seq(Row(2, 3, 1))
         )
         checkAnswer(
-          sql("SELECT * FROM alter_add_ds WHERE c2 = 1"),
+          sql("SELECT * FROM t1 WHERE c2 = 1"),
           Seq(Row(2, 3, 1))
         )
       }
     }
   }
 
   test("alter datasource table add columns - text format not supported") {
-    withTable("alter_add_ds_text") {
-      sql(s"CREATE TABLE alter_add_ds_text (c1 int) USING text")
+    withTable("t1") {
+      sql(s"CREATE TABLE t1 (c1 int) USING text")
       val e = intercept[AnalysisException] {
-        sql("ALTER TABLE alter_add_ds_text ADD COLUMNS (c2 int)")
+        sql("ALTER TABLE t1 ADD COLUMNS (c2 int)")
       }.getMessage
       assert(e.contains("does not support ALTER ADD COLUMNS"))
     }
@@ -2258,4 +2258,14 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
       assert(e.message.contains("is a VIEW, which does not support ALTER ADD COLUMNS"))
     }
   }
+
+  test("alter table add columns with existing column name") {
+    withTable("t1") {
+      sql("CREATE TABLE t1 (c1 int) USING PARQUET")
+      val error = intercept[AnalysisException] {
+        sql("ALTER TABLE t1 ADD COLUMNS (c1 string)")
+      }
+      assert(error.getMessage.contains("Found duplicate column(s)"))
+    }
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala
@@ -71,6 +71,7 @@ class JDBCSuite extends SparkFunSuite
     conn.prepareStatement("insert into test.people values ('mary', 2)").executeUpdate()
     conn.prepareStatement(
       "insert into test.people values ('joe ''foo'' \"bar\"', 3)").executeUpdate()
+    conn.commit()
 
     sql(
       s"""

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -112,6 +112,7 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA
 class HiveDDLSuite
   extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {
   import testImplicits._
+  val hiveFormats = Seq("PARQUET", "ORC", "TEXTFILE", "SEQUENCEFILE", "RCFILE", "AVRO")
 
   override def afterEach(): Unit = {
     try {
@@ -1861,55 +1862,55 @@ class HiveDDLSuite
     }
   }
 
-  Seq("PARQUET", "ORC", "TEXTFILE", "SEQUENCEFILE", "RCFILE", "AVRO").foreach { tableType =>
+  hiveFormats.foreach { tableType =>
     test(s"alter hive serde table add columns -- partitioned - $tableType") {
-      withTable("alter_add_partitioned") {
+      withTable("tab") {
         sql(
           s"""
-             |CREATE TABLE alter_add_partitioned (c1 int, c2 int)
+             |CREATE TABLE tab (c1 int, c2 int)
              |PARTITIONED BY (c3 int) STORED AS $tableType
           """.stripMargin)
 
-        sql("INSERT INTO alter_add_partitioned PARTITION (c3=1) VALUES (1, 2)")
-        sql("ALTER TABLE alter_add_partitioned ADD COLUMNS (c4 int)")
+        sql("INSERT INTO tab PARTITION (c3=1) VALUES (1, 2)")
+        sql("ALTER TABLE tab ADD COLUMNS (c4 int)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_partitioned WHERE c3 = 1"),
+          sql("SELECT * FROM tab WHERE c3 = 1"),
           Seq(Row(1, 2, null, 1))
         )
-        assert(sql("SELECT * FROM alter_add_partitioned").schema
+        assert(sql("SELECT * FROM tab").schema
           .contains(StructField("c4", IntegerType)))
-        sql("INSERT INTO alter_add_partitioned PARTITION (c3=2) VALUES (2, 3, 4)")
+        sql("INSERT INTO tab PARTITION (c3=2) VALUES (2, 3, 4)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_partitioned"),
+          sql("SELECT * FROM tab"),
           Seq(Row(1, 2, null, 1), Row(2, 3, 4, 2))
         )
         checkAnswer(
-          sql("SELECT * FROM alter_add_partitioned WHERE c3 = 2 AND c4 IS NOT NULL"),
+          sql("SELECT * FROM tab WHERE c3 = 2 AND c4 IS NOT NULL"),
           Seq(Row(2, 3, 4, 2))
         )
       }
     }
   }
 
-  Seq("PARQUET", "ORC", "TEXTFILE", "SEQUENCEFILE", "RCFILE", "AVRO").foreach { tableType =>
+  hiveFormats.foreach { tableType =>
     test(s"alter hive serde table add columns -- with predicate - $tableType ") {
-      withTable("alter_add_predicate") {
-        sql(s"CREATE TABLE alter_add_predicate (c1 int, c2 int) STORED AS $tableType")
-        sql("INSERT INTO alter_add_predicate VALUES (1, 2)")
-        sql("ALTER TABLE alter_add_predicate ADD COLUMNS (c4 int)")
+      withTable("tab") {
+        sql(s"CREATE TABLE tab (c1 int, c2 int) STORED AS $tableType")
+        sql("INSERT INTO tab VALUES (1, 2)")
+        sql("ALTER TABLE tab ADD COLUMNS (c4 int)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_predicate WHERE c4 IS NULL"),
+          sql("SELECT * FROM tab WHERE c4 IS NULL"),
           Seq(Row(1, 2, null))
         )
-        assert(sql("SELECT * FROM alter_add_predicate").schema
+        assert(sql("SELECT * FROM tab").schema
           .contains(StructField("c4", IntegerType)))
-        sql("INSERT INTO alter_add_predicate VALUES (2, 3, 4)")
+        sql("INSERT INTO tab VALUES (2, 3, 4)")
         checkAnswer(
-          sql("SELECT * FROM alter_add_predicate WHERE c4 = 4 "),
+          sql("SELECT * FROM tab WHERE c4 = 4 "),
           Seq(Row(2, 3, 4))
         )
         checkAnswer(
-          sql("SELECT * FROM alter_add_predicate"),
+          sql("SELECT * FROM tab"),
           Seq(Row(1, 2, null), Row(2, 3, 4))
         )
       }
@@ -1919,13 +1920,23 @@ class HiveDDLSuite
   Seq("orc", "ORC", "org.apache.spark.sql.hive.orc",
     "org.apache.spark.sql.hive.orc.DefaultSource").foreach { source =>
     test(s"alter datasource table add columns - $source format not supported") {
-      withTable("alter_add_ds_text") {
-        sql(s"CREATE TABLE alter_add_ds_text (c1 int) USING $source")
+      withTable("tab") {
+        sql(s"CREATE TABLE tab (c1 int) USING $source")
         val e = intercept[AnalysisException] {
-          sql("ALTER TABLE alter_add_ds_text ADD COLUMNS (c2 int)")
+          sql("ALTER TABLE tab ADD COLUMNS (c2 int)")
         }.getMessage
         assert(e.contains("does not support ALTER ADD COLUMNS"))
       }
     }
   }
+
+  test("alter table add columns with existing partition column name") {
+    withTable("tab") {
+      sql("CREATE TABLE tab (c1 int) PARTITIONED BY (c2 int) STORED AS PARQUET")
+      val failure = intercept[AnalysisException] {
+        sql("ALTER TABLE tab ADD COLUMNS (c2 string)")
+      }
+      assert(failure.getMessage.contains("Found duplicate column(s)"))
+    }
+  }
 }