@@ -2356,18 +2356,9 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
         }.getMessage
         assert(e.contains("Found duplicate column(s)"))
       } else {
-        if (isUsingHiveMetastore) {
-          // hive catalog will still complains that c1 is duplicate column name because hive
-          // identifiers are case insensitive.
-          val e = intercept[AnalysisException] {
-            sql("ALTER TABLE t1 ADD COLUMNS (C1 string)")
-          }.getMessage
-          assert(e.contains("HiveException"))
-        } else {
-          sql("ALTER TABLE t1 ADD COLUMNS (C1 string)")
-          assert(spark.table("t1").schema
-            .equals(new StructType().add("c1", IntegerType).add("C1", StringType)))
-        }
+        sql("ALTER TABLE t1 ADD COLUMNS (C1 string)")
+        assert(spark.table("t1").schema
+          .equals(new StructType().add("c1", IntegerType).add("C1", StringType)))
       }
     }
   }
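
For context, here is a minimal stand-alone sketch of the scenario the rewritten branch above exercises. It assumes a Hive-backed SparkSession named spark with case-sensitive analysis enabled; the table and column names are illustrative, not taken from the PR.

  // With the fix, the case-preserving schema is kept in table properties, so a column that
  // differs from an existing one only by case no longer trips Hive's case-insensitive
  // duplicate-column check.
  spark.conf.set("spark.sql.caseSensitive", "true")
  spark.sql("CREATE TABLE t1 (c1 INT) USING parquet")
  spark.sql("ALTER TABLE t1 ADD COLUMNS (C1 STRING)")
  spark.table("t1").printSchema()  // expected: c1 (int) and C1 (string)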
@@ -616,15 +616,24 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     // Add table metadata such as table schema, partition columns, etc. to table properties.
     val updatedTable = withNewSchema.copy(
       properties = withNewSchema.properties ++ tableMetaToTableProps(withNewSchema))

+    // If it's a data source table, make sure the original schema is left unchanged; the
+    // actual schema is recorded as a table property.
+    val tableToStore = if (DDLUtils.isDatasourceTable(updatedTable)) {
+      updatedTable.copy(schema = rawTable.schema)
Member commented:

We do support ALTER TABLE ADD COLUMN, which relies on alterTableSchema. Data source tables can be read by Hive when possible. Thus, I think we should not leave the schema unchanged.
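
A hedged sketch of the direction this comment points at (this is not the PR's change): pin the old metastore-level schema only for data source tables stored in the Spark-specific layout, so Hive-compatible ones still expose columns added through alterTableSchema to Hive readers. storedHiveCompatibly is a hypothetical predicate; one possible definition is sketched further down in this thread.

  // Hypothetical variant of the tableToStore block in the diff above; storedHiveCompatibly
  // is an assumed helper, not existing Spark code.
  val tableToStore = if (DDLUtils.isDatasourceTable(updatedTable) &&
      !storedHiveCompatibly(rawTable)) {
    // Spark-specific layout: the real schema lives in table properties, keep the raw one here.
    updatedTable.copy(schema = rawTable.schema)
  } else {
    // Hive serde tables and Hive-compatible data source tables: let Hive see the new columns.
    updatedTable
  }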

Member commented:

I just checked the JIRA description. This sounds like a bug we need to resolve, and it will be a little complex to fix: we need to follow what we did for create table. cc @xwu0226 Please help @vanzin address this issue.

Contributor Author commented:

Hmm, I see that this will break DS tables created with newHiveCompatibleMetastoreTable instead of newSparkSQLSpecificMetastoreTable.

For the former, the only thing I can see that could be used to identify the case is the presence of serde properties in the table metadata. That could replace the DDLUtils.isDatasourceTable(updatedTable) check to see whether the schema needs to be updated.

For the latter case, I see that newSparkSQLSpecificMetastoreTable stores the partition schema as the table's schema (which sort of explains the weird exception handling I saw). So this code is only correct if the partition schema cannot change. Where is the partition schema for a DS table defined? Is that under control of the user (or the data source implementation)? Because if it can change you can run into pretty much the same issue.
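
For illustration, one possible reading of the serde-based check mentioned above (a sketch only; the helper name is made up, and whether the presence of a serde reliably identifies the Hive-compatible layout is exactly the open question in this thread):

  // Hypothetical helper, not existing Spark code: treat a data source table as stored in a
  // Hive-compatible layout when a serde was recorded for it, which is what
  // newHiveCompatibleMetastoreTable does when it creates such tables.
  def storedHiveCompatibly(table: CatalogTable): Boolean =
    DDLUtils.isDatasourceTable(table) && table.storage.serde.isDefined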

+    } else {
+      updatedTable
+    }
+
     try {
-      client.alterTable(updatedTable)
+      client.alterTable(tableToStore)
     } catch {
       case NonFatal(e) =>
         val warningMessage =
           s"Could not alter schema of table ${rawTable.identifier.quotedString} in a Hive " +
             "compatible way. Updating Hive metastore in Spark SQL specific format."
         logWarning(warningMessage, e)
-        client.alterTable(updatedTable.copy(schema = updatedTable.partitionSchema))
+        client.alterTable(updatedTable.copy(schema = tableToStore.partitionSchema))
Contributor Author commented:

This is the exception handling code I mentioned in the bug report which seems very suspicious. I had half a desire to just remove it, but maybe someone can explain to me why this code makes sense.

Member commented:

I think this part is directly related to the logic which converts the table metadata to Spark SQL specific format:

  def newSparkSQLSpecificMetastoreTable(): CatalogTable = {
    table.copy(
      // Hive only allows directory paths as location URIs while Spark SQL data source tables
      // also allow file paths. For non-hive-compatible format, we should not set location URI
      // to avoid hive metastore to throw exception.
      storage = table.storage.copy(
        locationUri = None,
        properties = storagePropsWithLocation),
      schema = table.partitionSchema,
      bucketSpec = None,
      properties = table.properties ++ tableProperties)
  }

     }
   }

@@ -49,6 +49,7 @@ import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.execution.QueryExecutionException
 import org.apache.spark.sql.execution.command.DDLUtils
+import org.apache.spark.sql.hive.HiveExternalCatalog
 import org.apache.spark.sql.hive.client.HiveClientImpl._
 import org.apache.spark.sql.types._
 import org.apache.spark.util.{CircularBuffer, Utils}
@@ -413,7 +414,10 @@ private[hive] class HiveClientImpl(
         unsupportedFeatures += "partitioned view"
       }

-      val properties = Option(h.getParameters).map(_.asScala.toMap).orNull
+      val properties = Option(h.getParameters).map(_.asScala.toMap).getOrElse(Map())
+
+      val provider = properties.get(HiveExternalCatalog.DATASOURCE_PROVIDER)
+        .orElse(Some(DDLUtils.HIVE_PROVIDER))
Member commented:

Previously we didn't store the provider for Hive serde tables. Some existing logic that decides whether a table retrieved from the metastore is a data source table may be broken by this change.

Member @viirya commented on Aug 3, 2017:

Oh, never mind. Looks like we access the key DATASOURCE_PROVIDER in table.properties for that purpose, so this should be safe. Anyway, we actually set provider on the CatalogTable later, when restoring the table read from the metastore, so maybe this change is redundant.

Another concern is that we previously didn't restore provider for a view (please refer to the linked code). With this change, we will set provider to HIVE_PROVIDER for views too.

Contributor Author @vanzin commented on Aug 3, 2017:

> Maybe this is redundant.

This was definitely not redundant in my testing. The metadata loaded from the metastore in HiveExternalCatalog.alterTableSchema did not have the provider set when I debugged this. In fact, the test I wrote fails if I remove this code (or comment out the line that sets "provider" a few lines below).

Perhaps some other part of the code sets it in a different code path, but this would make that part of the code redundant, not the other way around.

Contributor Author commented:

The restoring you mention is done in HiveExternalCatalog.restoreTableMetadata. Let me see if I can use that instead of making this change.
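
To make the two detection paths discussed in this thread concrete, here is a small hedged sketch. The helper is illustrative only; "spark.sql.sources.provider" is the value of the HiveExternalCatalog.DATASOURCE_PROVIDER constant used in the diff above.

  import org.apache.spark.sql.catalyst.catalog.CatalogTable
  import org.apache.spark.sql.execution.command.DDLUtils

  // Illustrative helper, not Spark code: a table freshly read from the Hive metastore can be
  // recognized as a data source table either through the spark.sql.sources.provider entry in
  // table.properties (what the pre-existing logic keys off), or through CatalogTable.provider
  // once it has been filled in, whether by this change or by restoreTableMetadata.
  def looksLikeDatasourceTable(raw: CatalogTable): Boolean =
    raw.properties.contains("spark.sql.sources.provider") ||
      raw.provider.exists(_ != DDLUtils.HIVE_PROVIDER)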


       // Hive-generated Statistics are also recorded in ignoredProperties
       val ignoredProperties = scala.collection.mutable.Map.empty[String, String]
@@ -468,6 +472,7 @@ private[hive] class HiveClientImpl(
             throw new AnalysisException("Hive index table is not supported.")
         },
         schema = schema,
+        provider = provider,
         partitionColumnNames = partCols.map(_.name),
         // If the table is written by Spark, we will put bucketing information in table properties,
         // and will always overwrite the bucket spec in hive metastore by the bucketing information
@@ -23,19 +23,20 @@ import java.net.URI
 import scala.language.existentials

 import org.apache.hadoop.fs.Path
-import org.scalatest.BeforeAndAfterEach
+import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach}

-import org.apache.spark.SparkException
-import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode}
+import org.apache.spark.{SparkException, SparkFunSuite}
+import org.apache.spark.launcher.SparkLauncher
+import org.apache.spark.sql.{AnalysisException, QueryTest, Row, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.analysis.{NoSuchPartitionException, TableAlreadyExistsException}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
-import org.apache.spark.sql.hive.HiveExternalCatalog
+import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.orc.OrcFileOperator
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
-import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
+import org.apache.spark.sql.internal.StaticSQLConf._
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -1998,3 +1999,46 @@ class HiveDDLSuite
     }
   }
 }
+
+/**
+ * A separate set of DDL tests that uses Hive 2.1 libraries, which behave a little differently
+ * from the built-in ones.
+ */
+class HiveDDLSuite_2_1 extends SparkFunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
+
+  private val spark = {
+    val warehouse = Utils.createTempDir()
+    val metastore = Utils.createTempDir()
+    metastore.delete()
+    SparkSession.builder()
+      .config(SparkLauncher.SPARK_MASTER, "local")
+      .config(WAREHOUSE_PATH.key, warehouse.toURI().toString())
+      .config(CATALOG_IMPLEMENTATION.key, "hive")
+      .config(HiveUtils.HIVE_METASTORE_VERSION.key, "2.1")
+      .config(HiveUtils.HIVE_METASTORE_JARS.key, "maven")
+      .config("spark.hadoop.javax.jdo.option.ConnectionURL",
+        s"jdbc:derby:;databaseName=${metastore.getAbsolutePath()};create=true")
+      // These options are needed since the defaults in Hive 2.1 cause exceptions with an
+      // empty metastore db.
+      .config("spark.hadoop.datanucleus.schema.autoCreateAll", "true")
+      .config("spark.hadoop.hive.metastore.schema.verification", "false")
+      .getOrCreate()
+  }
+
+  override def afterEach: Unit = {
+    spark.sessionState.catalog.reset()
+  }
+
+  override def afterAll(): Unit = {
+    spark.close()
+  }
+
+  test("SPARK-21617: ALTER TABLE..ADD COLUMNS for DataSource tables") {
+    spark.sql("CREATE TABLE t1 (c1 int) USING json")
+    spark.sql("ALTER TABLE t1 ADD COLUMNS (c2 int)")
+
+    val df = spark.table("t1")
+    assert(df.schema.fieldNames === Array("c1", "c2"))
+  }
+
+}