[SPARK-19107][SQL] support creating hive table with DataFrameWriter and Catalog #16487
```diff
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

@@ -33,6 +33,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.test.SQLTestUtils
+import org.apache.spark.sql.types.StructType

 class HiveDDLSuite
   extends QueryTest with SQLTestUtils with TestHiveSingleton with BeforeAndAfterEach {

@@ -1289,4 +1290,66 @@ class HiveDDLSuite
       }
     }
   }
+
+  test("create hive serde table with Catalog") {
+    withTable("t") {
+      withTempDir { dir =>
+        val df = spark.catalog.createExternalTable(
+          "t",
+          "hive",
+          new StructType().add("i", "int"),
+          Map("path" -> dir.getCanonicalPath, "fileFormat" -> "parquet"))
+        assert(df.collect().isEmpty)
+
+        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+        assert(DDLUtils.isHiveTable(table))
+        assert(table.storage.inputFormat ==
+          Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat"))
+        assert(table.storage.outputFormat ==
+          Some("org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat"))
+        assert(table.storage.serde ==
+          Some("org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe"))
+
+        sql("INSERT INTO t SELECT 1")
+        checkAnswer(spark.table("t"), Row(1))
+      }
+    }
+  }
+
+  test("create hive serde table with DataFrameWriter.saveAsTable") {
+    withTable("t", "t2") {
+      Seq(1 -> "a").toDF("i", "j")
+        .write.format("hive").option("fileFormat", "avro").saveAsTable("t")
+      checkAnswer(spark.table("t"), Row(1, "a"))
+
+      val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+      assert(DDLUtils.isHiveTable(table))
+      assert(table.storage.inputFormat ==
+        Some("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat"))
+      assert(table.storage.outputFormat ==
+        Some("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat"))
+      assert(table.storage.serde ==
+        Some("org.apache.hadoop.hive.serde2.avro.AvroSerDe"))
+
+      sql("INSERT INTO t SELECT 2, 'b'")
+      checkAnswer(spark.table("t"), Row(1, "a") :: Row(2, "b") :: Nil)
+
+      val e = intercept[AnalysisException] {
+        Seq(1 -> "a").toDF("i", "j").write.format("hive").partitionBy("i").saveAsTable("t2")
+      }
+      assert(e.message.contains("A Create Table As Select (CTAS) statement is not allowed " +
+        "to create a partitioned table using Hive"))
+
+      val e2 = intercept[AnalysisException] {
+        Seq(1 -> "a").toDF("i", "j").write.format("hive").bucketBy(4, "i").saveAsTable("t2")
+      }
+      assert(e2.message.contains("Creating bucketed Hive serde table is not supported yet"))
+
+      val e3 = intercept[AnalysisException] {
+        spark.table("t").write.format("hive").mode("overwrite").saveAsTable("t")
+      }
+      assert(e3.message.contains(
+        "CTAS for hive serde tables does not support append or overwrite semantics"))
+    }
+  }
 }
```
+1
We are also facing the same issue in the `insertInto(tableIdent: TableIdentifier)` API?
`insertInto` is different: it generates an `InsertIntoTable` plan instead of a `CreateTable` plan.
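To make the distinction concrete, here is a minimal sketch (not from the PR; it assumes a `spark` session with Hive support, e.g. in spark-shell, and the table names `tgt`/`tgt2` are hypothetical):

```scala
// Assumes `spark` is a SparkSession with Hive support in scope.
import spark.implicits._

val df = Seq(1 -> "a").toDF("i", "j")

// saveAsTable on a nonexistent table plans a CreateTable (CTAS) node,
// so the provider specified on the writer ("hive") decides the format.
df.write.format("hive").saveAsTable("tgt")

// insertInto plans an InsertIntoTable node over an existing table;
// the table's own format is used, and writer-level format() is ignored.
df.write.insertInto("tgt")
```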
We captured the exceptions when the format is `parquet`. Now, when the format is `hive`, should we do the same thing?
`DataFrameWriter.insertInto` will ignore the specified provider, won't it?
Although we ignore the specified provider, we still respect the actual format of the table. For example, below is a Hive table, and we are not blocking it. Should we block it to make them consistent?
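The snippet that originally accompanied this comment is not preserved in this copy; a sketch of the scenario being described, with hypothetical table names:

```scala
// Hypothetical reconstruction: the target is a Hive serde table, and the
// writer specifies a different (data source) provider.
spark.sql("CREATE TABLE hive_tbl (i INT) STORED AS PARQUET")

// The "json" provider set on the writer is ignored; the insert respects
// the table's actual Hive serde format and is not blocked.
Seq(1).toDF("i").write.format("json").insertInto("hive_tbl")
```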
We should not block it. This generates `InsertIntoTable`, and it supports Hive tables. What we should block is `saveAsTable` with `Overwrite` mode, which generates `CreateTable`. `insert overwrite` is different from `create table` with overwrite mode.
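In other words (a sketch reusing the hypothetical `hive_tbl` from above):

```scala
// INSERT OVERWRITE an existing table: plans InsertIntoTable, which
// supports Hive serde tables -- this stays allowed.
Seq(2).toDF("i").write.mode("overwrite").insertInto("hive_tbl")

// Overwriting via saveAsTable: plans CreateTable (CTAS), which is what
// this PR blocks for Hive serde tables.
Seq(2).toDF("i").write.format("hive").mode("overwrite").saveAsTable("hive_tbl")
// => AnalysisException: "CTAS for hive serde tables does not support
//    append or overwrite semantics"
```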