-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-4131] Support "Writing data into the filesystem from queries" #18975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
6ca7771
a975536
a15bf4e
9f596fd
b9db02e
e516bec
e05624f
7f5664d
d50b3a2
61a18a2
4c19aaf
47fde8a
068662a
da7065b
7f4b488
051018e
73f605e
8261b39
163c124
0882dd1
c813ad8
bc5424c
675ffd7
f36e933
64f37f4
b2068ce
8ebe5e2
51f9a0a
e2db5e1
7bb5c85
2e7a29b
7ccbde4
52350e8
2ec9947
05d9d20
392593b
511cfc3
77948bb
fd9322c
e590847
d1b4ec8
dd6a418
c2c693c
e9c88b5
62370fd
b00acdc
b64520b
3aaf6e8
b461e00
e9c24de
28fcb39
0c03a2b
6c24b1b
c4ab411
0ec103e
160c0ec
95ebfd3
4a5ff29
c99872b
449249e
7919041
aeb5d5e
81382df
f93d57a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1556,7 +1556,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { | |
| * | ||
| * Expected format: | ||
| * {{{ | ||
| * INSERT OVERWRITE DIRECTORY | ||
| * INSERT OVERWRITE [LOCAL] DIRECTORY | ||
| * path | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added [LOCAL] |
||
| * [ROW FORMAT row_format] | ||
| * [STORED AS file_format] | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,13 +35,14 @@ import org.apache.spark.sql.execution.datasources._ | |
| case class InsertIntoDataSourceDirCommand( | ||
| storage: CatalogStorageFormat, | ||
| provider: Option[String], | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also add a parameter
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let us change it to
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
| query: LogicalPlan) extends RunnableCommand { | ||
| query: LogicalPlan, | ||
| overwrite: Boolean) extends RunnableCommand { | ||
|
|
||
| override def innerChildren: Seq[LogicalPlan] = Seq(query) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
|
|
||
| override def run(sparkSession: SparkSession): Seq[Row] = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
| assert(innerChildren.length == 1) | ||
| assert(!storage.locationUri.isEmpty) | ||
| assert(storage.locationUri.nonEmpty) | ||
| assert(provider.isDefined) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you please add the helper messages for the above three asserts?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
|
|
||
| // Create the relation based on the input logical plan: `data`. | ||
|
|
@@ -52,8 +53,9 @@ case class InsertIntoDataSourceDirCommand( | |
| options = storage.properties ++ pathOption, | ||
| catalogTable = None) | ||
|
|
||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can add an extra check here.
A simple example is:
sql(
  s"""
     |INSERT OVERWRITE DIRECTORY '${path}'
     |USING JDBC
     |OPTIONS (uRl '$url1', DbTaBlE 'TEST.PEOPLE1', User 'testUser', PassWord 'testPass')
     |SELECT 1, 2
   """.stripMargin)

Currently, the above query can pass. We should get an exception instead.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
| val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists | ||
| try { | ||
| dataSource.writeAndRead(SaveMode.Overwrite, query) | ||
| dataSource.writeAndRead(saveMode, query) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we do not need to return BaseRelation, we can call
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The implementation here confused me; I just want to leave a question here about why we should call both.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. We should get rid of
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gatorsmile Thanks for you reply, I'll try to fix this. |
||
| } catch { | ||
| case ex: AnalysisException => | ||
| logError(s"Failed to write to directory " + storage.locationUri.toString, ex) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -141,8 +141,8 @@ case class DataSourceAnalysis(conf: SQLConf) extends Rule[LogicalPlan] with Cast | |
| parts, query, overwrite, false) if parts.isEmpty => | ||
| InsertIntoDataSourceCommand(l, query, overwrite) | ||
|
|
||
| case InsertIntoDir(_, storage, provider, query) => | ||
| InsertIntoDataSourceDirCommand(storage, provider, query) | ||
| case InsertIntoDir(_, storage, provider, query, overwrite) if provider.nonEmpty => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
| InsertIntoDataSourceDirCommand(storage, provider, query, overwrite) | ||
|
|
||
| case i @ InsertIntoTable( | ||
| l @ LogicalRelation(t: HadoopFsRelation, _, table), parts, query, overwrite, _) => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -157,8 +157,8 @@ object HiveAnalysis extends Rule[LogicalPlan] { | |
| case CreateTable(tableDesc, mode, Some(query)) if DDLUtils.isHiveTable(tableDesc) => | ||
| CreateHiveTableAsSelectCommand(tableDesc, query, mode) | ||
|
|
||
| case InsertIntoDir(isLocal, storage, _, child) => | ||
| InsertIntoHiveDirCommand(isLocal, storage, child) | ||
| case InsertIntoDir(isLocal, storage, _, child, overwrite) => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
| InsertIntoHiveDirCommand(isLocal, storage, child, overwrite) | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,13 +38,14 @@ import org.apache.spark.util.Utils | |
| case class InsertIntoHiveDirCommand( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated |
||
| isLocal: Boolean, | ||
| storage: CatalogStorageFormat, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also add a parameter
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added |
||
| query: LogicalPlan) extends SaveAsHiveFile { | ||
| query: LogicalPlan, | ||
| overwrite: Boolean) extends SaveAsHiveFile { | ||
|
|
||
| override def children: Seq[LogicalPlan] = query :: Nil | ||
|
|
||
| override def run(sparkSession: SparkSession, children: Seq[SparkPlan]): Seq[Row] = { | ||
| assert(children.length == 1) | ||
| assert(!storage.locationUri.isEmpty) | ||
| assert(storage.locationUri.nonEmpty) | ||
|
|
||
| val Array(cols, types) = children.head.output.foldLeft(Array("", "")) { case (r, a) => | ||
| r(0) = r(0) + a.name + "," | ||
|
|
@@ -79,14 +80,22 @@ case class InsertIntoHiveDirCommand( | |
| val localFileSystem = FileSystem.getLocal(jobConf) | ||
| val localPath = localFileSystem.makeQualified(targetPath) | ||
| if (localFileSystem.exists(localPath)) { | ||
| localFileSystem.delete(localPath, true) | ||
| if (overwrite) { | ||
| localFileSystem.delete(localPath, true) | ||
| } else { | ||
| throw new RuntimeException("Directory '" + localPath.toString + "' already exists") | ||
| } | ||
| } | ||
| localPath | ||
| } else { | ||
| val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) | ||
| val dfs = qualifiedPath.getFileSystem(jobConf) | ||
| if (dfs.exists(qualifiedPath)) { | ||
| dfs.delete(qualifiedPath, true) | ||
| if (overwrite) { | ||
| dfs.delete(qualifiedPath, true) | ||
| } else { | ||
| throw new RuntimeException("Directory '" + qualifiedPath.toString + "' already exists") | ||
| } | ||
| } else { | ||
| dfs.mkdirs(qualifiedPath.getParent) | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,14 +31,15 @@ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} | |
| // Base trait from which all hive insert statement physical execution extends. | ||
| private[hive] trait SaveAsHiveFile extends DataWritingCommand { | ||
|
|
||
| protected def saveAsHiveFile(sparkSession: SparkSession, | ||
| plan: SparkPlan, | ||
| hadoopConf: Configuration, | ||
| fileSinkConf: FileSinkDesc, | ||
| outputLocation: String, | ||
| partitionAttributes: Seq[Attribute] = Nil, | ||
| bucketSpec: Option[BucketSpec] = None, | ||
| options: Map[String, String] = Map.empty): Unit = { | ||
| protected def saveAsHiveFile( | ||
| sparkSession: SparkSession, | ||
| plan: SparkPlan, | ||
| hadoopConf: Configuration, | ||
| fileSinkConf: FileSinkDesc, | ||
| outputLocation: String, | ||
| partitionAttributes: Seq[Attribute] = Nil, | ||
| bucketSpec: Option[BucketSpec] = None, | ||
| options: Map[String, String] = Map.empty): Unit = { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed |
||
|
|
||
| val sessionState = sparkSession.sessionState | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed |
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2046,74 +2046,81 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { | |
|
|
||
| val path = Utils.createTempDir() | ||
| path.delete() | ||
| checkAnswer( | ||
| sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path.toString}' SELECT * FROM src where key < 10"), | ||
| Seq.empty[Row]) | ||
| withTempDir { dir => | ||
| val path = dir.toURI.getPath | ||
|
|
||
| checkAnswer( | ||
| sql(s"""INSERT OVERWRITE LOCAL DIRECTORY '${path.toString}' | ||
| checkAnswer( | ||
| sql(s"INSERT OVERWRITE LOCAL DIRECTORY '${path}' SELECT * FROM src where key < 10"), | ||
| Seq.empty[Row]) | ||
|
|
||
| checkAnswer( | ||
| sql( | ||
| s""" | ||
| |INSERT OVERWRITE LOCAL DIRECTORY '${path}' | ||
| |STORED AS orc | ||
| |SELECT * FROM src where key < 10""".stripMargin), | ||
| Seq.empty[Row]) | ||
| |SELECT * FROM src where key < 10 | ||
| """.stripMargin), | ||
| Seq.empty[Row]) | ||
|
|
||
| // use orc data source to check the data of path is right. | ||
| sql( | ||
| s"""CREATE TEMPORARY TABLE orc_source | ||
| |USING org.apache.spark.sql.hive.orc | ||
| |OPTIONS ( | ||
| | PATH '${path.getCanonicalPath}' | ||
| |) | ||
| """.stripMargin) | ||
| checkAnswer( | ||
| sql("select * from orc_source"), | ||
| sql("select * from src where key < 10").collect() | ||
| ) | ||
| // use orc data source to check the data of path is right. | ||
| sql( | ||
| s""" | ||
| |CREATE TEMPORARY TABLE orc_source | ||
| |USING org.apache.spark.sql.hive.orc | ||
| |OPTIONS ( | ||
| | PATH '${dir.getCanonicalPath}' | ||
| |) | ||
| """.stripMargin) | ||
|
|
||
| Utils.deleteRecursively(path) | ||
| dropTempTable("orc_source") | ||
| checkAnswer( | ||
| sql("select * from orc_source"), | ||
| sql("select * from src where key < 10").collect()) | ||
|
|
||
| dropTempTable("orc_source") | ||
| } | ||
| } | ||
|
|
||
| test("insert overwrite to dir from temp table") { | ||
| import org.apache.spark.util.Utils | ||
| withTempView("test_insert_table") { | ||
| spark.range(10).selectExpr("id", "id AS str").createOrReplaceTempView("test_insert_table") | ||
|
|
||
| sparkContext | ||
| .parallelize(1 to 10) | ||
| .map(i => TestData(i, i.toString)) | ||
| .toDF() | ||
| .registerTempTable("test_insert_table") | ||
| withTempDir { dir => | ||
| val path = dir.toURI.getPath | ||
|
|
||
| val path = Utils.createTempDir() | ||
| path.delete() | ||
| checkAnswer( | ||
| sql( | ||
| s""" | ||
| |INSERT OVERWRITE LOCAL DIRECTORY '${path.toString}' | ||
| |ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' | ||
| |SELECT * FROM test_insert_table | ||
| """.stripMargin), | ||
| Seq.empty[Row]) | ||
| checkAnswer( | ||
| sql( | ||
| s""" | ||
| |INSERT OVERWRITE LOCAL DIRECTORY '${path}' | ||
| |ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' | ||
| |SELECT * FROM test_insert_table | ||
| """.stripMargin), | ||
| Seq.empty[Row]) | ||
|
|
||
| checkAnswer( | ||
| sql(s""" | ||
| INSERT OVERWRITE LOCAL DIRECTORY '${path.toString}' | ||
| |STORED AS orc | ||
| |SELECT * FROM test_insert_table""".stripMargin), | ||
| Seq.empty[Row]) | ||
| checkAnswer( | ||
| sql( | ||
| s""" | ||
| |INSERT OVERWRITE LOCAL DIRECTORY '${path}' | ||
| |STORED AS orc | ||
| |SELECT * FROM test_insert_table | ||
| """.stripMargin), | ||
| Seq.empty[Row]) | ||
|
|
||
| // use orc data source to check the data of path is right. | ||
| sql( | ||
| s"""CREATE TEMPORARY TABLE orc_source | ||
| |USING org.apache.spark.sql.hive.orc | ||
| |OPTIONS ( | ||
| | PATH '${path.getCanonicalPath}' | ||
| |) | ||
| """.stripMargin) | ||
| checkAnswer( | ||
| sql("select * from orc_source"), | ||
| sql("select * from test_insert_table").collect() | ||
| ) | ||
| Utils.deleteRecursively(path) | ||
| dropTempTable("test_insert_table") | ||
| dropTempTable("orc_source") | ||
| // use orc data source to check the data of path is right. | ||
| sql( | ||
| s""" | ||
| |CREATE TEMPORARY TABLE orc_source | ||
| |USING org.apache.spark.sql.hive.orc | ||
| |OPTIONS ( | ||
| | PATH '${dir.getCanonicalPath}' | ||
| |) | ||
| """.stripMargin) | ||
|
|
||
| checkAnswer( | ||
| sql("select * from orc_source"), | ||
| sql("select * from test_insert_table").collect()) | ||
|
|
||
| dropTempTable("orc_source") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also put it into
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated. |
||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: We can simply extend `UnaryNode` and remove `children`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated.