[SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' and 'spark.sql.orc.compression.codec' configuration doesn't take effect on hive table writing #19218
Changes from 1 commit
…rk.sql.orc.compression.codec' configuration doesn't take effect on hive table writing. Fix the test case issue.
@@ -21,12 +21,16 @@ import java.io.File
 
 import org.scalatest.BeforeAndAfter
 
+import scala.collection.JavaConverters._
+import org.apache.hadoop.fs.Path
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
+import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
+import org.apache.spark.sql.hive.orc.OrcFileOperator
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -35,7 +39,7 @@ case class TestData(key: Int, value: String)
 case class ThreeCloumntable(key: Int, value: String, key1: String)
 
 class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
-    with SQLTestUtils {
+    with ParquetTest {
   import spark.implicits._
 
   override lazy val testData = spark.sparkContext.parallelize(
@@ -730,118 +734,185 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
   }
 
   test("[SPARK-21786] Check 'spark.sql.parquet.compression.codec' " +
-    "and 'spark.sql.parquet.compression.codec' taking effect on hive table writing") {
-    case class CompressionConf(name: String, codeC: String)
+    "and 'spark.sql.orc.compression.codec' taking effect on hive table writing") {
+    val hadoopConf = spark.sessionState.newHadoopConf()
+
+    val partitionStr = "p=10000"
+
+    case class TableCompressionConf(name: String, codeC: String)
 
     case class TableDefine(tableName: String, isPartitioned: Boolean, format: String,
-        compressionConf: Option[CompressionConf]) {
+        compressionConf: Option[TableCompressionConf]) {
       def createTable(rootDir: File): Unit = {
         val compression = compressionConf.map(cf => s"'${cf.name}'='${cf.codeC}'")
         sql(
           s"""
             |CREATE TABLE $tableName(a int)
-            |${ if (isPartitioned) "PARTITIONED BY (p int)" else "" }
+            |${if (isPartitioned) "PARTITIONED BY (p int)" else ""}
            |STORED AS $format
            |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
-            |${ if (compressionConf.nonEmpty) s"TBLPROPERTIES(${compression.get})" else "" }
+            |${if (compressionConf.nonEmpty) s"TBLPROPERTIES(${compression.get})" else ""}
           """.stripMargin)
       }
 
       def insertOverwriteTable(): Unit = {
         sql(
           s"""
            |INSERT OVERWRITE TABLE $tableName
-            |${ if (isPartitioned) "partition (p=10000)" else "" }
+            |${if (isPartitioned) s"partition ($partitionStr)" else ""}
            |SELECT * from table_source
           """.stripMargin)
       }
     }
 
+    def getTableCompressionCodec(path: String, format: String): String = {
+      val codecs = format match {
+        case "parquet" => for {
+          footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf)
+          block <- footer.getParquetMetadata.getBlocks.asScala
+          column <- block.getColumns.asScala
+        } yield column.getCodec.name()
+        case "orc" => new File(path).listFiles()
+          .filter(file => file.isFile && !file.getName.endsWith(".crc")).map {
+            orcFile =>
+              OrcFileOperator.getFileReader(orcFile.toPath.toString).get.getCompression.toString
+          }.toSeq
+      }
-      def getDirFiles(file: File): List[File] = {
-        if (!file.exists()) Nil
-        else if (file.isFile) List(file)
-        else {
-          file.listFiles().filterNot(_.getName.startsWith(".hive-staging"))
-            .groupBy(_.isFile).flatMap {
-              case (isFile, files) if isFile => files.toList
-              case (_, dirs) => dirs.flatMap(getDirFiles)
-            }.toList
+      assert(codecs.distinct.length == 1)
+      codecs.head
     }
 
+    def checkCompressionCodecForTable(format:String, isPartitioned: Boolean,
+        compressionConf: Option[TableCompressionConf])(assertion: String => Boolean): Unit = {
+      val table = TableDefine(s"tbl_$format${isPartitioned}",
+        isPartitioned, format, compressionConf)
+      withTempDir { tmpDir =>
+        withTable(table.tableName) {
+          table.createTable(tmpDir)
+          table.insertOverwriteTable()
+          val partition = if (table.isPartitioned) partitionStr else ""
+          val path = s"${tmpDir.getPath.stripSuffix("/")}/${table.tableName}/$partition"
+          assertion(getTableCompressionCodec(path, table.format))
+        }
+      }
+    }
 
+    def getConvertMetastoreConfName(format: String): String = format match {
+      case "parquet" => "spark.sql.hive.convertMetastoreParquet"
+      case "orc" => "spark.sql.hive.convertMetastoreOrc"
+    }
 
-      def getTableSize: Long = {
-        var totalSize = 0L
-        withTempDir { tmpDir =>
-          withTable(tableName) {
-            createTable(tmpDir)
-            insertOverwriteTable()
-            val path = s"${tmpDir.getPath.stripSuffix("/")}/$tableName"
-            val dir = new File(path)
-            val files = getDirFiles(dir).filter(_.getName.startsWith("part-"))
-            totalSize = files.map(_.length()).sum
+    def getSparkCompressionConfName(format: String): String = format match {
+      case "parquet" => "spark.sql.parquet.compression.codec"
+      case "orc" => "spark.sql.orc.compression.codec"
     }
 
+    def checkTableCompressionCodecForCodecs(format: String, isPartitioned: Boolean,
+        convertMetastore: Boolean, compressionCodecs: List[String],
+        tableCompressionConf: List[TableCompressionConf])
+        (assertion: (Option[TableCompressionConf], String, String) => Boolean): Unit = {
+      withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
+        tableCompressionConf.foreach { tableCompression =>
+          compressionCodecs.foreach { sessionCompressionCodec =>
+            withSQLConf(getSparkCompressionConfName(format) -> sessionCompressionCodec) {
+              val compression = if (tableCompression == null) None else Some(tableCompression)
+              checkCompressionCodecForTable(format, isPartitioned, compression) {
+                case realCompressionCodec => assertion(compression,
+                  sessionCompressionCodec, realCompressionCodec)
+              }
+            }
+          }
+        }
-        totalSize
       }
     }
 
-    def checkParquetCompressionCodec(isPartitioned: Boolean, tableCodec: String,
-        sessionCodec: String, f: (Long, Long) => Boolean = _ == _): Unit = {
-      val tableOrg = TableDefine(s"tbl_parquet$tableCodec", isPartitioned, "parquet",
-        Some(CompressionConf("parquet.compression", tableCodec)))
-      val tableOrgSize = tableOrg.getTableSize
+    def checkTableCompressionCodec(format: String, compressionCodecs: List[String],
+        tableCompressionConf: List[TableCompressionConf]): Unit = {
+      // For tables with table-level compression property, when
+      // 'spark.sql.hive.convertMetastoreParquet' was set to 'false', partitioned parquet tables
+      // and non-partitioned parquet tables will always take the table-level compression
+      // configuration first and ignore session compression configuration.
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = true,
+        convertMetastore = false, compressionCodecs, tableCompressionConf) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // table-level take effect
+          tableCompressionCodec.get.codeC == realCompressionCodec
+      }
 
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = false,
+        convertMetastore = false, compressionCodecs, tableCompressionConf) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // table-level take effect
+          tableCompressionCodec.get.codeC == realCompressionCodec
+      }
 
-      withSQLConf("spark.sql.parquet.compression.codec" -> sessionCodec) {
-        // priority check, when table-level compression conf was set, expecting
-        // table-level compression conf is not affected by the session conf, and table-level
-        // compression conf takes precedence even the two conf of codec is different
-        val tableOrgSessionConfSize = tableOrg.getTableSize
-        assert(tableOrgSize == tableOrgSessionConfSize)
+      // For tables with table-level compression property, when
+      // 'spark.sql.hive.convertMetastoreParquet' was set to 'true', partitioned parquet tables
+      // will always take the table-level compression configuration first, but non-partitioned tables
+      // will take the session-level compression configuration.
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = true,
+        convertMetastore = true, compressionCodecs, tableCompressionConf) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // table-level take effect
+          tableCompressionCodec.get.codeC == realCompressionCodec
+      }
 
-        // check session conf of compression codec taking effect
-        val table = TableDefine(s"tbl_parquet", isPartitioned, "parquet", None)
-        assert(f(tableOrg.getTableSize, table.getTableSize))
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = false,
+        convertMetastore = true, compressionCodecs, tableCompressionConf) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // session-level take effect
+          sessionCompressionCodec == realCompressionCodec
+      }
-      }
 
-    def checkOrcCompressionCodec(isPartitioned: Boolean, tableCodec: String,
-        sessionCodec: String, f: (Long, Long) => Boolean = _ == _): Unit = {
-      val tableOrg = TableDefine(s"tbl_orc$tableCodec", isPartitioned, "orc",
-        Some(CompressionConf("orc.compress", tableCodec)))
-      val tableOrgSize = tableOrg.getTableSize
+      // For tables without table-level compression property, session-level compression configuration
+      // will take effect.
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = true,
+        convertMetastore = true, compressionCodecs, List(null)) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // session-level take effect
+          sessionCompressionCodec == realCompressionCodec
+      }
 
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = false,
+        convertMetastore = true, compressionCodecs, List(null)) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // session-level take effect
+          sessionCompressionCodec == realCompressionCodec
+      }
 
-      withSQLConf("spark.sql.orc.compression.codec" -> sessionCodec) {
-        // priority check, when table-level compression conf was set, expecting
-        // table-level compression conf is not affected by the session conf, and table-level
-        // compression conf takes precedence even the two conf of codec is different
-        val tableOrgSessionConfSize = tableOrg.getTableSize
-        assert(tableOrgSize == tableOrgSessionConfSize)
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = true,
+        convertMetastore = false, compressionCodecs, List(null)) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // session-level take effect
+          sessionCompressionCodec == realCompressionCodec
+      }
 
-        // check session conf of compression codec taking effect
-        val table = TableDefine(s"tbl_orc", isPartitioned, "orc", None)
-        assert(f(tableOrg.getTableSize, table.getTableSize))
+      checkTableCompressionCodecForCodecs(format = format, isPartitioned = false,
+        convertMetastore = false, compressionCodecs, List(null)) {
+        case (tableCompressionCodec, sessionCompressionCodec, realCompressionCodec) =>
+          // session-level take effect
+          sessionCompressionCodec == realCompressionCodec
+      }
     }
 
     withTempView("table_source") {
       (0 until 100000).toDF("a").createOrReplaceTempView("table_source")
 
-      checkParquetCompressionCodec(true, "UNCOMPRESSED", "UNCOMPRESSED")
-      checkParquetCompressionCodec(true, "GZIP", "GZIP")
-      checkParquetCompressionCodec(true, "GZIP", "UNCOMPRESSED", _ < _)
 
-      withSQLConf("spark.sql.hive.convertMetastoreParquet" -> "false") {
-        checkParquetCompressionCodec(false, "UNCOMPRESSED", "UNCOMPRESSED")
-        checkParquetCompressionCodec(false, "GZIP", "GZIP")
-        checkParquetCompressionCodec(false, "GZIP", "UNCOMPRESSED", _ < _)
+      val parquetCompressionCodec = List("UNCOMPRESSED", "SNAPPY", "GZIP")
+      val tableCompressionConf = parquetCompressionCodec.map { tableCodec =>
+        TableCompressionConf("parquet.compression", tableCodec)
+      }
+      checkTableCompressionCodec("parquet", parquetCompressionCodec, tableCompressionConf)
     }
 
-      checkOrcCompressionCodec(true, "NONE", "NONE")
-      checkOrcCompressionCodec(true, "ZLIB", "ZLIB")
-      checkOrcCompressionCodec(true, "ZLIB", "NONE", _ < _)
 
-      checkOrcCompressionCodec(false, "NONE", "NONE")
-      checkOrcCompressionCodec(false, "ZLIB", "ZLIB")
-      checkOrcCompressionCodec(false, "ZLIB", "NONE", _ < _)
+    withTempView("table_source") {
+      (0 until 100000).toDF("a").createOrReplaceTempView("table_source")
+      val orcCompressionCodec = List("NONE", "SNAPPY", "ZLIB")
+      val tableCompressionConf = orcCompressionCodec.map { tableCodec =>
+        TableCompressionConf("parquet.compression", tableCodec)
+      }
+      checkTableCompressionCodec("orc", orcCompressionCodec, tableCompressionConf)
+    }
   }
 }
This is the insert suite. We are unable to do this here.
Could you create a separate suite in the current package
org.apache.spark.sql.hive? The suite name can be CompressionCodecSuite.
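For context, a rough sketch of what such a stand-alone suite might look like; the mix-ins are guessed from the imports in this diff, and the body would simply be the helpers above moved out of InsertSuite:

package org.apache.spark.sql.hive

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

// Skeleton only: the real body would be TableDefine, checkTableCompressionCodec and
// the related helpers from the diff above, moved out of InsertSuite.
class CompressionCodecSuite extends QueryTest with TestHiveSingleton with ParquetTest {
  import spark.implicits._

  test("table-level and session-level compression codecs on hive table writing") {
    // moved test body goes here
  }
}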
Please also check whether the compression actually takes effect. Could you compare the size with the original size without compression and check whether it is smaller?
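For what it's worth, a minimal sketch of that kind of size check, assuming the table data has already been written to two local directories (one compressed, one not); the object name, the part- file naming and the directory layout are assumptions for illustration, not part of this PR:

import java.io.File

object CompressionSizeCheck {
  // Sum the length of all data files (part-*) under a directory, skipping
  // Hive staging directories and any non-data files such as checksums.
  def dataSize(f: File): Long = {
    if (!f.exists()) 0L
    else if (f.isFile) {
      if (f.getName.startsWith("part-")) f.length() else 0L
    } else {
      f.listFiles().filterNot(_.getName.startsWith(".hive-staging")).map(dataSize).sum
    }
  }

  def main(args: Array[String]): Unit = {
    val compressed = dataSize(new File(args(0)))
    val uncompressed = dataSize(new File(args(1)))
    // With enough rows written, the compressed table should be smaller on disk.
    assert(compressed < uncompressed,
      s"expected compressed size $compressed to be smaller than uncompressed size $uncompressed")
  }
}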
Ok, I will do it.
It seems a compressed table is not always smaller than an uncompressed one.
SNAPPY-compressed output may be bigger than uncompressed output when the amount of data is not big, so I'd like to check that the sizes are not equal when the compression settings differ.
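If the weaker check is adopted, one option is to parameterize the comparison the same way the removed helpers did with f: (Long, Long) => Boolean; the object name and the sizes below are purely illustrative:

object SizeAssertions {
  // Compare two on-disk sizes with a pluggable predicate; the default "not equal"
  // is the weaker check suggested for small SNAPPY-compressed outputs.
  def checkSizes(compressedSize: Long, otherSize: Long,
      f: (Long, Long) => Boolean = _ != _): Unit = {
    assert(f(compressedSize, otherSize),
      s"size check failed: $compressedSize vs $otherSize")
  }

  def main(args: Array[String]): Unit = {
    checkSizes(1000L, 1500L, _ < _)  // plenty of data: expect strictly smaller output
    checkSizes(1600L, 1500L)         // tiny data: only require the sizes to differ
  }
}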
Fine to me. Thanks!