-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-21786][SQL] The 'spark.sql.parquet.compression.codec' and 'spark.sql.orc.compression.codec' configurations don't take effect on hive table writing #20087
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
9bbfe6e
48cf108
5dbd3ed
5124f1b
6907a3e
67e40d4
e2526ca
8ae86ee
94ac716
43e041f
ee0c558
e9f705d
d3aa7a0
5244aaf
b96a213
a05e85e
b962488
27c949d
79f7263
a51212b
f51c8fd
1860a43
a7cfd6b
eb99b8a
1f5e354
bcfeef5
cd92913
bc4bef4
2ab4012
84707f0
ea9da61
158f7e6
145820b
5b524cc
f9dcdbc
fd4e304
0a30e93
d1f422c
55afac4
bf85301
3e3e938
7236914
e6449e8
0377755
b66700a
f9e7b0c
285d342
bd1a80a
584cdc2
5b150bc
2337edd
43e7eb5
4b89b44
6cf32e0
365c5bf
99271d6
2b9dfbe
5b5e1df
118f788
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,16 +19,16 @@ package org.apache.spark.sql.execution.datasources.parquet | |
|
|
||
| import java.util.Locale | ||
|
|
||
| import org.apache.parquet.hadoop.metadata.CompressionCodecName | ||
| import org.apache.parquet.hadoop.ParquetOutputFormat | ||
| import org.apache.parquet.hadoop.metadata.CompressionCodecName | ||
|
|
||
| import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap | ||
| import org.apache.spark.sql.internal.SQLConf | ||
|
|
||
| /** | ||
| * Options for the Parquet data source. | ||
| */ | ||
| private[parquet] class ParquetOptions( | ||
| class ParquetOptions( | ||
| @transient private val parameters: CaseInsensitiveMap[String], | ||
| @transient private val sqlConf: SQLConf) | ||
| extends Serializable { | ||
|
|
@@ -76,7 +76,7 @@ object ParquetOptions { | |
| val MERGE_SCHEMA = "mergeSchema" | ||
|
|
||
| // The parquet compression short names | ||
| private val shortParquetCompressionCodecNames = Map( | ||
| val shortParquetCompressionCodecNames = Map( | ||
|
||
| "none" -> CompressionCodecName.UNCOMPRESSED, | ||
| "uncompressed" -> CompressionCodecName.UNCOMPRESSED, | ||
| "snappy" -> CompressionCodecName.SNAPPY, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -68,6 +68,10 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand { | |
| .get("mapreduce.output.fileoutputformat.compress.type")) | ||
| } | ||
|
|
||
| // Set compression by priority | ||
| HiveOptions.getHiveWriteCompression(fileSinkConf.getTableInfo, sparkSession.sessionState.conf) | ||
| .foreach { case (compression, codec) => hadoopConf.set(compression, codec) } | ||
|
||
|
|
||
| val committer = FileCommitProtocol.instantiate( | ||
| sparkSession.sessionState.conf.fileCommitProtocolClass, | ||
| jobId = java.util.UUID.randomUUID().toString, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of changing the access modifiers, add a public function