apache · fjh100456 · Sep 13, 2017 · Sep 14, 2017 · Sep 15, 2017 · Sep 15, 2017
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala
@@ -101,6 +101,26 @@ case class InsertIntoHiveTable(
     val tmpLocation = getExternalTmpPath(sparkSession, hadoopConf, tableLocation)
     val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false)
 
+    // Copy the session-level compression codec into the Hadoop conf key that matches the
+    // table's output format; any other output format leaves the conf untouched.
+    // NOTE(review): this replaces any value already set for these keys - confirm that the
+    // session conf is meant to take precedence over table-level compression options.
+    tableDesc.getOutputFileFormatClassName match {
+      case formatName if formatName.endsWith("ParquetOutputFormat") =>
+        val parquetCompression = sparkSession.sessionState.conf.parquetCompressionCodec
+        hadoopConf.set("parquet.compression", parquetCompression)
+      case formatName if formatName.endsWith("OrcOutputFormat") =>
+        val sessionCodec = sparkSession.sessionState.conf.orcCompressionCodec
+        // Locale.ROOT keeps upper-casing locale-independent (a Turkish default locale would
+        // turn the "i" of "zlib" into a dotted capital I). "UNCOMPRESSED" is passed on as "NONE".
+        val orcCompression = sessionCodec.toUpperCase(java.util.Locale.ROOT) match {
+          case "UNCOMPRESSED" => "NONE"
+          case other => other
+        }
+        hadoopConf.set("orc.compress", orcCompression)
+      case _ =>
+    }
+
     val numDynamicPartitions = partition.values.count(_.isEmpty)
     val numStaticPartitions = partition.values.count(_.nonEmpty)
     val partitionSpec = partition.map {

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -728,4 +728,116 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
       assert(e.contains("mismatched input 'ROW'"))
     }
   }
+
+  test("[SPARK-21786] The 'spark.sql.parquet.compression.codec' " +
+    "configuration doesn't take effect on tables with partition field(s)") {
+    // Checks, via the size of the written data files, that the session-level Parquet and ORC
+    // compression codecs are applied to both partitioned and non-partitioned tables.
+    val tableWithPartition = "table_with_partition"
+    val tableNoPartition = "table_no_partition"
+
+    // Overwrites `tableName` (its static partition `p=10000` when `isPartitioned`) with the
+    // rows of `table_source` while the SQL conf `paramName` is set to `codec`.
+    def insertOverwriteTable(tableName: String, paramName: String, codec: String,
+      isPartitioned: Boolean): Unit = {
+      withSQLConf(paramName -> codec) {
+        sql(
+          s"""
+              |INSERT OVERWRITE TABLE $tableName
+              |${if (isPartitioned) "partition (p=10000)" else "" }
+              |SELECT * from table_source
+           """.stripMargin)
+      }
+    }
+
+    // Recursively collects the regular files under `file`, skipping Hive staging directories.
+    def getDirFiles(file: File): List[File] = {
+      if (!file.exists()) {
+        Nil
+      } else if (file.isFile) {
+        List(file)
+      } else {
+        // `listFiles` returns null when the directory cannot be listed; treat that as empty.
+        Option(file.listFiles()).map(_.toList).getOrElse(Nil)
+          .filterNot(_.getName.startsWith(".hive-staging"))
+          .flatMap(getDirFiles)
+      }
+    }
+
+    // Rewrites `tableName` under the given codec setting and returns the total size, in bytes,
+    // of the `part-*` data files found under `tmpDir/tableName`.
+    def getTableSize(tmpDir: File, tableName: String, paramName: String, codec: String,
+      isPartitioned: Boolean = false): Long = {
+      insertOverwriteTable(tableName, paramName, codec, isPartitioned)
+      val tableDir = new File(tmpDir, tableName)
+      val dataFiles = getDirFiles(tableDir).filter(_.getName.startsWith("part-"))
+      // Without this, a wrong path would report 0 bytes and make the comparisons vacuous.
+      assert(dataFiles.nonEmpty, s"No data files found under $tableDir")
+      dataFiles.map(_.length()).sum
+    }
+
+    // Creates a non-partitioned and a partitioned table stored as `format` inside a temp
+    // directory, with the 100000-row temp view `table_source` as input, then runs `f` on it.
+    def checkCompressionCodec(format: String)(f: File => Unit): Unit = {
+      withTempDir { tmpDir =>
+        withTempView("table_source") {
+          (0 until 100000).toDF("a").createOrReplaceTempView("table_source")
+
+          withTable(tableWithPartition, tableNoPartition) {
+            sql(
+              s"""
+               |CREATE TABLE $tableNoPartition(a int)
+               |STORED AS $format
+               |LOCATION '${tmpDir.toURI.toString.stripSuffix("/")}/$tableNoPartition'
+            """.stripMargin)
+            sql(
+              s"""
+               |CREATE TABLE $tableWithPartition(a int)
+               |PARTITIONED BY (p int)
+               |STORED AS $format
+               |LOCATION '${tmpDir.toURI.toString.stripSuffix("/")}/$tableWithPartition'
+            """.stripMargin)
+
+            f(tmpDir)
+          }
+        }
+      }
+    }
+
+    val parquetCompression = "spark.sql.parquet.compression.codec"
+    checkCompressionCodec("PARQUET") { tmpDir =>
+      // Partitioned and non-partitioned tables carry slightly different metadata, so under the
+      // same codec their sizes are only expected to agree to within `maxDiff` bytes, whereas
+      // switching from "uncompressed" to "gzip" should shrink the data by more than that.
+      val maxDiff = 1024
+      val uncompressedPartitioned = getTableSize(
+        tmpDir, tableWithPartition, parquetCompression, "uncompressed", isPartitioned = true)
+      val uncompressedFlat =
+        getTableSize(tmpDir, tableNoPartition, parquetCompression, "uncompressed")
+      val gzipPartitioned =
+        getTableSize(tmpDir, tableWithPartition, parquetCompression, "gzip", isPartitioned = true)
+      val gzipFlat = getTableSize(tmpDir, tableNoPartition, parquetCompression, "gzip")
+      // Compare magnitudes: a partitioned table that ignored the codec could also be smaller.
+      assert(math.abs(uncompressedPartitioned - uncompressedFlat) < maxDiff)
+      assert(math.abs(gzipPartitioned - gzipFlat) < maxDiff)
+      assert(uncompressedPartitioned - gzipPartitioned > maxDiff)
+    }
+
+    val orcCompression = "spark.sql.orc.compression.codec"
+    checkCompressionCodec("ORC") { tmpDir =>
+      val nonePartitioned =
+        getTableSize(tmpDir, tableWithPartition, orcCompression, "none", isPartitioned = true)
+      val noneFlat = getTableSize(tmpDir, tableNoPartition, orcCompression, "none")
+      // "uncompressed" is translated to ORC's "NONE", so it must give the same size as "none".
+      val uncompressedPartitioned = getTableSize(
+        tmpDir, tableWithPartition, orcCompression, "uncompressed", isPartitioned = true)
+      val zlibPartitioned =
+        getTableSize(tmpDir, tableWithPartition, orcCompression, "zlib", isPartitioned = true)
+      val zlibFlat = getTableSize(tmpDir, tableNoPartition, orcCompression, "zlib")
+      assert(nonePartitioned == noneFlat)
+      assert(uncompressedPartitioned == noneFlat)
+      assert(zlibPartitioned == zlibFlat)
+      assert(nonePartitioned > zlibPartitioned)
+    }
+  }
 }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
@@ -1438,39 +1438,43 @@ class HiveDDLSuite
   }
 
   test("create hive serde table with new syntax") {
-    withTable("t", "t2", "t3") {
-      withTempPath { path =>
-        sql(
-          s"""
-            |CREATE TABLE t(id int) USING hive
-            |OPTIONS(fileFormat 'orc', compression 'Zlib')
-            |LOCATION '${path.toURI}'
-          """.stripMargin)
-        val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
-        assert(DDLUtils.isHiveTable(table))
-        assert(table.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde"))
-        assert(table.storage.properties.get("compression") == Some("Zlib"))
-        assert(spark.table("t").collect().isEmpty)
-
-        sql("INSERT INTO t SELECT 1")
-        checkAnswer(spark.table("t"), Row(1))
-        // Check if this is compressed as ZLIB.
-        val maybeOrcFile = path.listFiles().find(!_.getName.endsWith(".crc"))
-        assert(maybeOrcFile.isDefined)
-        val orcFilePath = maybeOrcFile.get.toPath.toString
-        val expectedCompressionKind =
-          OrcFileOperator.getFileReader(orcFilePath).get.getCompression
-        assert("ZLIB" === expectedCompressionKind.name())
-
-        sql("CREATE TABLE t2 USING HIVE AS SELECT 1 AS c1, 'a' AS c2")
-        val table2 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t2"))
-        assert(DDLUtils.isHiveTable(table2))
-        assert(table2.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
-        checkAnswer(spark.table("t2"), Row(1, "a"))
-
-        sql("CREATE TABLE t3(a int, p int) USING hive PARTITIONED BY (p)")
-        sql("INSERT INTO t3 PARTITION(p=1) SELECT 0")
-        checkAnswer(spark.table("t3"), Row(0, 1))
+    // InsertIntoHiveTable copies the session-level ORC codec into `orc.compress`, so pin it to
+    // zlib to keep the ZLIB check on the written file independent of the session default.
+    withSQLConf("spark.sql.orc.compression.codec" -> "zlib") {
+      withTable("t", "t2", "t3") {
+        withTempPath { path =>
+          sql(
+            s"""
+              |CREATE TABLE t(id int) USING hive
+              |OPTIONS(fileFormat 'orc', compression 'Zlib')
+              |LOCATION '${path.toURI}'
+            """.stripMargin)
+          val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t"))
+          assert(DDLUtils.isHiveTable(table))
+          assert(table.storage.serde == Some("org.apache.hadoop.hive.ql.io.orc.OrcSerde"))
+          assert(table.storage.properties.get("compression") == Some("Zlib"))
+          assert(spark.table("t").collect().isEmpty)
+
+          sql("INSERT INTO t SELECT 1")
+          checkAnswer(spark.table("t"), Row(1))
+          // Check that the written ORC file is actually compressed with ZLIB.
+          val maybeOrcFile = path.listFiles().find(!_.getName.endsWith(".crc"))
+          assert(maybeOrcFile.isDefined)
+          val orcFilePath = maybeOrcFile.get.toPath.toString
+          val actualCompressionKind =
+            OrcFileOperator.getFileReader(orcFilePath).get.getCompression
+          assert("ZLIB" === actualCompressionKind.name())
+
+          sql("CREATE TABLE t2 USING HIVE AS SELECT 1 AS c1, 'a' AS c2")
+          val table2 = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t2"))
+          assert(DDLUtils.isHiveTable(table2))
+          assert(table2.storage.serde == Some("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
+          checkAnswer(spark.table("t2"), Row(1, "a"))
+
+          sql("CREATE TABLE t3(a int, p int) USING hive PARTITIONED BY (p)")
+          sql("INSERT INTO t3 PARTITION(p=1) SELECT 0")
+          checkAnswer(spark.table("t3"), Row(0, 1))
+        }
       }
     }
   }