
Commit 8bc420a (parent: 2393e1d)

Use spark.sql.orc.impl.

6 files changed: +19 −18 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
Lines changed: 6 additions & 6 deletions

@@ -363,13 +363,13 @@ object SQLConf {
     .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo"))
     .createWithDefault("snappy")
 
-  val ORC_USE_NEW_VERSION = buildConf("spark.sql.orc.useNewVersion")
-    .doc("When true, use new OrcFileFormat in sql/core module instead of the one in sql/hive. " +
-      "Since new OrcFileFormat uses Apache ORC library instead of ORC library Hive 1.2.1, it is " +
-      "more stable and faster.")
+  val ORC_IMPLEMENTATION = buildConf("spark.sql.orc.impl")
+    .doc("When native, use the native version of ORC support instead of the ORC library in Hive " +
+      "1.2.1. It is 'hive' by default prior to Spark 2.3.")
     .internal()
-    .booleanConf
-    .createWithDefault(true)
+    .stringConf
+    .checkValues(Set("hive", "native"))
+    .createWithDefault("native")
 
   val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown")
     .doc("When true, enable filter pushdown for ORC files.")

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
Lines changed: 5 additions & 4 deletions

@@ -540,7 +540,7 @@ object DataSource extends Logging {
     val csv = classOf[CSVFileFormat].getCanonicalName
     val libsvm = "org.apache.spark.ml.source.libsvm.LibSVMFileFormat"
     val orc = "org.apache.spark.sql.hive.orc.OrcFileFormat"
-    val newOrc = classOf[OrcFileFormat].getCanonicalName
+    val nativeOrc = classOf[OrcFileFormat].getCanonicalName
 
     Map(
       "org.apache.spark.sql.jdbc" -> jdbc,
@@ -557,8 +557,8 @@ object DataSource extends Logging {
       "org.apache.spark.sql.execution.datasources.parquet.DefaultSource" -> parquet,
       "org.apache.spark.sql.hive.orc.DefaultSource" -> orc,
       "org.apache.spark.sql.hive.orc" -> orc,
-      "org.apache.spark.sql.execution.datasources.orc.DefaultSource" -> newOrc,
-      "org.apache.spark.sql.execution.datasources.orc" -> newOrc,
+      "org.apache.spark.sql.execution.datasources.orc.DefaultSource" -> nativeOrc,
+      "org.apache.spark.sql.execution.datasources.orc" -> nativeOrc,
       "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm,
       "org.apache.spark.ml.source.libsvm" -> libsvm,
       "com.databricks.spark.csv" -> csv
@@ -576,7 +576,8 @@ object DataSource extends Logging {
   /** Given a provider name, look up the data source class definition. */
   def lookupDataSource(provider: String, conf: SQLConf): Class[_] = {
     val provider1 = backwardCompatibilityMap.getOrElse(provider, provider) match {
-      case name if name.equalsIgnoreCase("orc") && conf.getConf(SQLConf.ORC_USE_NEW_VERSION) =>
+      case name if name.equalsIgnoreCase("orc") &&
+          conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native" =>
        classOf[OrcFileFormat].getCanonicalName
      case name => name
    }
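The lookupDataSource change is where the new config takes effect: the short name "orc" resolves to the sql/core OrcFileFormat only when the implementation is "native". A rough sketch of exercising the method directly, assuming a sql/core test classpath (the assertions reflect the behavior shown in the tests below):

import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
import org.apache.spark.sql.internal.SQLConf

val conf = new SQLConf

// With the native implementation, "orc" maps to the sql/core format class.
conf.setConfString(SQLConf.ORC_IMPLEMENTATION.key, "native")
assert(DataSource.lookupDataSource("orc", conf) == classOf[OrcFileFormat])

// With "hive", the short name passes through unchanged; on a classpath
// without sql/hive this later surfaces as "The ORC data source must be used
// with Hive support enabled", as the SQLQuerySuite test below shows.
conf.setConfString(SQLConf.ORC_IMPLEMENTATION.key, "hive")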

sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
Lines changed: 2 additions & 2 deletions

@@ -2786,14 +2786,14 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
   }
 
   test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") {
-    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "false") {
+    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "hive") {
       val e = intercept[AnalysisException] {
         sql("CREATE TABLE spark_20728(a INT) USING ORC")
       }
       assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
     }
 
-    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
+    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") {
       withTable("spark_20728") {
         sql("CREATE TABLE spark_20728(a INT) USING ORC")
         val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {

sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
Lines changed: 2 additions & 2 deletions

@@ -478,7 +478,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
   }
 
   test("orc - API and behavior regarding schema") {
-    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
+    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") {
       // Writer
       spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).orc(dir)
       val df = spark.read.orc(dir)
@@ -507,7 +507,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
   }
 
   test("column nullability and comment - write and then read") {
-    withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> "true") {
+    withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") {
       Seq("json", "orc", "parquet", "csv").foreach { format =>
         val schema = StructType(
           StructField("cl1", IntegerType, nullable = false).withComment("test") ::

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
Lines changed: 1 addition & 1 deletion

@@ -194,7 +194,7 @@ case class RelationConversions(
         .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet")
     } else {
       val options = relation.tableMeta.storage.properties
-      if (conf.getConf(SQLConf.ORC_USE_NEW_VERSION)) {
+      if (conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native") {
         sessionCatalog.metastoreCatalog.convertToLogicalRelation(
           relation,
           options,
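On the Hive side, the same config now selects which OrcFileFormat a metastore ORC table is converted to when relation conversion is enabled. A hedged sketch of observing this from a Hive-enabled session (the table name and the explicit convertMetastoreOrc setting are illustrative assumptions, not part of this commit):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .enableHiveSupport()
  .getOrCreate()

// Conversion of metastore ORC tables is gated separately from the impl choice.
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "true")
spark.conf.set("spark.sql.orc.impl", "native")

spark.sql("CREATE TABLE hive_orc_demo (a INT) STORED AS ORC")

// With impl = native the scan should plan the sql/core
// org.apache.spark.sql.execution.datasources.orc.OrcFileFormat;
// with impl = hive it falls back to org.apache.spark.sql.hive.orc.OrcFileFormat.
spark.sql("SELECT * FROM hive_orc_demo").explain()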

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala
Lines changed: 3 additions & 3 deletions

@@ -624,10 +624,10 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest {
 
   test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") {
     Seq(
-      (true, classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]),
-      (false, classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (v, format) =>
+      ("native", classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]),
+      ("hive", classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (i, format) =>
 
-      withSQLConf(SQLConf.ORC_USE_NEW_VERSION.key -> s"$v") {
+      withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> i) {
         withTable("spark_20728") {
           sql("CREATE TABLE spark_20728(a INT) USING ORC")
           val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {
