[SPARK-20728][SQL] Make OrcFileFormat configurable between sql/hive and sql/core #19871
SQLConf.scala:

```diff
@@ -363,6 +363,11 @@ object SQLConf {
     .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo"))
     .createWithDefault("snappy")
 
+  val ORC_ENABLED = buildConf("spark.sql.orc.enabled")
+    .doc("When true, use OrcFileFormat in sql/core module instead of the one in sql/hive module.")
+    .booleanConf
+    .createWithDefault(false)
+
   val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown")
     .doc("When true, enable filter pushdown for ORC files.")
     .booleanConf
```
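As a usage-level illustration of the new flag (not part of the patch; the session builder and file path below are assumptions), flipping `spark.sql.orc.enabled` switches which implementation the `orc` source resolves to:

```scala
// Hedged sketch: exercising spark.sql.orc.enabled from user code.
// The local session and the path are assumptions for the example.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("orc-flag-demo")
  .getOrCreate()

// Default is false: "orc" keeps resolving to the sql/hive implementation
// (and fails with an AnalysisException when Hive support is not on the classpath).
spark.conf.set("spark.sql.orc.enabled", "true")

// With the flag on, this read goes through the new sql/core OrcFileFormat.
val df = spark.read.format("orc").load("/tmp/example.orc")
df.printSchema()
```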
DataSource.scala:

```diff
@@ -36,8 +36,10 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
+import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.streaming._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.{CalendarIntervalType, StructType}
@@ -85,7 +87,7 @@ case class DataSource(
 
   case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String])
 
-  lazy val providingClass: Class[_] = DataSource.lookupDataSource(className)
+  lazy val providingClass: Class[_] = DataSource.lookupDataSource(sparkSession, className)
   lazy val sourceInfo: SourceInfo = sourceSchema()
   private val caseInsensitiveOptions = CaseInsensitiveMap(options)
   private val equality = sparkSession.sessionState.conf.resolver
@@ -568,8 +570,13 @@ object DataSource extends Logging {
     "org.apache.spark.Logging")
 
   /** Given a provider name, look up the data source class definition. */
-  def lookupDataSource(provider: String): Class[_] = {
-    val provider1 = backwardCompatibilityMap.getOrElse(provider, provider)
+  def lookupDataSource(sparkSession: SparkSession, provider: String): Class[_] = {
+    var provider1 = backwardCompatibilityMap.getOrElse(provider, provider)
+    if (Seq("orc", "org.apache.spark.sql.hive.orc.OrcFileFormat").contains(provider1.toLowerCase) &&
+      sparkSession.conf.get(SQLConf.ORC_ENABLED)) {
+      logInfo(s"$provider1 is replaced with ${classOf[OrcFileFormat].getCanonicalName}")
+      provider1 = classOf[OrcFileFormat].getCanonicalName
+    }
     val provider2 = s"$provider1.DefaultSource"
     val loader = Utils.getContextOrSparkClassLoader
     val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader)
```
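A rough sketch of what the new branch in `lookupDataSource` does (the direct call below is illustrative; `DataSource.lookupDataSource` is internal API, and the session is assumed from the previous example):

```scala
// Hedged sketch: the short name "orc" is redirected to sql/core when the flag is on,
// and the replacement is logged by the logInfo call added above.
import org.apache.spark.sql.execution.datasources.DataSource

spark.conf.set("spark.sql.orc.enabled", "true")
val cls = DataSource.lookupDataSource(spark, "orc")
assert(cls.getCanonicalName ==
  "org.apache.spark.sql.execution.datasources.orc.OrcFileFormat")

// With the flag off, "orc" still resolves to the sql/hive OrcFileFormat when Hive
// support is available, or fails with the "must be used with Hive support" error
// exercised by the DDLSourceLoadSuite change below.
```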
DDLSourceLoadSuite.scala:

```diff
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources
 
 import org.apache.spark.sql.{AnalysisException, SQLContext}
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
 import org.apache.spark.sql.types._
@@ -54,11 +55,17 @@ class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {
       .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
   }
 
-  test("should fail to load ORC without Hive Support") {
-    val e = intercept[AnalysisException] {
-      spark.read.format("orc").load()
+  test("should fail to load ORC only if spark.sql.orc.enabled=false and without Hive Support") {
+    Seq(
+      (true, "Unable to infer schema for ORC. It must be specified manually"),
+      (false, "The ORC data source must be used with Hive support")).foreach { case (value, m) =>
+      withSQLConf(SQLConf.ORC_ENABLED.key -> s"$value") {
+        val e = intercept[AnalysisException] {
+          spark.read.format("orc").load()
+        }
+        assert(e.message.contains(m))
+      }
     }
-    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
   }
 }
```
HiveStrategies.scala (RelationConversions):

```diff
@@ -195,8 +195,18 @@ case class RelationConversions(
         .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet")
     } else {
       val options = relation.tableMeta.storage.properties
-      sessionCatalog.metastoreCatalog
-        .convertToLogicalRelation(relation, options, classOf[OrcFileFormat], "orc")
+      if (conf.getConf(SQLConf.ORC_ENABLED)) {
+        sessionCatalog.metastoreCatalog.convertToLogicalRelation(
+          relation,
+          options,
+          classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat],
+          "orc")
+      } else {
+        sessionCatalog.metastoreCatalog.convertToLogicalRelation(
+          relation,
+          options,
+          classOf[org.apache.spark.sql.hive.orc.OrcFileFormat], "orc")
+      }
     }
   }
```
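At the SQL level, the effect of this branch could be observed roughly as follows (illustrative only; it assumes a Hive-enabled session and that `spark.sql.hive.convertMetastoreOrc` is set so `RelationConversions` actually applies; the table name is made up):

```scala
// Hedged sketch: which OrcFileFormat backs a converted metastore ORC table now
// follows spark.sql.orc.enabled. Settings and table name are assumptions.
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "true") // make the rule applicable
spark.conf.set("spark.sql.orc.enabled", "true")              // choose the sql/core reader

spark.sql("CREATE TABLE hive_orc_demo(id INT) STORED AS ORC")
val chosen = spark.sql("SELECT * FROM hive_orc_demo").queryExecution.analyzed.collectFirst {
  case l: org.apache.spark.sql.execution.datasources.LogicalRelation =>
    l.relation.asInstanceOf[org.apache.spark.sql.execution.datasources.HadoopFsRelation]
      .fileFormat.getClass.getName
}
// Expected: Some(org.apache.spark.sql.execution.datasources.orc.OrcFileFormat);
// with the flag off, the sql/hive class instead.
println(chosen)
```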
SQLQuerySuite.scala (sql/hive):

```diff
@@ -2153,4 +2153,21 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
+
+  test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") {
+    Seq(
+      (true, classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]),
+      (false, classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (v, format) =>
+      withSQLConf(SQLConf.ORC_ENABLED.key -> s"$v") {
+        withTable("spark_20728") {
+          sql("CREATE TABLE spark_20728(a INT) USING ORC")
+          val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst {
+            case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass
+          }
+          assert(fileFormat == Some(format))
+        }
+      }
+    }
+  }
 }
```
Review comment: How about `spark.sql.orc.useNewVersion`? Also, let's make it an internal config and enable it by default.
Reply: Sure!
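For reference, the config definition with that suggestion applied would look roughly like this (the name, `internal()` marker, and default come from the comment above; this is a hypothetical shape, not the merged change):

```scala
// Hypothetical follow-up inside object SQLConf, reflecting the review suggestion:
// renamed, marked internal, and enabled by default.
val ORC_USE_NEW_VERSION = buildConf("spark.sql.orc.useNewVersion")
  .internal()
  .doc("When true, use OrcFileFormat in sql/core module instead of the one in sql/hive module.")
  .booleanConf
  .createWithDefault(true)
```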