[SPARK-20728][SQL] Make OrcFileFormat configurable between sql/hive and sql/core #19871
In SQLConf:

@@ -363,6 +363,14 @@ object SQLConf {
     .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo"))
     .createWithDefault("snappy")

+  val ORC_USE_NEW_VERSION = buildConf("spark.sql.orc.useNewVersion")
+    .doc("When true, use new OrcFileFormat in sql/core module instead of the one in sql/hive. " +
+      "Since new OrcFileFormat uses Apache ORC library instead of ORC library Hive 1.2.1, it is " +
+      "more stable and faster.")
+    .internal()
+    .booleanConf
+    .createWithDefault(true)
+
   val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown")
     .doc("When true, enable filter pushdown for ORC files.")
     .booleanConf
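A quick usage sketch (not part of the patch): the SparkSession variable `spark` and the path are assumed, and the fallback read assumes the sql/hive classes are on the classpath. It only illustrates that the new flag picks the ORC implementation behind the plain "orc" format name:

    // default (true): "orc" resolves to the new sql/core OrcFileFormat (Apache ORC library)
    spark.conf.set("spark.sql.orc.useNewVersion", "true")
    val dfNew = spark.read.format("orc").load("/tmp/people.orc")  // hypothetical path

    // false: "orc" falls back to the Hive 1.2.1 based OrcFileFormat in sql/hive
    spark.conf.set("spark.sql.orc.useNewVersion", "false")
    val dfOld = spark.read.format("orc").load("/tmp/people.orc")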
In DataSource.scala:

@@ -36,8 +36,10 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat
 import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
+import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.streaming._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.{CalendarIntervalType, StructType}

@@ -85,7 +87,8 @@ case class DataSource(

   case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String])

-  lazy val providingClass: Class[_] = DataSource.lookupDataSource(className)
+  lazy val providingClass: Class[_] =
+    DataSource.lookupDataSource(className, sparkSession.sessionState.conf)
   lazy val sourceInfo: SourceInfo = sourceSchema()
   private val caseInsensitiveOptions = CaseInsensitiveMap(options)
   private val equality = sparkSession.sessionState.conf.resolver
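Since lookupDataSource now takes the session's SQLConf, the resolution can be exercised directly. A hedged, test-style sketch (the object name is made up; it sits in the org.apache.spark.sql package only because sessionState is private[sql], which is also how Spark's own suites reach this internal API):

    package org.apache.spark.sql

    import org.apache.spark.sql.execution.datasources.DataSource

    object OrcLookupCheck {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[1]").appName("orc-lookup").getOrCreate()

        // Expected: org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
        spark.conf.set("spark.sql.orc.useNewVersion", "true")
        println(DataSource.lookupDataSource("orc", spark.sessionState.conf).getCanonicalName)

        // Expected (when sql/hive is on the classpath): org.apache.spark.sql.hive.orc.OrcFileFormat
        spark.conf.set("spark.sql.orc.useNewVersion", "false")
        println(DataSource.lookupDataSource("orc", spark.sessionState.conf).getCanonicalName)

        spark.stop()
      }
    }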
@@ -537,6 +540,7 @@ object DataSource extends Logging {
     val csv = classOf[CSVFileFormat].getCanonicalName
     val libsvm = "org.apache.spark.ml.source.libsvm.LibSVMFileFormat"
     val orc = "org.apache.spark.sql.hive.orc.OrcFileFormat"
+    val newOrc = classOf[OrcFileFormat].getCanonicalName

     Map(
       "org.apache.spark.sql.jdbc" -> jdbc,

@@ -553,6 +557,8 @@ object DataSource extends Logging {
       "org.apache.spark.sql.execution.datasources.parquet.DefaultSource" -> parquet,
       "org.apache.spark.sql.hive.orc.DefaultSource" -> orc,
       "org.apache.spark.sql.hive.orc" -> orc,
+      "org.apache.spark.sql.execution.datasources.orc.DefaultSource" -> newOrc,
+      "org.apache.spark.sql.execution.datasources.orc" -> newOrc,
       "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm,
       "org.apache.spark.ml.source.libsvm" -> libsvm,
       "com.databricks.spark.csv" -> csv
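With these backwardCompatibilityMap entries, fully qualified provider names stay pinned to one implementation; only the short name "orc" is switched by the new flag (see lookupDataSource below). An illustrative sketch, assuming `spark` and a hypothetical path:

    // always the Hive 1.2.1 based OrcFileFormat, regardless of spark.sql.orc.useNewVersion
    spark.read.format("org.apache.spark.sql.hive.orc").load("/tmp/a.orc")
    // always the new sql/core OrcFileFormat
    spark.read.format("org.apache.spark.sql.execution.datasources.orc").load("/tmp/a.orc")
    // switched by spark.sql.orc.useNewVersion
    spark.read.format("orc").load("/tmp/a.orc")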
@@ -568,8 +574,12 @@ object DataSource extends Logging {
       "org.apache.spark.Logging")

   /** Given a provider name, look up the data source class definition. */
-  def lookupDataSource(provider: String): Class[_] = {
-    val provider1 = backwardCompatibilityMap.getOrElse(provider, provider)
+  def lookupDataSource(provider: String, conf: SQLConf): Class[_] = {
+    val provider1 = backwardCompatibilityMap.getOrElse(provider, provider) match {
+      case name if name.equalsIgnoreCase("orc") && conf.getConf(SQLConf.ORC_USE_NEW_VERSION) =>
+        classOf[OrcFileFormat].getCanonicalName
+      case name => name
Inline review thread on the change above:

Contributor: if
Member: I was looking at the exact same path. It seems not because it's not registered to
Member (Author): @cloud-fan. To avoid that issue, new OrcFileFormat is not registered intentionally.
Contributor: This sounds counter-intuitive, I think we should register the new orc instead of the old one.
Contributor: and also add comments here.
Member: +1 for ^
Member (Author): I agree with both of you. Just for explanation: the original design completely preserves the previous behavior. Anyway, I'm happy to update according to your advice. :)
Member (Author): So, there is no more
Contributor: sounds good
Member (Author): And for here, I added the following to prevent
The hunk continues:

+    }
     val provider2 = s"$provider1.DefaultSource"
     val loader = Utils.getContextOrSparkClassLoader
     val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader)
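The ServiceLoader call above discovers sources by the short name they expose through DataSourceRegister, which is what the registration discussion in the thread refers to: in this revision the new OrcFileFormat deliberately has no such registration, presumably so that two sources do not both answer to the "orc" short name. A minimal sketch of the mechanism (the class name is made up; DataSourceRegister is the actual Spark trait):

    import org.apache.spark.sql.sources.DataSourceRegister

    // A source advertises its short name here; it also needs a
    // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister entry
    // so that ServiceLoader can find it.
    class ExampleOrcSource extends DataSourceRegister {
      override def shortName(): String = "orc"
    }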
A separate comment on the config name:

Comment: spark.sql.orc.impl (suggested as the name for the new config)
Member (Author): No problem to change to it. But, since the name is given by @cloud-fan before, ping @cloud-fan.
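Not part of this diff: a sketch of how the suggested spark.sql.orc.impl key could be expressed with the same builder API as ORC_USE_NEW_VERSION above, assuming a string value selects the implementation (the value names "hive" and "native" are assumptions, not something this PR defines):

    val ORC_IMPLEMENTATION = buildConf("spark.sql.orc.impl")
      .doc("Selects the ORC implementation: 'native' for the sql/core OrcFileFormat backed by " +
        "Apache ORC, 'hive' for the Hive 1.2.1 based one in sql/hive.")
      .internal()
      .stringConf
      .checkValues(Set("hive", "native"))
      .createWithDefault("native")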