-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24836][SQL] New option for Avro datasource - ignoreExtension #21798
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
cdb26f2
d07ced3
e86d231
565e599
d1c511c
3bd3475
0657508
3206a20
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -62,16 +62,14 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister { | |
| // Schema evolution is not supported yet. Here we only pick a single random sample file to | ||
| // figure out the schema of the whole dataset. | ||
| val sampleFile = | ||
| if (AvroFileFormat.ignoreFilesWithoutExtensions(conf)) { | ||
| files.find(_.getPath.getName.endsWith(".avro")).getOrElse { | ||
| throw new FileNotFoundException( | ||
| "No Avro files found. Hadoop option \"avro.mapred.ignore.inputs.without.extension\" " + | ||
| " is set to true. Do all input files have \".avro\" extension?" | ||
| ) | ||
| if (AvroFileFormat.ignoreExtension(conf, options)) { | ||
| files.headOption.getOrElse { | ||
| throw new FileNotFoundException("Files for schema inferring have been not found.") | ||
| } | ||
| } else { | ||
| files.headOption.getOrElse { | ||
| throw new FileNotFoundException("No Avro files found.") | ||
| files.find(_.getPath.getName.endsWith(".avro")).getOrElse { | ||
| throw new FileNotFoundException( | ||
| "No Avro files found. If files don't have .avro extension, set ignoreExtension to true") | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -170,9 +168,7 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister { | |
| // Doing input file filtering is improper because we may generate empty tasks that process no | ||
| // input files but stress the scheduler. We should probably add a more general input file | ||
| // filtering mechanism for `FileFormat` data sources. See SPARK-16317. | ||
| if (AvroFileFormat.ignoreFilesWithoutExtensions(conf) && !file.filePath.endsWith(".avro")) { | ||
| Iterator.empty | ||
| } else { | ||
| if (AvroFileFormat.ignoreExtension(conf, options) || file.filePath.endsWith(".avro")) { | ||
| val reader = { | ||
| val in = new FsInput(new Path(new URI(file.filePath)), conf) | ||
| try { | ||
|
|
@@ -227,6 +223,8 @@ private[avro] class AvroFileFormat extends FileFormat with DataSourceRegister { | |
| deserializer.deserialize(record).asInstanceOf[InternalRow] | ||
| } | ||
| } | ||
| } else { | ||
| Iterator.empty | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -276,10 +274,15 @@ private[avro] object AvroFileFormat { | |
| } | ||
| } | ||
|
|
||
| def ignoreFilesWithoutExtensions(conf: Configuration): Boolean = { | ||
| // Files without .avro extensions are not ignored by default | ||
| val defaultValue = false | ||
| def ignoreExtension(conf: Configuration, options: Map[String, String]): Boolean = { | ||
| val ignoreFilesWithoutExtensionByDefault = false | ||
| val ignoreFilesWithoutExtension = conf.getBoolean( | ||
| AvroFileFormat.IgnoreFilesWithoutExtensionProperty, | ||
| ignoreFilesWithoutExtensionByDefault) | ||
|
|
||
| conf.getBoolean(AvroFileFormat.IgnoreFilesWithoutExtensionProperty, defaultValue) | ||
| options | ||
| .get("ignoreExtension") | ||
|
||
| .map(_.toBoolean) | ||
| .getOrElse(!ignoreFilesWithoutExtension) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we have a class `AvroOptions`, like what we are doing for the other built-in data sources?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would you like to see it as a part of this PR or a separate one? I would extract some common code like `getBool()` from `CSVOptions` to a separate trait and extend `AvroOptions` by it.