[SPARK-20682][SPARK-15474][SPARK-21791] Add new ORCFileFormat based on ORC 1.4.1 #19651
Changes from 1 commit
```diff
@@ -23,22 +23,70 @@ import org.apache.orc.storage.common.`type`.HiveDecimal
 import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable}

 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow
+import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificInternalRow}
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.execution.datasources.orc.OrcUtils.{getTypeDescription, withNullSafe}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String

 private[orc] class OrcSerializer(dataSchema: StructType) {

-  private[this] lazy val orcStruct: OrcStruct =
-    createOrcValue(dataSchema).asInstanceOf[OrcStruct]
+  private[this] lazy val orcStruct: OrcStruct = createOrcValue(dataSchema).asInstanceOf[OrcStruct]

-  private[this] val writableWrappers =
-    dataSchema.fields.map(f => getWritableWrapper(f.dataType))
+  private[this] lazy val length = dataSchema.length
+
+  private[this] val writers = dataSchema.map(_.dataType).map(makeWriter).toArray

   def serialize(row: InternalRow): OrcStruct = {
-    convertInternalRowToOrcStruct(row, dataSchema, Some(writableWrappers), Some(orcStruct))
+    var i = 0
+    while (i < length) {
+      if (row.isNullAt(i)) {
+        orcStruct.setFieldValue(i, null)
+      } else {
+        writers(i)(row, i)
+      }
+      i += 1
+    }
+    orcStruct
   }

+  private[this] def makeWriter(dataType: DataType): (SpecializedGetters, Int) => Unit = {
+    dataType match {
+      case BooleanType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new BooleanWritable(row.getBoolean(ordinal)))
+
+      case ByteType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new ByteWritable(row.getByte(ordinal)))
+
+      case ShortType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new ShortWritable(row.getShort(ordinal)))
+
+      case IntegerType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new IntWritable(row.getInt(ordinal)))
+
+      case LongType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new LongWritable(row.getLong(ordinal)))
+
+      case FloatType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new FloatWritable(row.getFloat(ordinal)))
+
+      case DoubleType =>
+        (row: SpecializedGetters, ordinal: Int) =>
+          orcStruct.setFieldValue(ordinal, new DoubleWritable(row.getDouble(ordinal)))
+
+      case _ =>
+        val wrapper = getWritableWrapper(dataType)
+        (row: SpecializedGetters, ordinal: Int) => {
+          val value = wrapper(row.get(ordinal, dataType)).asInstanceOf[WritableComparable[_]]
+          orcStruct.setFieldValue(ordinal, value)
+        }
+    }
+  }

   /**
@@ -50,24 +98,22 @@ private[orc] class OrcSerializer(dataSchema: StructType) {
   /**
    * Convert Apache Spark InternalRow to Apache ORC OrcStruct.
    */
-  private[this] def convertInternalRowToOrcStruct(
-      row: InternalRow,
-      schema: StructType,
-      valueWrappers: Option[Seq[Any => Any]] = None,
-      struct: Option[OrcStruct] = None): OrcStruct = {
-    val wrappers =
-      valueWrappers.getOrElse(schema.fields.map(_.dataType).map(getWritableWrapper).toSeq)
-    val orcStruct = struct.getOrElse(createOrcValue(schema).asInstanceOf[OrcStruct])
-
-    for (schemaIndex <- 0 until schema.length) {
-      val fieldType = schema(schemaIndex).dataType
-      if (row.isNullAt(schemaIndex)) {
-        orcStruct.setFieldValue(schemaIndex, null)
+  private[this] def convertInternalRowToOrcStruct(row: InternalRow, schema: StructType) = {
+    val wrappers = schema.map(_.dataType).map(getWritableWrapper).toArray
+    val orcStruct = createOrcValue(schema).asInstanceOf[OrcStruct]
+
+    var i = 0
+    val length = schema.length
+    while (i < length) {
+      val fieldType = schema(i).dataType
+      if (row.isNullAt(i)) {
+        orcStruct.setFieldValue(i, null)
       } else {
-        val field = row.get(schemaIndex, fieldType)
-        val fieldValue = wrappers(schemaIndex)(field).asInstanceOf[WritableComparable[_]]
-        orcStruct.setFieldValue(schemaIndex, fieldValue)
+        val field = row.get(i, fieldType)
+        val fieldValue = wrappers(i)(field).asInstanceOf[WritableComparable[_]]
+        orcStruct.setFieldValue(i, fieldValue)
       }
+      i += 1
     }
     orcStruct
   }
```
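The rewritten `OrcSerializer` resolves one writer function per field up front: primitive types get specialized writers that read typed values straight off the row, while all other types fall back to the generic `getWritableWrapper` path. A minimal usage sketch of the result (the schema and values here are made up for illustration, and the class's `private[orc]` visibility is ignored):

```scala
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(
  StructField("id", LongType),
  StructField("name", StringType)))

// Writers are built once from the schema; serialize only loops and assigns.
val serializer = new OrcSerializer(schema)

val row = new GenericInternalRow(Array[Any](1L, UTF8String.fromString("spark")))
val orcStruct = serializer.serialize(row)
```

Note that `serialize` fills and returns the same `OrcStruct` instance on every call, so a caller must consume (or copy) the result before serializing the next row.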
Conversations

how about
also add array and map type here, then we can remove `convertOrcStructToInternalRow` and `getValueUnwrapper`
oops, this approach doesn't work for array and map types. I think we should follow `ParquetRowConverter` and introduce a trait `OrcDataUpdater`; then we implement `StructDataUpdater`, `ArrayDataUpdater`, and `MapDataUpdater`, and the returned function should be `(Any, OrcDataUpdater, Int) => Unit`
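A minimal sketch of that proposal, with hypothetical names taken from the comment above (this is not code from the PR):

```scala
import org.apache.hadoop.io.WritableComparable
import org.apache.orc.mapred.OrcStruct

// Hypothetical updater abstraction modeled on ParquetRowConverter's updaters.
// Writer functions of shape (Any, OrcDataUpdater, Int) => Unit would call the
// updater instead of OrcStruct.setFieldValue directly, so one writer family
// could serve struct fields, array elements, and map entries alike.
trait OrcDataUpdater {
  def set(ordinal: Int, value: WritableComparable[_]): Unit
  def setNull(ordinal: Int): Unit
}

// Struct fields are positional: the ordinal picks the field to fill.
class StructDataUpdater(struct: OrcStruct) extends OrcDataUpdater {
  override def set(ordinal: Int, value: WritableComparable[_]): Unit =
    struct.setFieldValue(ordinal, value)
  override def setNull(ordinal: Int): Unit =
    struct.setFieldValue(ordinal, null)
}
```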
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@cloud-fan . The current way is an old ORC way. Do we need to introduce the Parquet way for some performance reason?
And, for example, if the mappings look like the following, do we need to refactor some of the pattern shared between Parquet and ORC?

- `ArrayDataUpdater` <= `ParquetArrayConverter`
- `MapDataUpdater` <= `ParquetMapConverter`
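As a rough illustration of that mapping, an `ArrayDataUpdater` could plug into the `OrcDataUpdater` trait sketched above (hypothetical; a real implementation would wrap ORC's `OrcList`, simplified here to `java.util.List`):

```scala
import org.apache.hadoop.io.WritableComparable

// Hypothetical counterpart to ParquetArrayConverter: array elements arrive in
// order, so the updater appends and the ordinal is effectively unused.
class ArrayDataUpdater(list: java.util.List[WritableComparable[_]])
  extends OrcDataUpdater {
  override def set(ordinal: Int, value: WritableComparable[_]): Unit = list.add(value)
  override def setNull(ordinal: Int): Unit = list.add(null)
}
```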
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel Updater is more precise here, let's not follow the naming of parquet.
It's not only for performance; it also removes duplicated code.
I see. Thanks.