-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-31238][SQL] Rebase dates to/from Julian calendar in write/read for ORC datasource #28016
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a2a081a
121b4c6
ecef05d
47d8588
29d7966
3b1b791
c8a897a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,7 @@ | |
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.hive | ||
| package org.apache.spark.sql.execution.datasources | ||
|
|
||
| import java.io.{DataInput, DataOutput, IOException} | ||
| import java.sql.Date | ||
|
|
@@ -35,11 +35,12 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulian | |
| * @param julianDays The number of days since the epoch 1970-01-01 in | ||
| * Julian calendar. | ||
| */ | ||
| private[hive] class DaysWritable( | ||
| class DaysWritable( | ||
| var gregorianDays: Int, | ||
| var julianDays: Int) | ||
| extends DateWritable { | ||
|
|
||
| def this() = this(0, 0) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I assume that |
||
| def this(gregorianDays: Int) = | ||
| this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays)) | ||
| def this(dateWritable: DateWritable) = { | ||
|
|
@@ -55,6 +56,11 @@ private[hive] class DaysWritable( | |
| override def getDays: Int = julianDays | ||
| override def get(): Date = new Date(DateWritable.daysToMillis(julianDays)) | ||
|
|
||
| override def set(d: Int): Unit = { | ||
| gregorianDays = d | ||
| julianDays = rebaseGregorianToJulianDays(d) | ||
| } | ||
|
|
||
| @throws[IOException] | ||
| override def write(out: DataOutput): Unit = { | ||
| WritableUtils.writeVInt(out, julianDays) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.sql.execution.datasources.orc | ||
|
|
||
| import java.io.{DataInput, DataOutput, IOException} | ||
| import java.sql.Date | ||
|
|
||
| import org.apache.hadoop.io.WritableUtils | ||
| import org.apache.orc.storage.serde2.io.DateWritable | ||
|
|
||
| import org.apache.spark.sql.catalyst.util.DateTimeUtils.{rebaseGregorianToJulianDays, rebaseJulianToGregorianDays} | ||
|
|
||
| /** | ||
| * The class accepts/returns days in Gregorian calendar and rebases them | ||
| * via conversion to local date in Julian calendar for dates before 1582-10-15 | ||
| * in read/write for backward compatibility with Spark 2.4 and earlier versions. | ||
| * | ||
| * This is a clone of `org.apache.spark.sql.execution.datasources.DaysWritable`. | ||
| * The class is cloned because Hive ORC v1.2 uses a different `DateWritable`: | ||
| * - v1.2: `org.apache.orc.storage.serde2.io.DateWritable` | ||
| * - v2.3 and `HiveInspectors`: `org.apache.hadoop.hive.serde2.io.DateWritable` | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't need the above 4 line comments because
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I move it to 2.3, and enable hive-1.2,
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it. Let me check again.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ? Sorry, why do you move this to 2.3?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What I ask is to keep this AS-IS, and move the other
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm checking again from my side.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is what I tried in MaxGekk#26, and it fails when I enable hive-1.2: Feel free to open a PR against this PR if you mean something different. |
||
| * | ||
| * @param gregorianDays The number of days since the epoch 1970-01-01 in | ||
| * Gregorian calendar. | ||
| * @param julianDays The number of days since the epoch 1970-01-01 in | ||
| * Julian calendar. | ||
| */ | ||
| class DaysWritable( | ||
| var gregorianDays: Int, | ||
| var julianDays: Int) | ||
| extends DateWritable { | ||
|
|
||
| def this() = this(0, 0) | ||
| def this(gregorianDays: Int) = | ||
| this(gregorianDays, rebaseGregorianToJulianDays(gregorianDays)) | ||
| def this(dateWritable: DateWritable) = { | ||
| this( | ||
| gregorianDays = dateWritable match { | ||
| case daysWritable: DaysWritable => daysWritable.gregorianDays | ||
| case dateWritable: DateWritable => | ||
| rebaseJulianToGregorianDays(dateWritable.getDays) | ||
| }, | ||
| julianDays = dateWritable.getDays) | ||
| } | ||
|
|
||
| override def getDays: Int = julianDays | ||
| override def get(): Date = new Date(DateWritable.daysToMillis(julianDays)) | ||
|
|
||
| override def set(d: Int): Unit = { | ||
| gregorianDays = d | ||
| julianDays = rebaseGregorianToJulianDays(d) | ||
| } | ||
|
|
||
| @throws[IOException] | ||
| override def write(out: DataOutput): Unit = { | ||
| WritableUtils.writeVInt(out, julianDays) | ||
| } | ||
|
|
||
| @throws[IOException] | ||
| override def readFields(in: DataInput): Unit = { | ||
| julianDays = WritableUtils.readVInt(in) | ||
| gregorianDays = rebaseJulianToGregorianDays(julianDays) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.{Operator => OrcOperator} | |
| import org.apache.hadoop.hive.serde2.io.{DateWritable, HiveDecimalWritable} | ||
|
|
||
| import org.apache.spark.sql.catalyst.expressions.SpecializedGetters | ||
| import org.apache.spark.sql.execution.datasources.DaysWritable | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After moving |
||
| import org.apache.spark.sql.types.Decimal | ||
|
|
||
| /** | ||
|
|
@@ -47,13 +48,13 @@ private[sql] object OrcShimUtils { | |
|
|
||
| def getDateWritable(reuseObj: Boolean): (SpecializedGetters, Int) => DateWritable = { | ||
| if (reuseObj) { | ||
| val result = new DateWritable() | ||
| val result = new DaysWritable() | ||
| (getter, ordinal) => | ||
| result.set(getter.getInt(ordinal)) | ||
| result | ||
| } else { | ||
| (getter: SpecializedGetters, ordinal: Int) => | ||
| new DateWritable(getter.getInt(ordinal)) | ||
| new DaysWritable(getter.getInt(ordinal)) | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -32,6 +32,7 @@ import org.apache.spark.sql.AnalysisException | |
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.util._ | ||
| import org.apache.spark.sql.execution.datasources.DaysWritable | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unnecessary change
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If I remove it, the build fails with: I moved |
||
| import org.apache.spark.sql.types | ||
| import org.apache.spark.sql.types._ | ||
| import org.apache.spark.unsafe.types.UTF8String | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.