-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-31159][SQL] Rebase date/timestamp from/to Julian calendar in parquet #27915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 24 commits
ae83fdc
4c35ddb
74774fc
8d74214
56ca744
0cfeed5
78cbd6c
13aad60
96573a9
36c0400
f0a2df6
9e3c201
053861c
1624756
e3bbcb5
d1e6d84
acd33f1
41fc33f
fe9f130
c2c53b8
8e94359
d6f7e6b
81d342a
63428ab
a34a9ce
e590d36
262f744
8947298
276d159
167b463
bbc4a1a
a1b34cb
d7debb4
6bebf3b
67cec02
8fa19a6
a061870
ae49cc4
a96392c
5b52735
184fcd8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,7 @@ import java.nio.charset.StandardCharsets | |
| import java.sql.{Date, Timestamp} | ||
| import java.time._ | ||
| import java.time.temporal.{ChronoField, ChronoUnit, IsoFields} | ||
| import java.util.{Locale, TimeZone} | ||
| import java.util.{Calendar, Locale, TimeZone} | ||
| import java.util.concurrent.TimeUnit._ | ||
|
|
||
| import scala.util.control.NonFatal | ||
|
|
@@ -974,4 +974,102 @@ object DateTimeUtils { | |
| } | ||
| }.mkString("'") | ||
| } | ||
|
|
||
| /** | ||
| * Converts the given microseconds to a local date-time in the system default time zone in | ||
| * Proleptic Gregorian calendar, interprets the result as a local date-time in Julian calendar | ||
| * in the same time zone, and takes microseconds since the epoch from the Julian timestamp. | ||
| * | ||
| * @param micros The number of microseconds since the epoch '1970-01-01T00:00:00Z'. | ||
| * @return The rebased microseconds since the epoch in Julian calendar. | ||
| */ | ||
| def rebaseGregorianToJulianMicros(micros: Long): Long = { | ||
| val ldt = microsToInstant(micros).atZone(ZoneId.systemDefault).toLocalDateTime | ||
| val utcCal = new Calendar.Builder() | ||
| // `gregory` is a hybrid calendar that supports both | ||
| // the Julian and Gregorian calendar systems | ||
| .setCalendarType("gregory") | ||
| .setDate(ldt.getYear, ldt.getMonthValue - 1, ldt.getDayOfMonth) | ||
| .setTimeOfDay(ldt.getHour, ldt.getMinute, ldt.getSecond) | ||
| .build() | ||
| millisToMicros(utcCal.getTimeInMillis) + ldt.get(ChronoField.MICRO_OF_SECOND) | ||
| } | ||
|
|
||
| /** | ||
| * Converts the given microseconds to a local date-time in the system default time zone in Julian | ||
| * calendar, interprets the result as a local date-time in Proleptic Gregorian calendar in the | ||
| * same time zone, and takes microseconds since the epoch from the Gregorian timestamp. | ||
| * | ||
| * @param micros The number of microseconds since the epoch '1970-01-01T00:00:00Z'. | ||
| * @return The rebased microseconds since the epoch in Proleptic Gregorian calendar. | ||
| */ | ||
| def rebaseJulianToGregorianMicros(micros: Long): Long = { | ||
| val utcCal = new Calendar.Builder() | ||
| // `gregory` is a hybrid calendar that supports both | ||
| // the Julian and Gregorian calendar systems | ||
| .setCalendarType("gregory") | ||
| .setInstant(microsToMillis(micros)) | ||
| .build() | ||
| val localDateTime = LocalDateTime.of( | ||
| utcCal.get(Calendar.YEAR), | ||
| utcCal.get(Calendar.MONTH) + 1, | ||
| utcCal.get(Calendar.DAY_OF_MONTH), | ||
| utcCal.get(Calendar.HOUR_OF_DAY), | ||
| utcCal.get(Calendar.MINUTE), | ||
| utcCal.get(Calendar.SECOND), | ||
| (Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt) | ||
| instantToMicros(localDateTime.atZone(ZoneId.systemDefault).toInstant) | ||
| } | ||
|
|
||
| /** | ||
| * Converts the given number of days since the epoch day 1970-01-01 to | ||
| * a local date in Julian calendar, interprets the result as a local | ||
| * date in Proleptic Gregorian calendar, and takes the number of days | ||
| * since the epoch from the Gregorian date. | ||
| * | ||
| * @param days The number of days since the epoch in Julian calendar. | ||
| * @return The rebased number of days in Gregorian calendar. | ||
| */ | ||
| def rebaseJulianToGregorianDays(days: Int): Int = { | ||
| val utcCal = new Calendar.Builder() | ||
| // `gregory` is a hybrid calendar that supports both | ||
| // the Julian and Gregorian calendar systems | ||
| .setCalendarType("gregory") | ||
| .setTimeZone(TimeZoneUTC) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can use a particular time zone here because the conversion of "logical" days is actually independent of the time zone. |
||
| .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) | ||
| .build() | ||
| val localDate = LocalDate.of( | ||
| utcCal.get(Calendar.YEAR), | ||
| utcCal.get(Calendar.MONTH) + 1, | ||
| utcCal.get(Calendar.DAY_OF_MONTH)) | ||
| Math.toIntExact(localDate.toEpochDay) | ||
| } | ||
|
|
||
| /** | ||
| * Rebasing days since the epoch to store the same number of days | ||
| * as by Spark 2.4 and earlier versions. Spark 3.0 switched to | ||
| * Proleptic Gregorian calendar (see SPARK-26651), and as a consequence of that, | ||
| * this affects dates before 1582-10-15. Spark 2.4 and earlier versions use | ||
| * Julian calendar for dates before 1582-10-15. So, the same local date may | ||
| * be mapped to different number of days since the epoch in different calendars. | ||
| * | ||
| * For example: | ||
| * Proleptic Gregorian calendar: 1582-01-01 -> -141714 | ||
| * Julian calendar: 1582-01-01 -> -141704 | ||
| * The code below converts -141714 to -141704. | ||
| * | ||
| * @param days The number of days since the epoch 1970-01-01. It can be negative. | ||
| * @return The rebased number of days since the epoch in Julian calendar. | ||
| */ | ||
| def rebaseGregorianToJulianDays(days: Int): Int = { | ||
| val localDate = LocalDate.ofEpochDay(days) | ||
| val utcCal = new Calendar.Builder() | ||
| // `gregory` is a hybrid calendar that supports both | ||
| // the Julian and Gregorian calendar systems | ||
| .setCalendarType("gregory") | ||
| .setTimeZone(TimeZoneUTC) | ||
| .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) | ||
| .build() | ||
| Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2496,6 +2496,19 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val LEGACY_PARQUET_REBASE_DATETIME = | ||
| buildConf("spark.sql.legacy.parquet.rebaseDateTime.enabled") | ||
| .internal() | ||
| .doc("When true, rebase dates/timestamps before 1582-10-15 from Proleptic " + | ||
|
||
| "Gregorian calendar to Julian calendar in write and from Julian to Proleptic " + | ||
| "Gregorian calendar in read. The rebasing is performed by converting micros/days to " + | ||
| "a local date/timestamp in the source calendar, interpreting the resulted date/" + | ||
| "timestamp in the target calendar, and getting the number of days/micros since " + | ||
| "the epoch 1970-01-01 00:00:00Z.") | ||
| .version("3.0.0") | ||
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| /** | ||
| * Holds information about keys that have been deprecated. | ||
| * | ||
|
|
@@ -3072,6 +3085,8 @@ class SQLConf extends Serializable with Logging { | |
|
|
||
| def integerGroupingIdEnabled: Boolean = getConf(SQLConf.LEGACY_INTEGER_GROUPING_ID) | ||
|
|
||
| def parquetRebaseDateTimeEnabled: Boolean = getConf(SQLConf.LEGACY_PARQUET_REBASE_DATETIME) | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| /** ********************** SQLConf functionality methods ************ */ | ||
|
|
||
| /** Set Spark SQL configuration properties. */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -136,6 +136,10 @@ class ParquetFileFormat | |
| s"Set Parquet option ${ParquetOutputFormat.JOB_SUMMARY_LEVEL} to NONE.") | ||
| } | ||
|
|
||
| conf.set( | ||
| SQLConf.LEGACY_PARQUET_REBASE_DATETIME.key, | ||
|
||
| sparkSession.sessionState.conf.parquetRebaseDateTimeEnabled.toString) | ||
|
|
||
| new OutputWriterFactory { | ||
| // This OutputWriterFactory instance is deserialized when writing Parquet files on the | ||
| // executor side without constructing or deserializing ParquetFileFormat. Therefore, we hold | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -119,14 +119,16 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd | |
| * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined | ||
| * types should have been expanded. | ||
| * @param convertTz the optional time zone to convert to for int96 data | ||
| * @param updater An updater which propagates converted field values to the parent container | ||
| * @param updater An updater which propagates converted field values to the parent container | ||
| * @param rebaseDateTime Enable rebasing date/timestamp from Julian to Proleptic Gregorian calendar | ||
| */ | ||
| private[parquet] class ParquetRowConverter( | ||
| schemaConverter: ParquetToSparkSchemaConverter, | ||
| parquetType: GroupType, | ||
| catalystType: StructType, | ||
| convertTz: Option[ZoneId], | ||
| updater: ParentContainerUpdater) | ||
| updater: ParentContainerUpdater, | ||
| rebaseDateTime: Boolean) | ||
| extends ParquetGroupConverter(updater) with Logging { | ||
|
|
||
| assert( | ||
|
|
@@ -263,16 +265,35 @@ private[parquet] class ParquetRowConverter( | |
| new ParquetStringConverter(updater) | ||
|
|
||
| case TimestampType if parquetType.getOriginalType == OriginalType.TIMESTAMP_MICROS => | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| updater.setLong(value) | ||
| if (rebaseDateTime) { | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| val rebased = DateTimeUtils.rebaseJulianToGregorianMicros(value) | ||
| updater.setLong(rebased) | ||
| } | ||
| } | ||
| } else { | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| updater.setLong(value) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| case TimestampType if parquetType.getOriginalType == OriginalType.TIMESTAMP_MILLIS => | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| updater.setLong(DateTimeUtils.millisToMicros(value)) | ||
| if (rebaseDateTime) { | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| val micros = DateTimeUtils.millisToMicros(value) | ||
| val rebased = DateTimeUtils.rebaseJulianToGregorianMicros(micros) | ||
| updater.setLong(rebased) | ||
| } | ||
| } | ||
| } else { | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addLong(value: Long): Unit = { | ||
| updater.setLong(DateTimeUtils.millisToMicros(value)) | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -296,6 +317,15 @@ private[parquet] class ParquetRowConverter( | |
| } | ||
| } | ||
|
|
||
| case DateType if rebaseDateTime => | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addInt(value: Int): Unit = { | ||
| val rebased = DateTimeUtils.rebaseJulianToGregorianDays(value) | ||
| // DateType is not specialized in `SpecificMutableRow`, have to box it here. | ||
| updater.set(rebased.asInstanceOf[DateType#InternalType]) | ||
|
||
| } | ||
| } | ||
|
|
||
| case DateType => | ||
| new ParquetPrimitiveConverter(updater) { | ||
| override def addInt(value: Int): Unit = { | ||
|
|
@@ -348,7 +378,7 @@ private[parquet] class ParquetRowConverter( | |
| } | ||
| } | ||
| new ParquetRowConverter( | ||
| schemaConverter, parquetType.asGroupType(), t, convertTz, wrappedUpdater) | ||
| schemaConverter, parquetType.asGroupType(), t, convertTz, wrappedUpdater, rebaseDateTime) | ||
|
|
||
| case t => | ||
| throw new RuntimeException( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shall we set timezone of it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is set to the default system time zone. If we set it to a particular time zone, the conversion will be incorrect.
Let me rename `utcCal` to `cal`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For example, if I set UTC, conversions in UTC are OK but not in PST:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(Default) Time zone should be involved in the conversion to avoid the problem of different time zone offsets returned by Java 7 and Java 8 APIs: