-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-35780][SQL] Support DATE/TIMESTAMP literals across the full range #32959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
457d3b6
633781f
b250bc7
5b4fe62
e94885c
e9558cb
7e7a81a
e0ff811
d9f8af4
9a07c14
9e96020
b383571
dda6136
18d7766
e146045
4091859
4043889
8d69c88
cd330a6
4723f8e
538463a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -224,12 +224,12 @@ object DateTimeUtils { | |
| * value. The return type is [[Option]] in order to distinguish between 0L and null. The following | ||
| * formats are allowed: | ||
| * | ||
| * `yyyy` | ||
| * `yyyy-[m]m` | ||
| * `yyyy-[m]m-[d]d` | ||
| * `yyyy-[m]m-[d]d ` | ||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `[+-]yyyy*` | ||
| * `[+-]yyyy*-[m]m` | ||
| * `[+-]yyyy*-[m]m-[d]d` | ||
| * `[+-]yyyy*-[m]m-[d]d ` | ||
| * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * | ||
|
|
@@ -249,17 +249,30 @@ object DateTimeUtils { | |
| * the input string can't be parsed as timestamp, the result timestamp segments are empty. | ||
| */ | ||
| def parseTimestampString(s: UTF8String): (Array[Int], Option[ZoneId], Boolean) = { | ||
| if (s == null) { | ||
| def isValidDigits(segment: Int, digits: Int): Boolean = { | ||
| // A Long is able to represent a timestamp within [+-]200 thousand years | ||
| val maxDigitsYear = 6 | ||
| // For the nanosecond part, more than 6 digits is allowed, but will be truncated. | ||
| segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || | ||
| (segment != 0 && segment != 6 && digits <= 2) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. segments except year are allowed to have 0 digits before this PR. so I didn't do zero checks for these segments. select cast('12::' as timestamp); -- output: 2021-07-07 12:00:00
select cast('T' as timestamp); -- output: 2021-07-07 00:00:00
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch, we should fail it. Let's do it in another PR. |
||
| } | ||
| if (s == null || s.trimAll().numBytes() == 0) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| var tz: Option[String] = None | ||
| val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0) | ||
| var i = 0 | ||
| var currentSegmentValue = 0 | ||
| var currentSegmentDigits = 0 | ||
| val bytes = s.trimAll().getBytes | ||
| var j = 0 | ||
| var digitsMilli = 0 | ||
| var justTime = false | ||
| var yearSign: Option[Int] = None | ||
| if (bytes(j) == '-' || bytes(j) == '+') { | ||
| yearSign = if (bytes(j) == '-') Some(-1) else Some(1) | ||
| j += 1 | ||
| } | ||
| while (j < bytes.length) { | ||
| val b = bytes(j) | ||
| val parsedValue = b - '0'.toByte | ||
|
|
@@ -269,50 +282,74 @@ object DateTimeUtils { | |
| i += 3 | ||
| } else if (i < 2) { | ||
| if (b == '-') { | ||
| if (i == 0 && j != 4) { | ||
cloud-fan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // year should have exact four digits | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else if (i == 0 && b == ':') { | ||
| } else if (i == 0 && b == ':' && yearSign.isEmpty) { | ||
| justTime = true | ||
| if (!isValidDigits(3, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(3) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i = 4 | ||
| } else { | ||
| return (Array.empty, None, false) | ||
| } | ||
| } else if (i == 2) { | ||
| if (b == ' ' || b == 'T') { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else { | ||
| return (Array.empty, None, false) | ||
| } | ||
| } else if (i == 3 || i == 4) { | ||
| if (b == ':') { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else { | ||
| return (Array.empty, None, false) | ||
| } | ||
| } else if (i == 5 || i == 6) { | ||
| if (b == '-' || b == '+') { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| tz = Some(new String(bytes, j, 1)) | ||
| } else if (b == '.' && i == 5) { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| tz = Some(new String(bytes, j, bytes.length - j)) | ||
| j = bytes.length - 1 | ||
|
|
@@ -322,8 +359,12 @@ object DateTimeUtils { | |
| } | ||
| } else { | ||
| if (i < segments.length && (b == ':' || b == ' ')) { | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else { | ||
| return (Array.empty, None, false) | ||
|
|
@@ -333,61 +374,40 @@ object DateTimeUtils { | |
| if (i == 6) { | ||
| digitsMilli += 1 | ||
| } | ||
| currentSegmentValue = currentSegmentValue * 10 + parsedValue | ||
| // We will truncate the nanosecond part if there are more than 6 digits, which results | ||
| // in loss of precision | ||
| if (i != 6 || currentSegmentDigits < 6) { | ||
| currentSegmentValue = currentSegmentValue * 10 + parsedValue | ||
| } | ||
| currentSegmentDigits += 1 | ||
| } | ||
| j += 1 | ||
| } | ||
|
|
||
| segments(i) = currentSegmentValue | ||
| if (!justTime && i == 0 && j != 4) { | ||
| // year should have exact four digits | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return (Array.empty, None, false) | ||
| } | ||
| segments(i) = currentSegmentValue | ||
|
|
||
| while (digitsMilli < 6) { | ||
| segments(6) *= 10 | ||
| digitsMilli += 1 | ||
| } | ||
|
|
||
| // We are truncating the nanosecond part, which results in loss of precision | ||
| while (digitsMilli > 6) { | ||
| segments(6) /= 10 | ||
| digitsMilli -= 1 | ||
| } | ||
| // This step also validates time zone part | ||
| val zoneId = tz.map { | ||
| case "+" => ZoneOffset.ofHoursMinutes(segments(7), segments(8)) | ||
| case "-" => ZoneOffset.ofHoursMinutes(-segments(7), -segments(8)) | ||
| case zoneName: String => getZoneId(zoneName.trim) | ||
| } | ||
| segments(0) *= yearSign.getOrElse(1) | ||
| (segments, zoneId, justTime) | ||
| } | ||
|
|
||
| /** | ||
| * Trims and parses a given UTF8 timestamp string to the corresponding a corresponding [[Long]] | ||
| * value. The return type is [[Option]] in order to distinguish between 0L and null. The following | ||
| * formats are allowed: | ||
| * | ||
| * `yyyy` | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we should just say "Please refer to parseTimestampString for the allowed formats." here
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| * `yyyy-[m]m` | ||
| * `yyyy-[m]m-[d]d` | ||
| * `yyyy-[m]m-[d]d ` | ||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * | ||
| * where `zone_id` should have one of the forms: | ||
| * - Z - Zulu time zone UTC+0 | ||
| * - +|-[h]h:[m]m | ||
| * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS | ||
| * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, | ||
| * and a suffix in the formats: | ||
| * - +|-h[h] | ||
| * - +|-hh[:]mm | ||
| * - +|-hh:mm:ss | ||
| * - +|-hhmmss | ||
| * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` | ||
| * value. The return type is [[Option]] in order to distinguish between 0L and null. Please | ||
| * refer to `parseTimestampString` for the allowed formats | ||
| */ | ||
| def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId): Option[Long] = { | ||
| try { | ||
|
|
@@ -422,30 +442,8 @@ object DateTimeUtils { | |
| * Trims and parses a given UTF8 string to a corresponding [[Long]] value which representing the | ||
| * number of microseconds since the epoch. The result is independent of time zones, | ||
| * which means that zone ID in the input string will be ignored. | ||
| * The return type is [[Option]] in order to distinguish between 0L and null. The following | ||
| * formats are allowed: | ||
| * | ||
| * `yyyy` | ||
| * `yyyy-[m]m` | ||
| * `yyyy-[m]m-[d]d` | ||
| * `yyyy-[m]m-[d]d ` | ||
| * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]` | ||
| * | ||
| * where `zone_id` should have one of the forms: | ||
| * - Z - Zulu time zone UTC+0 | ||
| * - +|-[h]h:[m]m | ||
| * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS | ||
| * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-, | ||
| * and a suffix in the formats: | ||
| * - +|-h[h] | ||
| * - +|-hh[:]mm | ||
| * - +|-hh:mm:ss | ||
| * - +|-hhmmss | ||
| * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris` | ||
| * | ||
| * Note: The input string has to contains year/month/day fields, otherwise Spark can't determine | ||
| * the value of timestamp without time zone. | ||
| * The return type is [[Option]] in order to distinguish between 0L and null. Please | ||
| * refer to `parseTimestampString` for the allowed formats. | ||
| */ | ||
| def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = { | ||
| try { | ||
|
|
@@ -518,44 +516,55 @@ object DateTimeUtils { | |
| * The return type is [[Option]] in order to distinguish between 0 and null. The following | ||
| * formats are allowed: | ||
| * | ||
| * `yyyy` | ||
| * `yyyy-[m]m` | ||
| * `yyyy-[m]m-[d]d` | ||
| * `yyyy-[m]m-[d]d ` | ||
| * `yyyy-[m]m-[d]d *` | ||
| * `yyyy-[m]m-[d]dT*` | ||
| * `[+-]yyyy*` | ||
| * `[+-]yyyy*-[m]m` | ||
| * `[+-]yyyy*-[m]m-[d]d` | ||
| * `[+-]yyyy*-[m]m-[d]d ` | ||
| * `[+-]yyyy*-[m]m-[d]d *` | ||
| * `[+-]yyyy*-[m]m-[d]dT*` | ||
| */ | ||
| def stringToDate(s: UTF8String): Option[Int] = { | ||
| if (s == null) { | ||
| def isValidDigits(segment: Int, digits: Int): Boolean = { | ||
| // An integer is able to represent a date within [+-]5 million years. | ||
| var maxDigitsYear = 7 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can I implement a configuration item that configures the range of digits allowed for the year? I found that it was writing to tables in different formats and the results would behave differently. create table t(c1 date) stored as textfile;
insert overwrite table t select cast( '22022-05-01' as date);
select * from t1; -- output nullcreate table t(c1 date) stored as orcfile;
insert overwrite table t select cast( '22022-05-01' as date);
select * from t1; -- output +22022-05-01Because orc/parquet date stores integers, but textfile and sequencefile store text. But if you use hive jdbc, the query will fail, because
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's expected that not all the data sources and BI clients support datetime values larger than It looks to me that the Hive table should fail to write
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW, I don't think it's possible to add a Spark config to forbid large datetime values. The literal is just one place, there are many other datetime operations that may produce large datetime values, which have been there before this PR.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for your explanation, make sense. There may be some dates that were treated as abnormal by users in previous Spark versions, and can be handled normally in Spark 3.2, although they are normal dates.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea the impact on BI clients was missed, though strictly speaking BI clients are not part of Spark. |
||
| (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2) | ||
| } | ||
| if (s == null || s.trimAll().numBytes() == 0) { | ||
| return None | ||
| } | ||
| val segments: Array[Int] = Array[Int](1, 1, 1) | ||
| var sign = 1 | ||
| var i = 0 | ||
| var currentSegmentValue = 0 | ||
| var currentSegmentDigits = 0 | ||
| val bytes = s.trimAll().getBytes | ||
| var j = 0 | ||
| if (bytes(j) == '-' || bytes(j) == '+') { | ||
| sign = if (bytes(j) == '-') -1 else 1 | ||
| j += 1 | ||
| } | ||
| while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) { | ||
| val b = bytes(j) | ||
| if (i < 2 && b == '-') { | ||
| if (i == 0 && j != 4) { | ||
| // year should have exact four digits | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return None | ||
| } | ||
| segments(i) = currentSegmentValue | ||
| currentSegmentValue = 0 | ||
| currentSegmentDigits = 0 | ||
| i += 1 | ||
| } else { | ||
| val parsedValue = b - '0'.toByte | ||
| if (parsedValue < 0 || parsedValue > 9) { | ||
| return None | ||
| } else { | ||
| currentSegmentValue = currentSegmentValue * 10 + parsedValue | ||
| currentSegmentDigits += 1 | ||
| } | ||
| } | ||
| j += 1 | ||
| } | ||
| if (i == 0 && j != 4) { | ||
| // year should have exact four digits | ||
| if (!isValidDigits(i, currentSegmentDigits)) { | ||
| return None | ||
| } | ||
| if (i < 2 && j < bytes.length) { | ||
|
|
@@ -564,7 +573,7 @@ object DateTimeUtils { | |
| } | ||
| segments(i) = currentSegmentValue | ||
| try { | ||
| val localDate = LocalDate.of(segments(0), segments(1), segments(2)) | ||
| val localDate = LocalDate.of(sign * segments(0), segments(1), segments(2)) | ||
| Some(localDateToDays(localDate)) | ||
| } catch { | ||
| case NonFatal(_) => None | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -392,8 +392,6 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase { | |
| s"Cannot cast $str to DateType.") | ||
| } | ||
|
|
||
| checkCastWithParseError("12345") | ||
| checkCastWithParseError("12345-12-18") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's move the removed tests to the base cast suite
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| checkCastWithParseError("2015-13-18") | ||
| checkCastWithParseError("2015-03-128") | ||
| checkCastWithParseError("2015/03/18") | ||
|
|
||

Uh oh!
There was an error while loading. Please reload this page.