Skip to content

Commit c9813f7

Browse files
linhongliu-dbcloud-fan
authored and committed
[SPARK-35780][SQL] Support DATE/TIMESTAMP literals across the full range
### What changes were proposed in this pull request?

DATE/TIMESTAMP literals currently support only years 0000 to 9999. However, internally we support a range that is much larger: we can add or subtract large intervals from a date/timestamp, and the system will happily process and display large negative and positive dates. Since we obviously cannot put this genie back into the bottle, the only thing we can do is allow matching DATE/TIMESTAMP literals.

### Why are the changes needed?

To make Spark more usable and to fix a bug.

### Does this PR introduce _any_ user-facing change?

Yes. After this PR, the SQL statements below return different results:

```sql
select cast('-10000-1-2' as date) as date_col
-- before PR: NULL
-- after PR: -10000-1-2
```

```sql
select cast('2021-4294967297-11' as date) as date_col
-- before PR: 2021-01-11
-- after PR: NULL
```

### How was this patch tested?

Newly added test cases.

Closes #32959 from linhongliu-db/SPARK-35780-full-range-datetime.

Lead-authored-by: Linhong Liu <[email protected]>
Co-authored-by: Linhong Liu <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit b866457)
Signed-off-by: Wenchen Fan <[email protected]>
1 parent dcee7a6 commit c9813f7

File tree

14 files changed

+548
-116
lines changed

14 files changed

+548
-116
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala

Lines changed: 87 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,12 @@ object DateTimeUtils {
224224
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
225225
* formats are allowed:
226226
*
227-
* `yyyy`
228-
* `yyyy-[m]m`
229-
* `yyyy-[m]m-[d]d`
230-
* `yyyy-[m]m-[d]d `
231-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
232-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
227+
* `[+-]yyyy*`
228+
* `[+-]yyyy*-[m]m`
229+
* `[+-]yyyy*-[m]m-[d]d`
230+
* `[+-]yyyy*-[m]m-[d]d `
231+
* `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
232+
* `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
233233
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
234234
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
235235
*
@@ -249,17 +249,30 @@ object DateTimeUtils {
249249
* the input string can't be parsed as timestamp, the result timestamp segments are empty.
250250
*/
251251
def parseTimestampString(s: UTF8String): (Array[Int], Option[ZoneId], Boolean) = {
252-
if (s == null) {
252+
def isValidDigits(segment: Int, digits: Int): Boolean = {
253+
// A Long is able to represent a timestamp within [+-]200 thousand years
254+
val maxDigitsYear = 6
255+
// For the fractional-second (nanosecond) part, more than 6 digits are allowed, but the extra digits are truncated.
256+
segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
257+
(segment != 0 && segment != 6 && digits <= 2)
258+
}
259+
if (s == null || s.trimAll().numBytes() == 0) {
253260
return (Array.empty, None, false)
254261
}
255262
var tz: Option[String] = None
256263
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
257264
var i = 0
258265
var currentSegmentValue = 0
266+
var currentSegmentDigits = 0
259267
val bytes = s.trimAll().getBytes
260268
var j = 0
261269
var digitsMilli = 0
262270
var justTime = false
271+
var yearSign: Option[Int] = None
272+
if (bytes(j) == '-' || bytes(j) == '+') {
273+
yearSign = if (bytes(j) == '-') Some(-1) else Some(1)
274+
j += 1
275+
}
263276
while (j < bytes.length) {
264277
val b = bytes(j)
265278
val parsedValue = b - '0'.toByte
@@ -269,50 +282,74 @@ object DateTimeUtils {
269282
i += 3
270283
} else if (i < 2) {
271284
if (b == '-') {
272-
if (i == 0 && j != 4) {
273-
// year should have exact four digits
285+
if (!isValidDigits(i, currentSegmentDigits)) {
274286
return (Array.empty, None, false)
275287
}
276288
segments(i) = currentSegmentValue
277289
currentSegmentValue = 0
290+
currentSegmentDigits = 0
278291
i += 1
279-
} else if (i == 0 && b == ':') {
292+
} else if (i == 0 && b == ':' && yearSign.isEmpty) {
280293
justTime = true
294+
if (!isValidDigits(3, currentSegmentDigits)) {
295+
return (Array.empty, None, false)
296+
}
281297
segments(3) = currentSegmentValue
282298
currentSegmentValue = 0
299+
currentSegmentDigits = 0
283300
i = 4
284301
} else {
285302
return (Array.empty, None, false)
286303
}
287304
} else if (i == 2) {
288305
if (b == ' ' || b == 'T') {
306+
if (!isValidDigits(i, currentSegmentDigits)) {
307+
return (Array.empty, None, false)
308+
}
289309
segments(i) = currentSegmentValue
290310
currentSegmentValue = 0
311+
currentSegmentDigits = 0
291312
i += 1
292313
} else {
293314
return (Array.empty, None, false)
294315
}
295316
} else if (i == 3 || i == 4) {
296317
if (b == ':') {
318+
if (!isValidDigits(i, currentSegmentDigits)) {
319+
return (Array.empty, None, false)
320+
}
297321
segments(i) = currentSegmentValue
298322
currentSegmentValue = 0
323+
currentSegmentDigits = 0
299324
i += 1
300325
} else {
301326
return (Array.empty, None, false)
302327
}
303328
} else if (i == 5 || i == 6) {
304329
if (b == '-' || b == '+') {
330+
if (!isValidDigits(i, currentSegmentDigits)) {
331+
return (Array.empty, None, false)
332+
}
305333
segments(i) = currentSegmentValue
306334
currentSegmentValue = 0
335+
currentSegmentDigits = 0
307336
i += 1
308337
tz = Some(new String(bytes, j, 1))
309338
} else if (b == '.' && i == 5) {
339+
if (!isValidDigits(i, currentSegmentDigits)) {
340+
return (Array.empty, None, false)
341+
}
310342
segments(i) = currentSegmentValue
311343
currentSegmentValue = 0
344+
currentSegmentDigits = 0
312345
i += 1
313346
} else {
347+
if (!isValidDigits(i, currentSegmentDigits)) {
348+
return (Array.empty, None, false)
349+
}
314350
segments(i) = currentSegmentValue
315351
currentSegmentValue = 0
352+
currentSegmentDigits = 0
316353
i += 1
317354
tz = Some(new String(bytes, j, bytes.length - j))
318355
j = bytes.length - 1
@@ -322,8 +359,12 @@ object DateTimeUtils {
322359
}
323360
} else {
324361
if (i < segments.length && (b == ':' || b == ' ')) {
362+
if (!isValidDigits(i, currentSegmentDigits)) {
363+
return (Array.empty, None, false)
364+
}
325365
segments(i) = currentSegmentValue
326366
currentSegmentValue = 0
367+
currentSegmentDigits = 0
327368
i += 1
328369
} else {
329370
return (Array.empty, None, false)
@@ -333,61 +374,40 @@ object DateTimeUtils {
333374
if (i == 6) {
334375
digitsMilli += 1
335376
}
336-
currentSegmentValue = currentSegmentValue * 10 + parsedValue
377+
// We will truncate the nanosecond part if there are more than 6 digits, which results
378+
// in loss of precision
379+
if (i != 6 || currentSegmentDigits < 6) {
380+
currentSegmentValue = currentSegmentValue * 10 + parsedValue
381+
}
382+
currentSegmentDigits += 1
337383
}
338384
j += 1
339385
}
340386

341-
segments(i) = currentSegmentValue
342-
if (!justTime && i == 0 && j != 4) {
343-
// year should have exact four digits
387+
if (!isValidDigits(i, currentSegmentDigits)) {
344388
return (Array.empty, None, false)
345389
}
390+
segments(i) = currentSegmentValue
346391

347392
while (digitsMilli < 6) {
348393
segments(6) *= 10
349394
digitsMilli += 1
350395
}
351396

352-
// We are truncating the nanosecond part, which results in loss of precision
353-
while (digitsMilli > 6) {
354-
segments(6) /= 10
355-
digitsMilli -= 1
356-
}
357397
// This step also validates time zone part
358398
val zoneId = tz.map {
359399
case "+" => ZoneOffset.ofHoursMinutes(segments(7), segments(8))
360400
case "-" => ZoneOffset.ofHoursMinutes(-segments(7), -segments(8))
361401
case zoneName: String => getZoneId(zoneName.trim)
362402
}
403+
segments(0) *= yearSign.getOrElse(1)
363404
(segments, zoneId, justTime)
364405
}
365406

366407
/**
367408
* Trims and parses a given UTF8 timestamp string to a corresponding [[Long]]
368-
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
369-
* formats are allowed:
370-
*
371-
* `yyyy`
372-
* `yyyy-[m]m`
373-
* `yyyy-[m]m-[d]d`
374-
* `yyyy-[m]m-[d]d `
375-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
376-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
377-
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
378-
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
379-
*
380-
* where `zone_id` should have one of the forms:
381-
* - Z - Zulu time zone UTC+0
382-
* - +|-[h]h:[m]m
383-
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
384-
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
385-
* and a suffix in the formats:
386-
* - +|-h[h]
387-
* - +|-hh[:]mm
388-
* - +|-hh:mm:ss
389-
* - +|-hhmmss
390-
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
409+
* value. The return type is [[Option]] in order to distinguish between 0L and null. Please
410+
* refer to `parseTimestampString` for the allowed formats.
391411
*/
392412
def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId): Option[Long] = {
393413
try {
@@ -422,30 +442,8 @@ object DateTimeUtils {
422442
* Trims and parses a given UTF8 string to a corresponding [[Long]] value which represents the
423443
* number of microseconds since the epoch. The result is independent of time zones,
424444
* which means that zone ID in the input string will be ignored.
425-
* The return type is [[Option]] in order to distinguish between 0L and null. The following
426-
* formats are allowed:
427-
*
428-
* `yyyy`
429-
* `yyyy-[m]m`
430-
* `yyyy-[m]m-[d]d`
431-
* `yyyy-[m]m-[d]d `
432-
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
433-
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
434-
*
435-
* where `zone_id` should have one of the forms:
436-
* - Z - Zulu time zone UTC+0
437-
* - +|-[h]h:[m]m
438-
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
439-
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
440-
* and a suffix in the formats:
441-
* - +|-h[h]
442-
* - +|-hh[:]mm
443-
* - +|-hh:mm:ss
444-
* - +|-hhmmss
445-
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
446-
*
447-
* Note: The input string has to contains year/month/day fields, otherwise Spark can't determine
448-
* the value of timestamp without time zone.
445+
* The return type is [[Option]] in order to distinguish between 0L and null. Please
446+
* refer to `parseTimestampString` for the allowed formats.
449447
*/
450448
def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = {
451449
try {
@@ -518,44 +516,55 @@ object DateTimeUtils {
518516
* The return type is [[Option]] in order to distinguish between 0 and null. The following
519517
* formats are allowed:
520518
*
521-
* `yyyy`
522-
* `yyyy-[m]m`
523-
* `yyyy-[m]m-[d]d`
524-
* `yyyy-[m]m-[d]d `
525-
* `yyyy-[m]m-[d]d *`
526-
* `yyyy-[m]m-[d]dT*`
519+
* `[+-]yyyy*`
520+
* `[+-]yyyy*-[m]m`
521+
* `[+-]yyyy*-[m]m-[d]d`
522+
* `[+-]yyyy*-[m]m-[d]d `
523+
* `[+-]yyyy*-[m]m-[d]d *`
524+
* `[+-]yyyy*-[m]m-[d]dT*`
527525
*/
528526
def stringToDate(s: UTF8String): Option[Int] = {
529-
if (s == null) {
527+
def isValidDigits(segment: Int, digits: Int): Boolean = {
528+
// An integer is able to represent a date within [+-]5 million years.
529+
var maxDigitsYear = 7
530+
(segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2)
531+
}
532+
if (s == null || s.trimAll().numBytes() == 0) {
530533
return None
531534
}
532535
val segments: Array[Int] = Array[Int](1, 1, 1)
536+
var sign = 1
533537
var i = 0
534538
var currentSegmentValue = 0
539+
var currentSegmentDigits = 0
535540
val bytes = s.trimAll().getBytes
536541
var j = 0
542+
if (bytes(j) == '-' || bytes(j) == '+') {
543+
sign = if (bytes(j) == '-') -1 else 1
544+
j += 1
545+
}
537546
while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) {
538547
val b = bytes(j)
539548
if (i < 2 && b == '-') {
540-
if (i == 0 && j != 4) {
541-
// year should have exact four digits
549+
if (!isValidDigits(i, currentSegmentDigits)) {
542550
return None
543551
}
544552
segments(i) = currentSegmentValue
545553
currentSegmentValue = 0
554+
currentSegmentDigits = 0
546555
i += 1
547556
} else {
548557
val parsedValue = b - '0'.toByte
549558
if (parsedValue < 0 || parsedValue > 9) {
550559
return None
551560
} else {
552561
currentSegmentValue = currentSegmentValue * 10 + parsedValue
562+
currentSegmentDigits += 1
553563
}
554564
}
555565
j += 1
556566
}
557-
if (i == 0 && j != 4) {
558-
// year should have exact four digits
567+
if (!isValidDigits(i, currentSegmentDigits)) {
559568
return None
560569
}
561570
if (i < 2 && j < bytes.length) {
@@ -564,7 +573,7 @@ object DateTimeUtils {
564573
}
565574
segments(i) = currentSegmentValue
566575
try {
567-
val localDate = LocalDate.of(segments(0), segments(1), segments(2))
576+
val localDate = LocalDate.of(sign * segments(0), segments(1), segments(2))
568577
Some(localDateToDays(localDate))
569578
} catch {
570579
case NonFatal(_) => None

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/AnsiCastSuiteBase.scala

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -392,8 +392,6 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
392392
s"Cannot cast $str to DateType.")
393393
}
394394

395-
checkCastWithParseError("12345")
396-
checkCastWithParseError("12345-12-18")
397395
checkCastWithParseError("2015-13-18")
398396
checkCastWithParseError("2015-03-128")
399397
checkCastWithParseError("2015/03/18")

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuiteBase.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,12 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {
107107

108108
test("cast string to date") {
109109
var c = Calendar.getInstance()
110+
c.set(12345, 0, 1, 0, 0, 0)
111+
c.set(Calendar.MILLISECOND, 0)
112+
checkEvaluation(Cast(Literal("12345"), DateType), new Date(c.getTimeInMillis))
113+
c.set(12345, 11, 18, 0, 0, 0)
114+
c.set(Calendar.MILLISECOND, 0)
115+
checkEvaluation(Cast(Literal("12345-12-18"), DateType), new Date(c.getTimeInMillis))
110116
c.set(2015, 0, 1, 0, 0, 0)
111117
c.set(Calendar.MILLISECOND, 0)
112118
checkEvaluation(Cast(Literal("2015"), DateType), new Date(c.getTimeInMillis))

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,11 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
194194
// before epoch
195195
checkHiveHashForDateType("1800-01-01", -62091)
196196

197+
// negative year
198+
checkHiveHashForDateType("-1212-01-01", -1162202)
199+
197200
// Invalid input: bad date string. Hive returns 0 for such cases
198201
intercept[NoSuchElementException](checkHiveHashForDateType("0-0-0", 0))
199-
intercept[NoSuchElementException](checkHiveHashForDateType("-1212-01-01", 0))
200202
intercept[NoSuchElementException](checkHiveHashForDateType("2016-99-99", 0))
201203

202204
// Invalid input: Empty string. Hive returns 0 for this case

0 commit comments

Comments
 (0)