Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ object DateTimeUtils {
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
Expand All @@ -249,17 +249,30 @@ object DateTimeUtils {
* the input string can't be parsed as timestamp, the result timestamp segments are empty.
*/
def parseTimestampString(s: UTF8String): (Array[Int], Option[ZoneId], Boolean) = {
if (s == null) {
def isValidDigits(segment: Int, digits: Int): Boolean = {
// A Long is able to represent a timestamp within [+-]200 thousand years
val maxDigitsYear = 6
// For the nanosecond part, more than 6 digits is allowed, but will be truncated.
segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
(segment != 0 && segment != 6 && digits <= 2)
Copy link
Contributor Author

@linhongliu-db linhongliu-db Jul 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

segments except year are allowed to have 0 digits before this PR. so I didn't do zero checks for these segments.
for example, before and after this PR, the below query is valid:

select cast('12::' as timestamp); -- output: 2021-07-07 12:00:00
select cast('T' as timestamp); -- output: 2021-07-07 00:00:00

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch, we should fail it. Let's do it in another PR.

}
if (s == null || s.trimAll().numBytes() == 0) {
return (Array.empty, None, false)
}
var tz: Option[String] = None
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
var i = 0
var currentSegmentValue = 0
var currentSegmentDigits = 0
val bytes = s.trimAll().getBytes
var j = 0
var digitsMilli = 0
var justTime = false
var yearSign: Option[Int] = None
if (bytes(j) == '-' || bytes(j) == '+') {
yearSign = if (bytes(j) == '-') Some(-1) else Some(1)
j += 1
}
while (j < bytes.length) {
val b = bytes(j)
val parsedValue = b - '0'.toByte
Expand All @@ -269,50 +282,74 @@ object DateTimeUtils {
i += 3
} else if (i < 2) {
if (b == '-') {
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else if (i == 0 && b == ':') {
} else if (i == 0 && b == ':' && yearSign.isEmpty) {
justTime = true
if (!isValidDigits(3, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(3) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i = 4
} else {
return (Array.empty, None, false)
}
} else if (i == 2) {
if (b == ' ' || b == 'T') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
}
} else if (i == 3 || i == 4) {
if (b == ':') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
}
} else if (i == 5 || i == 6) {
if (b == '-' || b == '+') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
tz = Some(new String(bytes, j, 1))
} else if (b == '.' && i == 5) {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
tz = Some(new String(bytes, j, bytes.length - j))
j = bytes.length - 1
Expand All @@ -322,8 +359,12 @@ object DateTimeUtils {
}
} else {
if (i < segments.length && (b == ':' || b == ' ')) {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
Expand All @@ -333,61 +374,40 @@ object DateTimeUtils {
if (i == 6) {
digitsMilli += 1
}
currentSegmentValue = currentSegmentValue * 10 + parsedValue
// We will truncate the nanosecond part if there are more than 6 digits, which results
// in loss of precision
if (i != 6 || currentSegmentDigits < 6) {
currentSegmentValue = currentSegmentValue * 10 + parsedValue
}
currentSegmentDigits += 1
}
j += 1
}

segments(i) = currentSegmentValue
if (!justTime && i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue

while (digitsMilli < 6) {
segments(6) *= 10
digitsMilli += 1
}

// We are truncating the nanosecond part, which results in loss of precision
while (digitsMilli > 6) {
segments(6) /= 10
digitsMilli -= 1
}
// This step also validates time zone part
val zoneId = tz.map {
case "+" => ZoneOffset.ofHoursMinutes(segments(7), segments(8))
case "-" => ZoneOffset.ofHoursMinutes(-segments(7), -segments(8))
case zoneName: String => getZoneId(zoneName.trim)
}
segments(0) *= yearSign.getOrElse(1)
(segments, zoneId, justTime)
}

/**
* Trims and parses a given UTF8 timestamp string to the corresponding a corresponding [[Long]]
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should just say "Please refer to parseTimestampString for the allowed formats." here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
* where `zone_id` should have one of the forms:
* - Z - Zulu time zone UTC+0
* - +|-[h]h:[m]m
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
* and a suffix in the formats:
* - +|-h[h]
* - +|-hh[:]mm
* - +|-hh:mm:ss
* - +|-hhmmss
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
* value. The return type is [[Option]] in order to distinguish between 0L and null. Please
* refer to `parseTimestampString` for the allowed formats
*/
def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId): Option[Long] = {
try {
Expand Down Expand Up @@ -422,30 +442,8 @@ object DateTimeUtils {
* Trims and parses a given UTF8 string to a corresponding [[Long]] value which representing the
* number of microseconds since the epoch. The result is independent of time zones,
* which means that zone ID in the input string will be ignored.
* The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
* where `zone_id` should have one of the forms:
* - Z - Zulu time zone UTC+0
* - +|-[h]h:[m]m
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
* and a suffix in the formats:
* - +|-h[h]
* - +|-hh[:]mm
* - +|-hh:mm:ss
* - +|-hhmmss
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
*
* Note: The input string has to contains year/month/day fields, otherwise Spark can't determine
* the value of timestamp without time zone.
* The return type is [[Option]] in order to distinguish between 0L and null. Please
* refer to `parseTimestampString` for the allowed formats.
*/
def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = {
try {
Expand Down Expand Up @@ -518,44 +516,55 @@ object DateTimeUtils {
* The return type is [[Option]] in order to distinguish between 0 and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d *`
* `yyyy-[m]m-[d]dT*`
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d *`
* `[+-]yyyy*-[m]m-[d]dT*`
*/
def stringToDate(s: UTF8String): Option[Int] = {
if (s == null) {
def isValidDigits(segment: Int, digits: Int): Boolean = {
// An integer is able to represent a date within [+-]5 million years.
var maxDigitsYear = 7
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can I implement a configuration item that configures the range of digits allowed for the year?

I found that it was writing to tables in different formats and the results would behave differently.

create table t(c1 date) stored as textfile;
insert overwrite table t select cast( '22022-05-01' as date);
select * from t1; -- output null
create table t(c1 date) stored as orcfile;
insert overwrite table t select cast( '22022-05-01' as date);
select * from t1; -- output +22022-05-01

Because orc/parquet date stores integers, but textfile and sequencefile store text.

image

But if you use hive jdbc, the query will fail, because java.sql.Date only supports 4-digit years.

Caused by: java.lang.IllegalArgumentException
  at java.sql.Date.valueOf(Date.java:143)
  at org.apache.hive.jdbc.HiveBaseResultSet.evaluate(HiveBaseResultSet.java:447

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's expected that not all the data sources and BI clients support datetime values larger than 10000-01-01, the question is when the failure should happen.

It looks to me that the Hive table should fail to write 22022-05-01 with textfile source, and the hive jdbc should fail at the client-side saying 22022-05-01 is not supported.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I don't think it's possible to add a Spark config to forbid large datetime values. The literal is just one place, there are many other datetime operations that may produce large datetime values, which have been there before this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your explanation, make sense.

There may be some dates that were treated as abnormal by users in previous Spark versions, and can be handled normally in Spark 3.2, although they are normal dates.
Because I didn't see this behavior change in the migration guide before noticing this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea the impact on BI clients was missed, though strictly speaking BI clients are not part of Spark.

(segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2)
}
if (s == null || s.trimAll().numBytes() == 0) {
return None
}
val segments: Array[Int] = Array[Int](1, 1, 1)
var sign = 1
var i = 0
var currentSegmentValue = 0
var currentSegmentDigits = 0
val bytes = s.trimAll().getBytes
var j = 0
if (bytes(j) == '-' || bytes(j) == '+') {
sign = if (bytes(j) == '-') -1 else 1
j += 1
}
while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) {
val b = bytes(j)
if (i < 2 && b == '-') {
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return None
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
val parsedValue = b - '0'.toByte
if (parsedValue < 0 || parsedValue > 9) {
return None
} else {
currentSegmentValue = currentSegmentValue * 10 + parsedValue
currentSegmentDigits += 1
}
}
j += 1
}
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return None
}
if (i < 2 && j < bytes.length) {
Expand All @@ -564,7 +573,7 @@ object DateTimeUtils {
}
segments(i) = currentSegmentValue
try {
val localDate = LocalDate.of(segments(0), segments(1), segments(2))
val localDate = LocalDate.of(sign * segments(0), segments(1), segments(2))
Some(localDateToDays(localDate))
} catch {
case NonFatal(_) => None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,6 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
s"Cannot cast $str to DateType.")
}

checkCastWithParseError("12345")
checkCastWithParseError("12345-12-18")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's move the removed tests to the base cast suite

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

checkCastWithParseError("2015-13-18")
checkCastWithParseError("2015-03-128")
checkCastWithParseError("2015/03/18")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {

test("cast string to date") {
var c = Calendar.getInstance()
c.set(12345, 0, 1, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("12345"), DateType), new Date(c.getTimeInMillis))
c.set(12345, 11, 18, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("12345-12-18"), DateType), new Date(c.getTimeInMillis))
c.set(2015, 0, 1, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("2015"), DateType), new Date(c.getTimeInMillis))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,11 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// before epoch
checkHiveHashForDateType("1800-01-01", -62091)

// negative year
checkHiveHashForDateType("-1212-01-01", -1162202)

// Invalid input: bad date string. Hive returns 0 for such cases
intercept[NoSuchElementException](checkHiveHashForDateType("0-0-0", 0))
intercept[NoSuchElementException](checkHiveHashForDateType("-1212-01-01", 0))
intercept[NoSuchElementException](checkHiveHashForDateType("2016-99-99", 0))

// Invalid input: Empty string. Hive returns 0 for this case
Expand Down
Loading