@@ -224,12 +224,12 @@ object DateTimeUtils {
224224 * value. The return type is [[Option ]] in order to distinguish between 0L and null. The following
225225 * formats are allowed:
226226 *
227- * `yyyy`
228- * `yyyy-[m]m`
229- * `yyyy-[m]m-[d]d`
230- * `yyyy-[m]m-[d]d `
231- * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
232- * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
227+ * `[+-]yyyy*`
228+ * `[+-]yyyy*-[m]m`
229+ * `[+-]yyyy*-[m]m-[d]d`
230+ * `[+-]yyyy*-[m]m-[d]d `
231+ * `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
232+ * `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
233233 * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
234234 * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
235235 *
@@ -249,17 +249,30 @@ object DateTimeUtils {
249249 * the input string can't be parsed as timestamp, the result timestamp segments are empty.
250250 */
251251 def parseTimestampString (s : UTF8String ): (Array [Int ], Option [ZoneId ], Boolean ) = {
252- if (s == null ) {
252+ def isValidDigits (segment : Int , digits : Int ): Boolean = {
253+ // A Long is able to represent a timestamp within [+-]200 thousand years
254+ val maxDigitsYear = 6
255+ // For the nanosecond part, more than 6 digits is allowed, but will be truncated.
256+ segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
257+ (segment != 0 && segment != 6 && digits <= 2 )
258+ }
259+ if (s == null || s.trimAll().numBytes() == 0 ) {
253260 return (Array .empty, None , false )
254261 }
255262 var tz : Option [String ] = None
256263 val segments : Array [Int ] = Array [Int ](1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 )
257264 var i = 0
258265 var currentSegmentValue = 0
266+ var currentSegmentDigits = 0
259267 val bytes = s.trimAll().getBytes
260268 var j = 0
261269 var digitsMilli = 0
262270 var justTime = false
271+ var yearSign : Option [Int ] = None
272+ if (bytes(j) == '-' || bytes(j) == '+' ) {
273+ yearSign = if (bytes(j) == '-' ) Some (- 1 ) else Some (1 )
274+ j += 1
275+ }
263276 while (j < bytes.length) {
264277 val b = bytes(j)
265278 val parsedValue = b - '0' .toByte
@@ -269,50 +282,74 @@ object DateTimeUtils {
269282 i += 3
270283 } else if (i < 2 ) {
271284 if (b == '-' ) {
272- if (i == 0 && j != 4 ) {
273- // year should have exact four digits
285+ if (! isValidDigits(i, currentSegmentDigits)) {
274286 return (Array .empty, None , false )
275287 }
276288 segments(i) = currentSegmentValue
277289 currentSegmentValue = 0
290+ currentSegmentDigits = 0
278291 i += 1
279- } else if (i == 0 && b == ':' ) {
292+ } else if (i == 0 && b == ':' && yearSign.isEmpty ) {
280293 justTime = true
294+ if (! isValidDigits(3 , currentSegmentDigits)) {
295+ return (Array .empty, None , false )
296+ }
281297 segments(3 ) = currentSegmentValue
282298 currentSegmentValue = 0
299+ currentSegmentDigits = 0
283300 i = 4
284301 } else {
285302 return (Array .empty, None , false )
286303 }
287304 } else if (i == 2 ) {
288305 if (b == ' ' || b == 'T' ) {
306+ if (! isValidDigits(i, currentSegmentDigits)) {
307+ return (Array .empty, None , false )
308+ }
289309 segments(i) = currentSegmentValue
290310 currentSegmentValue = 0
311+ currentSegmentDigits = 0
291312 i += 1
292313 } else {
293314 return (Array .empty, None , false )
294315 }
295316 } else if (i == 3 || i == 4 ) {
296317 if (b == ':' ) {
318+ if (! isValidDigits(i, currentSegmentDigits)) {
319+ return (Array .empty, None , false )
320+ }
297321 segments(i) = currentSegmentValue
298322 currentSegmentValue = 0
323+ currentSegmentDigits = 0
299324 i += 1
300325 } else {
301326 return (Array .empty, None , false )
302327 }
303328 } else if (i == 5 || i == 6 ) {
304329 if (b == '-' || b == '+' ) {
330+ if (! isValidDigits(i, currentSegmentDigits)) {
331+ return (Array .empty, None , false )
332+ }
305333 segments(i) = currentSegmentValue
306334 currentSegmentValue = 0
335+ currentSegmentDigits = 0
307336 i += 1
308337 tz = Some (new String (bytes, j, 1 ))
309338 } else if (b == '.' && i == 5 ) {
339+ if (! isValidDigits(i, currentSegmentDigits)) {
340+ return (Array .empty, None , false )
341+ }
310342 segments(i) = currentSegmentValue
311343 currentSegmentValue = 0
344+ currentSegmentDigits = 0
312345 i += 1
313346 } else {
347+ if (! isValidDigits(i, currentSegmentDigits)) {
348+ return (Array .empty, None , false )
349+ }
314350 segments(i) = currentSegmentValue
315351 currentSegmentValue = 0
352+ currentSegmentDigits = 0
316353 i += 1
317354 tz = Some (new String (bytes, j, bytes.length - j))
318355 j = bytes.length - 1
@@ -322,8 +359,12 @@ object DateTimeUtils {
322359 }
323360 } else {
324361 if (i < segments.length && (b == ':' || b == ' ' )) {
362+ if (! isValidDigits(i, currentSegmentDigits)) {
363+ return (Array .empty, None , false )
364+ }
325365 segments(i) = currentSegmentValue
326366 currentSegmentValue = 0
367+ currentSegmentDigits = 0
327368 i += 1
328369 } else {
329370 return (Array .empty, None , false )
@@ -333,61 +374,40 @@ object DateTimeUtils {
333374 if (i == 6 ) {
334375 digitsMilli += 1
335376 }
336- currentSegmentValue = currentSegmentValue * 10 + parsedValue
377+ // We will truncate the nanosecond part if there are more than 6 digits, which results
378+ // in loss of precision
379+ if (i != 6 || currentSegmentDigits < 6 ) {
380+ currentSegmentValue = currentSegmentValue * 10 + parsedValue
381+ }
382+ currentSegmentDigits += 1
337383 }
338384 j += 1
339385 }
340386
341- segments(i) = currentSegmentValue
342- if (! justTime && i == 0 && j != 4 ) {
343- // year should have exact four digits
387+ if (! isValidDigits(i, currentSegmentDigits)) {
344388 return (Array .empty, None , false )
345389 }
390+ segments(i) = currentSegmentValue
346391
347392 while (digitsMilli < 6 ) {
348393 segments(6 ) *= 10
349394 digitsMilli += 1
350395 }
351396
352- // We are truncating the nanosecond part, which results in loss of precision
353- while (digitsMilli > 6 ) {
354- segments(6 ) /= 10
355- digitsMilli -= 1
356- }
357397 // This step also validates time zone part
358398 val zoneId = tz.map {
359399 case " +" => ZoneOffset .ofHoursMinutes(segments(7 ), segments(8 ))
360400 case " -" => ZoneOffset .ofHoursMinutes(- segments(7 ), - segments(8 ))
361401 case zoneName : String => getZoneId(zoneName.trim)
362402 }
403+ segments(0 ) *= yearSign.getOrElse(1 )
363404 (segments, zoneId, justTime)
364405 }
365406
366407 /**
367408 * Trims and parses a given UTF8 timestamp string to a corresponding [[Long ]]
368- * value. The return type is [[Option ]] in order to distinguish between 0L and null. The following
369- * formats are allowed:
370- *
371- * `yyyy`
372- * `yyyy-[m]m`
373- * `yyyy-[m]m-[d]d`
374- * `yyyy-[m]m-[d]d `
375- * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
376- * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
377- * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
378- * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
379- *
380- * where `zone_id` should have one of the forms:
381- * - Z - Zulu time zone UTC+0
382- * - +|-[h]h:[m]m
383- * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
384- * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
385- * and a suffix in the formats:
386- * - +|-h[h]
387- * - +|-hh[:]mm
388- * - +|-hh:mm:ss
389- * - +|-hhmmss
390- * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
409+ * value. The return type is [[Option ]] in order to distinguish between 0L and null. Please
410+ * refer to `parseTimestampString` for the allowed formats.
391411 */
392412 def stringToTimestamp (s : UTF8String , timeZoneId : ZoneId ): Option [Long ] = {
393413 try {
@@ -422,30 +442,8 @@ object DateTimeUtils {
422442 * Trims and parses a given UTF8 string to a corresponding [[Long ]] value which represents the
423443 * number of microseconds since the epoch. The result is independent of time zones,
424444 * which means that zone ID in the input string will be ignored.
425- * The return type is [[Option ]] in order to distinguish between 0L and null. The following
426- * formats are allowed:
427- *
428- * `yyyy`
429- * `yyyy-[m]m`
430- * `yyyy-[m]m-[d]d`
431- * `yyyy-[m]m-[d]d `
432- * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
433- * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
434- *
435- * where `zone_id` should have one of the forms:
436- * - Z - Zulu time zone UTC+0
437- * - +|-[h]h:[m]m
438- * - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
439- * - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
440- * and a suffix in the formats:
441- * - +|-h[h]
442- * - +|-hh[:]mm
443- * - +|-hh:mm:ss
444- * - +|-hhmmss
445- * - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
446- *
447- * Note: The input string has to contains year/month/day fields, otherwise Spark can't determine
448- * the value of timestamp without time zone.
445+ * The return type is [[Option ]] in order to distinguish between 0L and null. Please
446+ * refer to `parseTimestampString` for the allowed formats.
449447 */
450448 def stringToTimestampWithoutTimeZone (s : UTF8String ): Option [Long ] = {
451449 try {
@@ -518,44 +516,55 @@ object DateTimeUtils {
518516 * The return type is [[Option ]] in order to distinguish between 0 and null. The following
519517 * formats are allowed:
520518 *
521- * `yyyy`
522- * `yyyy-[m]m`
523- * `yyyy-[m]m-[d]d`
524- * `yyyy-[m]m-[d]d `
525- * `yyyy-[m]m-[d]d *`
526- * `yyyy-[m]m-[d]dT*`
519+ * `[+-]yyyy*`
520+ * `[+-]yyyy*-[m]m`
521+ * `[+-]yyyy*-[m]m-[d]d`
522+ * `[+-]yyyy*-[m]m-[d]d `
523+ * `[+-]yyyy*-[m]m-[d]d *`
524+ * `[+-]yyyy*-[m]m-[d]dT*`
527525 */
528526 def stringToDate (s : UTF8String ): Option [Int ] = {
529- if (s == null ) {
527+ def isValidDigits (segment : Int , digits : Int ): Boolean = {
528+ // An integer is able to represent a date within [+-]5 million years.
529+ var maxDigitsYear = 7
530+ (segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2 )
531+ }
532+ if (s == null || s.trimAll().numBytes() == 0 ) {
530533 return None
531534 }
532535 val segments : Array [Int ] = Array [Int ](1 , 1 , 1 )
536+ var sign = 1
533537 var i = 0
534538 var currentSegmentValue = 0
539+ var currentSegmentDigits = 0
535540 val bytes = s.trimAll().getBytes
536541 var j = 0
542+ if (bytes(j) == '-' || bytes(j) == '+' ) {
543+ sign = if (bytes(j) == '-' ) - 1 else 1
544+ j += 1
545+ }
537546 while (j < bytes.length && (i < 3 && ! (bytes(j) == ' ' || bytes(j) == 'T' ))) {
538547 val b = bytes(j)
539548 if (i < 2 && b == '-' ) {
540- if (i == 0 && j != 4 ) {
541- // year should have exact four digits
549+ if (! isValidDigits(i, currentSegmentDigits)) {
542550 return None
543551 }
544552 segments(i) = currentSegmentValue
545553 currentSegmentValue = 0
554+ currentSegmentDigits = 0
546555 i += 1
547556 } else {
548557 val parsedValue = b - '0' .toByte
549558 if (parsedValue < 0 || parsedValue > 9 ) {
550559 return None
551560 } else {
552561 currentSegmentValue = currentSegmentValue * 10 + parsedValue
562+ currentSegmentDigits += 1
553563 }
554564 }
555565 j += 1
556566 }
557- if (i == 0 && j != 4 ) {
558- // year should have exact four digits
567+ if (! isValidDigits(i, currentSegmentDigits)) {
559568 return None
560569 }
561570 if (i < 2 && j < bytes.length) {
@@ -564,7 +573,7 @@ object DateTimeUtils {
564573 }
565574 segments(i) = currentSegmentValue
566575 try {
567- val localDate = LocalDate .of(segments(0 ), segments(1 ), segments(2 ))
576+ val localDate = LocalDate .of(sign * segments(0 ), segments(1 ), segments(2 ))
568577 Some (localDateToDays(localDate))
569578 } catch {
570579 case NonFatal (_) => None
0 commit comments