Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9546a0f
[SPARK-35111][SQL] Support Cast string to year-month interval
AngersZhuuuu Apr 21, 2021
691c1f4
Update CastSuite.scala
AngersZhuuuu Apr 21, 2021
f5b02ee
Update CastSuite.scala
AngersZhuuuu Apr 22, 2021
15424a7
Update IntervalUtils.scala
AngersZhuuuu Apr 22, 2021
879817b
Update CastSuite.scala
AngersZhuuuu Apr 22, 2021
62d175b
Update CastSuite.scala
AngersZhuuuu Apr 22, 2021
2c75bba
save
AngersZhuuuu Apr 22, 2021
5b134fa
Merge branch 'master' into SPARK-SPARK-35111
AngersZhuuuu Apr 22, 2021
6d14414
Update CastSuite.scala
AngersZhuuuu Apr 22, 2021
d19bbc8
Update Cast.scala
AngersZhuuuu Apr 23, 2021
ff904a1
save
AngersZhuuuu Apr 25, 2021
d0e30e4
Merge branch 'master' into SPARK-SPARK-35111
AngersZhuuuu Apr 25, 2021
3b84baa
follow comment
AngersZhuuuu Apr 25, 2021
b05f7e6
update
AngersZhuuuu Apr 28, 2021
25c08e0
Update IntervalUtils.scala
AngersZhuuuu Apr 28, 2021
f636d41
update
AngersZhuuuu Apr 28, 2021
ce69004
Update CastSuite.scala
AngersZhuuuu Apr 28, 2021
092d01a
Update IntervalUtils.scala
AngersZhuuuu Apr 28, 2021
f088f64
Update IntervalUtils.scala
AngersZhuuuu Apr 28, 2021
80499b8
Update IntervalUtils.scala
AngersZhuuuu Apr 29, 2021
ca19c09
Update IntervalUtils.scala
AngersZhuuuu Apr 29, 2021
3df92b6
Update Cast.scala
AngersZhuuuu Apr 29, 2021
3adde87
Update IntervalUtils.scala
AngersZhuuuu Apr 29, 2021
0f82987
follow comment
AngersZhuuuu Apr 29, 2021
2c8785b
Update CastSuite.scala
AngersZhuuuu Apr 29, 2021
253c70e
follow comment
AngersZhuuuu Apr 29, 2021
9c70b88
Update IntervalUtils.scala
AngersZhuuuu Apr 29, 2021
5ca83ab
update
AngersZhuuuu Apr 30, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
[SPARK-35111][SQL] Support Cast string to year-month interval
  • Loading branch information
AngersZhuuuu committed Apr 21, 2021
commit 9546a0f22f62e7017d73530cc691223e0ca85587
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ object Cast {
case (TimestampType, DateType) => true

case (StringType, CalendarIntervalType) => true
case (StringType, YearMonthIntervalType) => true

case (StringType, _: NumericType) => true
case (BooleanType, _: NumericType) => true
Expand Down Expand Up @@ -533,6 +534,11 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
buildCast[UTF8String](_, s => IntervalUtils.safeStringToInterval(s))
}

private[this] def castToYearMonthInterval(from: DataType): Any => Any = from match {
case StringType =>
buildCast[UTF8String](_, s => IntervalUtils.safeFromYearMonthString(s).orNull)
}

// LongConverter
private[this] def castToLong(from: DataType): Any => Any = from match {
case StringType if ansiEnabled =>
Expand Down Expand Up @@ -837,6 +843,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case decimal: DecimalType => castToDecimal(from, decimal)
case TimestampType => castToTimestamp(from)
case CalendarIntervalType => castToInterval(from)
case YearMonthIntervalType => castToYearMonthInterval(from)
case BooleanType => castToBoolean(from)
case ByteType => castToByte(from)
case ShortType => castToShort(from)
Expand Down Expand Up @@ -895,6 +902,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
case decimal: DecimalType => castToDecimalCode(from, decimal, ctx)
case TimestampType => castToTimestampCode(from, ctx)
case CalendarIntervalType => castToIntervalCode(from)
case YearMonthIntervalType => castToYearMonthIntervalCode(from, ctx)
case BooleanType => castToBooleanCode(from)
case ByteType => castToByteCode(from, ctx)
case ShortType => castToShortCode(from, ctx)
Expand Down Expand Up @@ -1353,6 +1361,23 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit

}

private[this] def castToYearMonthIntervalCode(
from: DataType,
ctx: CodegenContext): CastFunction = from match {
case StringType =>
val util = IntervalUtils.getClass.getCanonicalName.stripSuffix("$")
val intOpt = ctx.freshVariable("intOpt", classOf[Option[Int]])
(c, evPrim, evNull) =>
code"""
scala.Option<Integer> $intOpt = $util.safeFromYearMonthString($c);
if ($intOpt.isDefined()) {
$evPrim = ((Integer) $intOpt.get()).intValue();
} else {
$evNull = true;
}
""".stripMargin
}

private[this] def decimalToTimestampCode(d: ExprValue): Block = {
val block = inline"new java.math.BigDecimal($MICROS_PER_SECOND)"
code"($d.toBigDecimal().bigDecimal().multiply($block)).longValue()"
Expand Down Expand Up @@ -1912,6 +1937,7 @@ object AnsiCast {
case (DateType, TimestampType) => true

case (StringType, _: CalendarIntervalType) => true
case (StringType, YearMonthIntervalType) => true

case (StringType, DateType) => true
case (TimestampType, DateType) => true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,24 @@ object IntervalUtils {

private val yearMonthPattern = "^([+|-])?(\\d+)-(\\d+)$".r

def safeFromYearMonthString(input: UTF8String): Option[Int] = {
try {
if (input == null || input.toString == null) {
throw new IllegalArgumentException("Interval year-month string must be not null")
} else {
val regex = "INTERVAL '([-|+]?[0-9]+-[-|+]?[0-9]+)' YEAR TO MONTH".r
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The keywords INTERVAL and YEAR TO MONTH are not mandatory by SQL standard. We should be able to parse only payload too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also could you move the regexp out of the method. So, we can avoid its "compilation" per every call.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding to the pattern '([-|+]?[0-9]+-[-|+]?[0-9]+)', should we handle the second sign before month? Take a look at the val above yearMonthPattern

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Regarding to the pattern '([-|+]?[0-9]+-[-|+]?[0-9]+)', should we handle the second sign before month? Take a look at the val above yearMonthPattern

How about current

Copy link
Contributor Author

@AngersZhuuuu AngersZhuuuu Apr 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MaxGekk Can you help to list the case we need to support here,

INTERVAL [-|+]'[+|-]0-0' YEAR TO MONTH
[+|-]0-0

Any more case need to support here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

First of all, we must support our own output (in case insensitive manner by default but ideally we should respect the SQL config spark.sql.caseSensitive):

  1. ANSI_STYLE, like
    INTERVAL -'-10-1' YEAR TO MONTH
  2. HIVE_STYLE like
    10-1 or -10-1

Rules from the SQL standard:

<interval literal> ::=
  INTERVAL [ <sign> ] <interval string> <interval qualifier>
<interval string> ::=
  <quote> <unquoted interval string> <quote>
<unquoted interval string> ::=
  [ <sign> ] { <year-month literal> | <day-time literal> }
<year-month literal> ::=
  <years value> [ <minus sign> <months value> ]
  | <months value>
<years value> ::=
  <datetime value>
<months value> ::=
  <datetime value>
<datetime value> ::=
  <unsigned integer>
<unsigned integer> ::= <digit>...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but ideally we should respect the SQL config spark.sql.caseSensitive):

Spark Catalyst parser not respect spark.sql.caseSensitive too, should we respect it here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For parsing I think it's OK to be always case insensitive. The Spark SQL parser is case insensitive.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, how about current approach

// scalastyle:off caselocale .toLowerCase
val intervalString = input.trimAll().toUpperCase.toString
// scalastyle:on
val interval = regex.findFirstMatchIn(intervalString)
.map(_.group(1)).getOrElse(intervalString)
Some(fromYearMonthString(interval).months)
}
} catch {
case _: IllegalArgumentException => None
}
}

/**
* Parse YearMonth string in form: [+|-]YYYY-MM
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1774,6 +1774,24 @@ class CastSuite extends CastSuiteBase {
assert(e3.contains("Casting 2147483648 to int causes overflow"))
}
}

test("SPARK-35111: Cast string to year-month interval") {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add round trip tests: string -> year-month interval -> string, and year-month interval -> string -> year-month interval

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you test corner cases when arithmetic overflow happens. Also test upper and lower cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you test corner cases when arithmetic overflow happens. Also test upper and lower cases.

Current test is not correct according to #32281, will update after #32281 merged

Copy link
Member

@MaxGekk MaxGekk Apr 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, check the input in lower case. For example:

    checkEvaluation(cast(Literal.create("  interval '1-0' YEAR TO MONTH  "),
      YearMonthIntervalType), 12)

val regex = "INTERVAL '([-|+]?[0-9]+-[-|+]?[0-9]+)' YEAR TO MONTH".r
val m = regex.findFirstMatchIn("INTERVAL '0-0' YEAR TO MONTH")
println(m)
println(m.map(_.group(1)))

checkEvaluation(cast(Literal.create("INTERVAL '0-0' YEAR TO MONTH"),
YearMonthIntervalType), 0)
checkEvaluation(cast(Literal.create("0-0"), YearMonthIntervalType), 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is duplicated

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

checkEvaluation(cast(Literal.create("INTERVAL '-1-0' YEAR TO MONTH"),
YearMonthIntervalType), -12)
checkEvaluation(cast(Literal.create("-1-0"), YearMonthIntervalType), -12)
checkEvaluation(cast(Literal.create("INTERVAL '10-1' YEAR TO MONTH"),
YearMonthIntervalType), 121)
checkEvaluation(cast(Literal.create("10-1"), YearMonthIntervalType), 121)
checkEvaluation(cast(Literal.create("null"), YearMonthIntervalType), null)
}
}

/**
Expand Down