From 49061b908f8cb39236467e6837cbc8b19b25368c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 11 Jul 2019 22:50:22 +0200 Subject: [PATCH 1/6] Use Java 8 API to add months --- .../spark/sql/catalyst/util/DateTimeUtils.scala | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 7017aae9ad52..e54e6bd4a36c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -543,22 +543,7 @@ object DateTimeUtils { * Returns a date value, expressed in days since 1.1.1970. */ def dateAddMonths(days: SQLDate, months: Int): SQLDate = { - val (year, monthInYear, dayOfMonth, daysToMonthEnd) = splitDate(days) - val absoluteMonth = (year - YearZero) * 12 + monthInYear - 1 + months - val nonNegativeMonth = if (absoluteMonth >= 0) absoluteMonth else 0 - val currentMonthInYear = nonNegativeMonth % 12 - val currentYear = nonNegativeMonth / 12 - - val leapDay = if (currentMonthInYear == 1 && isLeap(currentYear + YearZero)) 1 else 0 - val lastDayOfMonth = monthDays(currentMonthInYear) + leapDay - - val currentDayInMonth = if (daysToMonthEnd == 0 || dayOfMonth >= lastDayOfMonth) { - // last day of the month - lastDayOfMonth - } else { - dayOfMonth - } - firstDayOfMonth(nonNegativeMonth) + currentDayInMonth - 1 + LocalDate.ofEpochDay(days).plusMonths(months).toEpochDay.toInt } /** From 93a16aee02cff9e7d118239f9f3e2d40eec55fd2 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 11 Jul 2019 22:50:35 +0200 Subject: [PATCH 2/6] Adjust tests --- .../sql/catalyst/expressions/DateExpressionsSuite.scala | 2 +- .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 8 ++++---- .../scala/org/apache/spark/sql/DateFunctionsSuite.scala | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 04bb61a7486e..9006dc45702a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -463,7 +463,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(AddMonths(Literal.create(null, DateType), Literal.create(null, IntegerType)), null) checkEvaluation( - AddMonths(Literal(Date.valueOf("2015-01-30")), Literal(Int.MinValue)), -7293498) + AddMonths(Literal(Date.valueOf("2015-01-30")), Literal(Int.MinValue)), -938165455) checkEvaluation( AddMonths(Literal(Date.valueOf("2016-02-28")), positiveIntLit), 1014213) checkEvaluation( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index c77c9aec6887..4f8353922319 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -359,18 +359,18 @@ class DateTimeUtilsSuite extends SparkFunSuite { test("date add months") { val input = days(1997, 2, 28, 10, 30) - assert(dateAddMonths(input, 36) 
=== days(2000, 2, 29)) - assert(dateAddMonths(input, -13) === days(1996, 1, 31)) + assert(dateAddMonths(input, 36) === days(2000, 2, 28)) + assert(dateAddMonths(input, -13) === days(1996, 1, 28)) } test("timestamp add months") { val ts1 = date(1997, 2, 28, 10, 30, 0) - val ts2 = date(2000, 2, 29, 10, 30, 0, 123000) + val ts2 = date(2000, 2, 28, 10, 30, 0, 123000) assert(timestampAddInterval(ts1, 36, 123000, defaultTz) === ts2) val ts3 = date(1997, 2, 27, 16, 0, 0, 0, TimeZonePST) val ts4 = date(2000, 2, 27, 16, 0, 0, 123000, TimeZonePST) - val ts5 = date(2000, 2, 29, 0, 0, 0, 123000, TimeZoneGMT) + val ts5 = date(2000, 2, 28, 0, 0, 0, 123000, TimeZoneGMT) assert(timestampAddInterval(ts3, 36, 123000, TimeZonePST) === ts4) assert(timestampAddInterval(ts3, 36, 123000, TimeZoneGMT) === ts5) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 3f91b91850e8..ff48ac8d7a6c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -301,11 +301,11 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { val df = Seq((1, t1, d1), (3, t2, d2)).toDF("n", "t", "d") checkAnswer( df.selectExpr(s"d - $i"), - Seq(Row(Date.valueOf("2015-07-30")), Row(Date.valueOf("2015-12-30")))) + Seq(Row(Date.valueOf("2015-07-29")), Row(Date.valueOf("2015-12-28")))) checkAnswer( df.selectExpr(s"t - $i"), Seq(Row(Timestamp.valueOf("2015-07-31 23:59:59")), - Row(Timestamp.valueOf("2015-12-31 00:00:00")))) + Row(Timestamp.valueOf("2015-12-29 00:00:00")))) } test("function add_months") { @@ -314,10 +314,10 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { val df = Seq((1, d1), (2, d2)).toDF("n", "d") checkAnswer( df.select(add_months(col("d"), 1)), - Seq(Row(Date.valueOf("2015-09-30")), Row(Date.valueOf("2015-03-31")))) + Seq(Row(Date.valueOf("2015-09-30")), Row(Date.valueOf("2015-03-28")))) checkAnswer( df.selectExpr("add_months(d, -1)"), - Seq(Row(Date.valueOf("2015-07-31")), Row(Date.valueOf("2015-01-31")))) + Seq(Row(Date.valueOf("2015-07-31")), Row(Date.valueOf("2015-01-28")))) } test("function months_between") { From 746de08b7857dc6bf2d955e5d55206aa00503f27 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 12 Jul 2019 09:34:21 +0200 Subject: [PATCH 3/6] Remove unused methods --- .../sql/catalyst/util/DateTimeUtils.scala | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index e54e6bd4a36c..1daf65a0c560 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -505,39 +505,6 @@ object DateTimeUtils { LocalDate.ofEpochDay(date).getDayOfMonth } - /** - * The number of days for each month (not leap year) - */ - private val monthDays = Array(31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) - - /** - * Returns the date value for the first day of the given month. - * The month is expressed in months since year zero (17999 BC), starting from 0. 
-   */
-  private def firstDayOfMonth(absoluteMonth: Int): SQLDate = {
-    val absoluteYear = absoluteMonth / 12
-    var monthInYear = absoluteMonth - absoluteYear * 12
-    var date = getDateFromYear(absoluteYear)
-    if (monthInYear >= 2 && isLeap(absoluteYear + YearZero)) {
-      date += 1
-    }
-    while (monthInYear > 0) {
-      date += monthDays(monthInYear - 1)
-      monthInYear -= 1
-    }
-    date
-  }
-
-  /**
-   * Returns the date value for January 1 of the given year.
-   * The year is expressed in years since year zero (17999 BC), starting from 0.
-   */
-  private def getDateFromYear(absoluteYear: Int): SQLDate = {
-    val absoluteDays = (absoluteYear * 365 + absoluteYear / 400 - absoluteYear / 100
-      + absoluteYear / 4)
-    absoluteDays - toYearZero
-  }
-
   /**
    * Add date and year-month interval.
    * Returns a date value, expressed in days since 1.1.1970.

From 42d8e2dd4324b1935cbb2aab066036043c57fd08 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Sun, 14 Jul 2019 21:14:29 +0300
Subject: [PATCH 4/6] Update the SQL migration guide

---
 docs/sql-migration-guide-upgrade.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index d39bd933427f..e0dd04f57023 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -149,6 +149,8 @@ license: |

 - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time.

+ - Since Spark 3.0, the `add_months` function does not adjust the resulting date to the last day of the month if the original date is the last day of the month. The resulting date is adjusted to the last day of the month only if it is invalid. For example, `select add_months(DATE'2019-02-28', 1)` produces `2019-03-28`, but `select add_months(DATE'2019-01-31', 1)` produces `2019-02-28`.
+
 ## Upgrading from Spark SQL 2.4 to 2.4.1

 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was

From 52942045940353976e2bdc62faca3fef23df5753 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Sun, 14 Jul 2019 23:37:38 +0300
Subject: [PATCH 5/6] Calc next elem as timestampAddInterval(start, i * stepMonths, i * stepMicros)

---
 .../catalyst/expressions/collectionOperations.scala | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 8477e63135e3..f671ede21782 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -2641,8 +2641,8 @@ object Sequence {

       while (t < exclusiveItem ^ stepSign < 0) {
         arr(i) = fromLong(t / scale)
-        t = timestampAddInterval(t, stepMonths, stepMicros, timeZone)
         i += 1
+        t = timestampAddInterval(startMicros, i * stepMonths, i * stepMicros, timeZone)
       }

       // truncate array to the correct length
@@ -2676,12 +2676,6 @@ object Sequence {
         |${genSequenceLengthCode(ctx, startMicros, stopMicros, intervalInMicros, arrLength)}
       """.stripMargin

-    val timestampAddIntervalCode =
-      s"""
-         |$t = org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampAddInterval(
-         |  $t, $stepMonths, $stepMicros, $genTimeZone);
-       """.stripMargin
-
     s"""
        |final int $stepMonths = $step.months;
        |final long $stepMicros = $step.microseconds;
@@ -2705,8 +2699,9 @@ object Sequence {

        |  while ($t < $exclusiveItem ^ $stepSign < 0) {
        |    $arr[$i] = ($elemType) ($t / ${scale}L);
-       |    $timestampAddIntervalCode
        |    $i += 1;
+       |    $t = org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampAddInterval(
+       |      $startMicros, $i * $stepMonths, $i * $stepMicros, $genTimeZone);
        |  }
        |
        |  if ($arr.length > $i) {

From f1a4241cf9cf175648f5b314b832124dfccdce01 Mon Sep 17 00:00:00 2001
From: Maxim Gekk
Date: Sun, 14 Jul 2019 23:46:31 +0300
Subject: [PATCH 6/6] Improve the SQL migration guide.

---
 docs/sql-migration-guide-upgrade.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index f136142e30c6..f13d298674b2 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -151,7 +151,7 @@ license: |

 - Since Spark 3.0, substitution order of nested WITH clauses is changed and an inner CTE definition takes precedence over an outer. In version 2.4 and earlier, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `1` while in version 3.0 it returns `2`. The previous behaviour can be restored by setting `spark.sql.legacy.ctePrecedence.enabled` to `true`.

- - Since Spark 3.0, the `add_months` function does not adjust the resulting date to the last day of the month if the original date is the last day of the month. The resulting date is adjusted to the last day of the month only if it is invalid. For example, `select add_months(DATE'2019-02-28', 1)` produces `2019-03-28`, but `select add_months(DATE'2019-01-31', 1)` produces `2019-02-28`.
+ - Since Spark 3.0, the `add_months` function adjusts the resulting date to the last day of the month only if it is invalid. For example, `select add_months(DATE'2019-01-31', 1)` results in `2019-02-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when it is invalid or when the original date is the last day of a month. For example, adding a month to `2019-02-28` results in `2019-03-31`.

 ## Upgrading from Spark SQL 2.4 to 2.4.1
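
Note on PATCH 1/6: `java.time.LocalDate.plusMonths` clamps the day-of-month only when the shifted date would otherwise be invalid, which is exactly the behavior change described in the migration guide. A minimal standalone sketch (not part of the patch series) of the new semantics:

    import java.time.LocalDate

    object AddMonthsDemo {
      // Mirrors the new dateAddMonths: epoch days shifted by a month count.
      def dateAddMonths(days: Int, months: Int): Int =
        LocalDate.ofEpochDay(days).plusMonths(months).toEpochDay.toInt

      def main(args: Array[String]): Unit = {
        // 2019-03-28 is a valid date, so the day-of-month is kept as-is:
        println(LocalDate.parse("2019-02-28").plusMonths(1)) // 2019-03-28
        // 2019-02-31 would be invalid, so the day is clamped to the month end:
        println(LocalDate.parse("2019-01-31").plusMonths(1)) // 2019-02-28
        // Round-trip through epoch days, as the patched dateAddMonths does:
        println(dateAddMonths(LocalDate.parse("2015-01-30").toEpochDay.toInt, 1)) // epoch day of 2015-02-28
      }
    }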
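
Note on PATCH 5/6: deriving element `i` directly from the start value keeps month-end clamping from compounding across iterations, which accumulating the step would allow. An illustrative sketch of the difference, using `LocalDate` as a stand-in for `timestampAddInterval` (helper names are mine, not Spark's):

    import java.time.LocalDate

    object SequenceStepDemo {
      // Old approach: add the step to the previous element; once the day is
      // clamped to a month end (2019-02-28), the clamping sticks forever.
      def byAccumulation(start: LocalDate, stepMonths: Int, n: Int): List[LocalDate] =
        Iterator.iterate(start)(_.plusMonths(stepMonths)).take(n).toList

      // New approach (what the patch does via timestampAddInterval(startMicros,
      // i * stepMonths, i * stepMicros, timeZone)): derive element i from the start.
      def fromStart(start: LocalDate, stepMonths: Int, n: Int): List[LocalDate] =
        (0 until n).map(i => start.plusMonths(i.toLong * stepMonths)).toList

      def main(args: Array[String]): Unit = {
        val start = LocalDate.parse("2019-01-31")
        println(byAccumulation(start, 1, 3)) // List(2019-01-31, 2019-02-28, 2019-03-28)
        println(fromStart(start, 1, 3))      // List(2019-01-31, 2019-02-28, 2019-03-31)
      }
    }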