
Commit c34baeb

srielau and cloud-fan authored and committed
[SPARK-47719][SQL] Change spark.sql.legacy.timeParserPolicy default to CORRECTED
### What changes were proposed in this pull request?

We changed the time parser policy in Spark 3.0.0. The config has since defaulted to raising an exception if there is a potential conflict between the legacy and the new policy. Spark 4.0.0 is a good time to default to the new policy.

### Why are the changes needed?

Move the product forward and retire legacy behavior over time.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Run existing unit tests and verify changes.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #45859 from srielau/SPARK-47719-parser-policy-default-to-corrected.

Lead-authored-by: Serge Rielau <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
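For readers who want to pin the policy per session, here is a minimal sketch; only the config key and its allowed values (`LEGACY`, `CORRECTED`, `EXCEPTION`) come from this commit, while the session setup is illustrative:

```scala
// Minimal sketch, not part of the patch: pinning the time parser policy per session.
// Only the config key and the values LEGACY / CORRECTED / EXCEPTION are from this commit.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("time-parser-policy-sketch")
  .getOrCreate()

// As of this commit the default is CORRECTED. Users who relied on the strict
// cross-version check can restore the old default explicitly:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "EXCEPTION")

// Or opt into the pre-Spark-3.0 SimpleDateFormat semantics:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
```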
1 parent 6bd0ccf commit c34baeb

File tree

12 files changed (+115, -201 lines)


connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala

Lines changed: 3 additions & 1 deletion
```diff
@@ -74,7 +74,9 @@ class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with PrivateM
   for (enrichErrorEnabled <- Seq(false, true)) {
     test(s"cause exception - ${enrichErrorEnabled}") {
-      withSQLConf("spark.sql.connect.enrichError.enabled" -> enrichErrorEnabled.toString) {
+      withSQLConf(
+        "spark.sql.connect.enrichError.enabled" -> enrichErrorEnabled.toString,
+        "spark.sql.legacy.timeParserPolicy" -> "EXCEPTION") {
         val ex = intercept[SparkUpgradeException] {
           spark
             .sql("""
```

docs/sql-migration-guide.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -46,6 +46,8 @@ license: |
 - Since Spark 4.0, MySQL JDBC datasource will read FLOAT as FloatType, while in Spark 3.5 and previous, it was read as DoubleType. To restore the previous behavior, you can cast the column to the old type.
 - Since Spark 4.0, MySQL JDBC datasource will read BIT(n > 1) as BinaryType, while in Spark 3.5 and previous, read as LongType. To restore the previous behavior, set `spark.sql.legacy.mysql.bitArrayMapping.enabled` to `true`.
 - Since Spark 4.0, MySQL JDBC datasource will write ShortType as SMALLINT, while in Spark 3.5 and previous, write as INTEGER. To restore the previous behavior, you can replace the column with IntegerType whenever before writing.
+- Since Spark 4.0, the default value for `spark.sql.legacy.ctePrecedencePolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an error, inner CTE definitions take precedence over outer definitions.
+- Since Spark 4.0, the default value for `spark.sql.legacy.timeParserPolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an `INCONSISTENT_BEHAVIOR_CROSS_VERSION` error, `CANNOT_PARSE_TIMESTAMP` will be raised if ANSI mode is enabled, and `NULL` will be returned if ANSI mode is disabled. See [Datetime Patterns for Formatting and Parsing](sql-ref-datetime-pattern.html).

 ## Upgrading from Spark SQL 3.5.1 to 3.5.2
```
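To make the new default concrete, here is a short sketch of the behavior the added note describes, assuming a plain Spark 4.0 session; the query and the error class come from the golden-file changes further down, while the session setup is assumed:

```scala
// Sketch of the CORRECTED behavior under the new default (session setup assumed).
spark.conf.set("spark.sql.ansi.enabled", "false")
// With ANSI mode disabled, an unparseable input now yields NULL instead of an error:
spark.sql("select to_timestamp('1', 'yy')").show()

spark.conf.set("spark.sql.ansi.enabled", "true")
// With ANSI mode enabled, the same query raises CANNOT_PARSE_TIMESTAMP
// (org.apache.spark.SparkDateTimeException) rather than the old
// INCONSISTENT_BEHAVIOR_CROSS_VERSION upgrade error:
spark.sql("select to_timestamp('1', 'yy')").show()
```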

python/pyspark/sql/tests/connect/test_connect_session.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -130,6 +130,7 @@ def test_error_enrichment_jvm_stacktrace(self):
             {
                 "spark.sql.connect.enrichError.enabled": True,
                 "spark.sql.pyspark.jvmStacktrace.enabled": False,
+                "spark.sql.legacy.timeParserPolicy": "EXCEPTION",
             }
         ):
             with self.sql_conf({"spark.sql.connect.serverStacktrace.enabled": False}):
```

sql/api/src/main/scala/org/apache/spark/sql/internal/SqlApiConf.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -79,6 +79,6 @@ private[sql] object DefaultSqlApiConf extends SqlApiConf {
   override def charVarcharAsString: Boolean = false
   override def datetimeJava8ApiEnabled: Boolean = false
   override def sessionLocalTimeZone: String = TimeZone.getDefault.getID
-  override def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value = LegacyBehaviorPolicy.EXCEPTION
+  override def legacyTimeParserPolicy: LegacyBehaviorPolicy.Value = LegacyBehaviorPolicy.CORRECTED
   override def defaultStringType: StringType = StringType
 }
```

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 3 additions & 3 deletions
```diff
@@ -4027,13 +4027,13 @@ object SQLConf {
       .doc("When LEGACY, java.text.SimpleDateFormat is used for formatting and parsing " +
         "dates/timestamps in a locale-sensitive manner, which is the approach before Spark 3.0. " +
         "When set to CORRECTED, classes from java.time.* packages are used for the same purpose. " +
-        "The default value is EXCEPTION, RuntimeException is thrown when we will get different " +
-        "results.")
+        "When set to EXCEPTION, a RuntimeException is thrown when we would get different " +
+        "results. The default is CORRECTED.")
       .version("3.0.0")
       .stringConf
       .transform(_.toUpperCase(Locale.ROOT))
       .checkValues(LegacyBehaviorPolicy.values.map(_.toString))
-      .createWithDefault(LegacyBehaviorPolicy.EXCEPTION.toString)
+      .createWithDefault(LegacyBehaviorPolicy.CORRECTED.toString)

     val LEGACY_ARRAY_EXISTS_FOLLOWS_THREE_VALUED_LOGIC =
       buildConf("spark.sql.legacy.followThreeValuedLogicInArrayExists")
```
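One detail worth noting in the builder chain above: `.transform(_.toUpperCase(Locale.ROOT))` runs before `.checkValues(...)`, so the accepted values are case-insensitive. A hedged sketch of the consequence for users:

```scala
// Sketch: the conf value is upper-cased before validation, so lowercase spellings work.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "corrected") // normalized to CORRECTED

// A value outside LegacyBehaviorPolicy, e.g. "lenient", would be rejected by checkValues.
```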

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateFormatterSuite.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -185,7 +185,7 @@ class DateFormatterSuite extends DatetimeFormatterSuite {
     val formatter = DateFormatter("MM-dd")
     // The date parser in 2.4 accepts 1970-02-29 and turn it into 1970-03-01, so we should get a
     // SparkUpgradeException here.
-    intercept[SparkUpgradeException](formatter.parse("02-29"))
+    intercept[DateTimeException](formatter.parse("02-29"))
   }

   test("SPARK-36418: default parsing w/o pattern") {
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DatetimeFormatterSuite.scala

Lines changed: 31 additions & 28 deletions
```diff
@@ -24,6 +24,7 @@ import org.scalatest.matchers.must.Matchers
 import org.apache.spark.{SparkFunSuite, SparkIllegalArgumentException, SparkUpgradeException}
 import org.apache.spark.sql.catalyst.plans.SQLHelper
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{date, UTC}
+import org.apache.spark.sql.internal.SQLConf

 trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers {
   import DateTimeFormatterHelper._
@@ -99,34 +100,36 @@ trait DatetimeFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
   }

   test("SPARK-31939: Fix Parsing day of year when year field pattern is missing") {
-    // resolved to queryable LocaleDate or fail directly
-    assertEqual("yyyy-dd-DD", "2020-29-60", date(2020, 2, 29))
-    assertError("yyyy-dd-DD", "2020-02-60",
-      "Field DayOfMonth 29 differs from DayOfMonth 2 derived from 2020-02-29")
-    assertEqual("yyyy-MM-DD", "2020-02-60", date(2020, 2, 29))
-    assertError("yyyy-MM-DD", "2020-03-60",
-      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 2020-02-29")
-    assertEqual("yyyy-MM-dd-DD", "2020-02-29-60", date(2020, 2, 29))
-    assertError("yyyy-MM-dd-DD", "2020-03-01-60",
-      "Field DayOfYear 61 differs from DayOfYear 60 derived from 2020-03-01")
-    assertEqual("yyyy-DDD", "2020-366", date(2020, 12, 31))
-    assertError("yyyy-DDD", "2019-366",
-      "Invalid date 'DayOfYear 366' as '2019' is not a leap year")
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "EXCEPTION") {
+      // resolved to queryable LocaleDate or fail directly
+      assertEqual("yyyy-dd-DD", "2020-29-60", date(2020, 2, 29))
+      assertError("yyyy-dd-DD", "2020-02-60",
+        "Field DayOfMonth 29 differs from DayOfMonth 2 derived from 2020-02-29")
+      assertEqual("yyyy-MM-DD", "2020-02-60", date(2020, 2, 29))
+      assertError("yyyy-MM-DD", "2020-03-60",
+        "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 2020-02-29")
+      assertEqual("yyyy-MM-dd-DD", "2020-02-29-60", date(2020, 2, 29))
+      assertError("yyyy-MM-dd-DD", "2020-03-01-60",
+        "Field DayOfYear 61 differs from DayOfYear 60 derived from 2020-03-01")
+      assertEqual("yyyy-DDD", "2020-366", date(2020, 12, 31))
+      assertError("yyyy-DDD", "2019-366",
+        "Invalid date 'DayOfYear 366' as '2019' is not a leap year")

-    // unresolved and need to check manually(SPARK-31939 fixed)
-    assertEqual("DDD", "365", date(1970, 12, 31))
-    assertError("DDD", "366",
-      "Invalid date 'DayOfYear 366' as '1970' is not a leap year")
-    assertEqual("MM-DD", "03-60", date(1970, 3))
-    assertError("MM-DD", "02-60",
-      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
-    assertEqual("MM-dd-DD", "02-28-59", date(1970, 2, 28))
-    assertError("MM-dd-DD", "02-28-60",
-      "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
-    assertError("MM-dd-DD", "02-28-58",
-      "Field DayOfMonth 28 differs from DayOfMonth 27 derived from 1970-02-27")
-    assertEqual("dd-DD", "28-59", date(1970, 2, 28))
-    assertError("dd-DD", "27-59",
-      "Field DayOfMonth 27 differs from DayOfMonth 28 derived from 1970-02-28")
+      // unresolved and need to check manually(SPARK-31939 fixed)
+      assertEqual("DDD", "365", date(1970, 12, 31))
+      assertError("DDD", "366",
+        "Invalid date 'DayOfYear 366' as '1970' is not a leap year")
+      assertEqual("MM-DD", "03-60", date(1970, 3))
+      assertError("MM-DD", "02-60",
+        "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
+      assertEqual("MM-dd-DD", "02-28-59", date(1970, 2, 28))
+      assertError("MM-dd-DD", "02-28-60",
+        "Field MonthOfYear 2 differs from MonthOfYear 3 derived from 1970-03-01")
+      assertError("MM-dd-DD", "02-28-58",
+        "Field DayOfMonth 28 differs from DayOfMonth 27 derived from 1970-02-27")
+      assertEqual("dd-DD", "28-59", date(1970, 2, 28))
+      assertError("dd-DD", "27-59",
+        "Field DayOfMonth 27 differs from DayOfMonth 28 derived from 1970-02-28")
+    }
   }
 }
```
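The pattern applied throughout these test changes: `withSQLConf` (from `SQLHelper`) pins a config for the duration of a block and restores the previous value afterwards, so assertions that depend on the strict EXCEPTION policy keep passing under the new CORRECTED default. A minimal sketch, with the assertion body assumed:

```scala
// Sketch only: withSQLConf and the config key are from this patch; the body is a placeholder.
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "EXCEPTION") {
  // assertions that rely on the strict cross-version check, e.g.
  // intercept[SparkUpgradeException](DateFormatter("MM-dd").parse("02-29"))
}
```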

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/TimestampFormatterSuite.scala

Lines changed: 19 additions & 17 deletions
```diff
@@ -36,23 +36,25 @@ class TimestampFormatterSuite extends DatetimeFormatterSuite {
   override protected def useDateFormatter: Boolean = false

   test("parsing timestamps using time zones") {
-    val localDate = "2018-12-02T10:11:12.001234"
-    val expectedMicros = Map(
-      "UTC" -> 1543745472001234L,
-      PST.getId -> 1543774272001234L,
-      CET.getId -> 1543741872001234L,
-      "Africa/Dakar" -> 1543745472001234L,
-      "America/Los_Angeles" -> 1543774272001234L,
-      "Asia/Urumqi" -> 1543723872001234L,
-      "Asia/Hong_Kong" -> 1543716672001234L,
-      "Europe/Brussels" -> 1543741872001234L)
-    outstandingTimezonesIds.foreach { zoneId =>
-      val formatter = TimestampFormatter(
-        "yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
-        getZoneId(zoneId),
-        isParsing = true)
-      val microsSinceEpoch = formatter.parse(localDate)
-      assert(microsSinceEpoch === expectedMicros(zoneId))
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> "EXCEPTION") {
+      val localDate = "2018-12-02T10:11:12.001234"
+      val expectedMicros = Map(
+        "UTC" -> 1543745472001234L,
+        PST.getId -> 1543774272001234L,
+        CET.getId -> 1543741872001234L,
+        "Africa/Dakar" -> 1543745472001234L,
+        "America/Los_Angeles" -> 1543774272001234L,
+        "Asia/Urumqi" -> 1543723872001234L,
+        "Asia/Hong_Kong" -> 1543716672001234L,
+        "Europe/Brussels" -> 1543741872001234L)
+      outstandingTimezonesIds.foreach { zoneId =>
+        val formatter = TimestampFormatter(
+          "yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
+          getZoneId(zoneId),
+          isParsing = true)
+        val microsSinceEpoch = formatter.parse(localDate)
+        assert(microsSinceEpoch === expectedMicros(zoneId))
+      }
     }
   }

```
sql/core/src/test/resources/sql-tests/results/ansi/datetime-parsing-invalid.sql.out

Lines changed: 32 additions & 40 deletions
```diff
@@ -13,13 +13,13 @@ select to_timestamp('1', 'yy')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'1'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '1' could not be parsed at index 0"
   }
 }

@@ -45,13 +45,13 @@ select to_timestamp('123', 'yy')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'123'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '123' could not be parsed, unparsed text found at index 2"
   }
 }

@@ -61,13 +61,13 @@ select to_timestamp('1', 'yyy')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'1'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '1' could not be parsed at index 0"
   }
 }

@@ -110,13 +110,13 @@ select to_timestamp('9', 'DD')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'9'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '9' could not be parsed at index 0"
   }
 }

@@ -142,13 +142,13 @@ select to_timestamp('9', 'DDD')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'9'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '9' could not be parsed at index 0"
   }
 }

@@ -158,13 +158,13 @@ select to_timestamp('99', 'DDD')
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.SparkUpgradeException
+org.apache.spark.SparkDateTimeException
 {
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
+  "errorClass" : "CANNOT_PARSE_TIMESTAMP",
+  "sqlState" : "22007",
   "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'99'"
+    "ansiConfig" : "\"spark.sql.ansi.enabled\"",
+    "message" : "Text '99' could not be parsed at index 0"
   }
 }

@@ -284,17 +284,9 @@ org.apache.spark.SparkDateTimeException
 -- !query
 select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))
 -- !query schema
-struct<>
+struct<from_csv(2018-366):struct<date:date>>
 -- !query output
-org.apache.spark.SparkUpgradeException
-{
-  "errorClass" : "INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER",
-  "sqlState" : "42K0B",
-  "messageParameters" : {
-    "config" : "\"spark.sql.legacy.timeParserPolicy\"",
-    "datetime" : "'2018-366'"
-  }
-}
+{"date":null}


 -- !query
```
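The last hunk shows the user-visible effect on lenient parsing paths: under the new CORRECTED default, `from_csv` no longer raises `SparkUpgradeException` for a date only the legacy parser accepted; the field simply comes back null. A sketch of the same call from Scala (the expected row comes from the updated golden file; the session is assumed):

```scala
// Sketch: same query as the golden file above, issued through a Spark session.
val df = spark.sql(
  "select from_csv('2018-366', 'date Date', map('dateFormat', 'yyyy-DDD'))")
df.show(truncate = false)
// Expected per the updated golden file: a struct with a null date field, i.e. {"date":null}
```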
