
Commit e1170d0

allow legacy parser with inferDate
1 parent 41fa8eb commit e1170d0

9 files changed: +44 additions, -63 deletions

core/src/main/resources/error/error-classes.json

Lines changed: 0 additions & 6 deletions
@@ -23,12 +23,6 @@
     ],
     "sqlState" : "22005"
   },
-  "CANNOT_INFER_DATE" : {
-    "message" : [
-      "Cannot infer date in schema inference when LegacyTimeParserPolicy is \"LEGACY\". Legacy Date formatter does not support strict date format matching which is required to avoid inferring timestamps and other non-date entries to date."
-    ],
-    "sqlState" : "22007"
-  },
   "CANNOT_PARSE_DECIMAL" : {
     "message" : [
       "Cannot parse decimal"

docs/sql-data-sources-csv.md

Lines changed: 1 addition & 1 deletion
@@ -111,7 +111,7 @@ Data source options of CSV can be set via:
 <tr>
   <td><code>inferDate</code></td>
   <td>false</td>
-  <td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
+  <td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code> and cannot infer from legacy formats. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
   <td>read</td>
 </tr>
 <tr>
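
For readers of the option table above, a minimal read sketch showing inferDate in use; the file path and data are hypothetical, and the pattern shown for dateFormat is simply the default:

    // Hypothetical CSV at /tmp/events.csv:
    //   event_date,created_at
    //   2001-09-08,2014-10-27 18:30:00
    val df = spark.read
      .format("csv")
      .option("header", "true")
      .option("inferSchema", "true")   // required for inferDate to take effect
      .option("inferDate", "true")     // columns matching dateFormat become DateType
      .load("/tmp/events.csv")         // hypothetical path

    df.printSchema()
    // event_date is inferred as date, created_at as timestamp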

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala

Lines changed: 2 additions & 1 deletion
@@ -50,7 +50,8 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
     options.dateFormatInRead,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    isParsing = true)
+    isParsing = true,
+    isInferring = true)

   private val decimalParser = if (options.locale == Locale.US) {
     // Special handling the default locale for backward compatibility

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala

Lines changed: 4 additions & 10 deletions
@@ -150,17 +150,11 @@ class CSVOptions(

   /**
    * Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
-   * Disabled by default for backwards compatibility and performance. When enabled, date entries in
-   * timestamp columns will be cast to timestamp upon parsing. Not compatible with
-   * legacyTimeParserPolicy == LEGACY since legacy date parser will accept extra trailing characters
+   * Disabled by default for performance. When enabled, date entries in timestamp columns
+   * will be cast to timestamp upon parsing. Cannot infer legacy date formats since the legacy date
+   * parser will accept extra trailing characters
    */
-  val inferDate = {
-    val inferDateFlag = getBool("inferDate")
-    if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) {
-      throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
-    }
-    inferDateFlag
-  }
+  val inferDate = getBool("inferDate")

   // Provide a default value for dateFormatInRead when inferDate. This ensures that the
   // Iso8601DateFormatter (with strict date parsing) is used for date inference
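
To make the widening rule described in the scaladoc above concrete, a small sketch (assumes a SparkSession named spark; the inline data is illustrative):

    import spark.implicits._

    // One column mixing a date entry and a timestamp entry: with inferDate
    // enabled the column is inferred as TimestampType, and the date entry
    // is read as a midnight timestamp rather than failing.
    val ds = Seq("2001-09-08", "2014-10-27 18:30:00").toDS()
    val df = spark.read
      .option("inferSchema", "true")
      .option("inferDate", "true")
      .csv(ds)

    df.printSchema()  // _c0: timestamp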

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala

Lines changed: 2 additions & 1 deletion
@@ -112,7 +112,8 @@ class UnivocityParser(
     options.dateFormatInRead,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    isParsing = true)
+    isParsing = true,
+    isInferring = false)

   private val csvFilters = if (SQLConf.get.csvFilterPushDown) {
     new OrderedFilters(filters, requiredSchema)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 2 additions & 1 deletion
@@ -76,7 +76,8 @@ class JacksonParser(
     options.dateFormatInRead,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    isParsing = true)
+    isParsing = true,
+    isInferring = false)

   /**
    * Create a converter which converts the JSON documents held by the `JsonParser`

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala

Lines changed: 6 additions & 4 deletions
@@ -175,8 +175,9 @@ object DateFormatter {
       format: Option[String],
       locale: Locale = defaultLocale,
       legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-      isParsing: Boolean): DateFormatter = {
-    if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
+      isParsing: Boolean,
+      isInferring: Boolean = false): DateFormatter = {
+    if (SQLConf.get.legacyTimeParserPolicy == LEGACY && !isInferring) {
       getLegacyFormatter(format.getOrElse(defaultPattern), locale, legacyFormat)
     } else {
       val df = format
@@ -203,8 +204,9 @@ object DateFormatter {
       format: Option[String],
       locale: Locale,
       legacyFormat: LegacyDateFormat,
-      isParsing: Boolean): DateFormatter = {
-    getFormatter(format, locale, legacyFormat, isParsing)
+      isParsing: Boolean,
+      isInferring: Boolean): DateFormatter = {
+    getFormatter(format, locale, legacyFormat, isParsing, isInferring)
   }

   def apply(
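
The reason inference must bypass the legacy formatter is the trailing-character leniency noted in the CSVOptions scaladoc. A self-contained sketch of that contrast (simplified stand-ins, not Spark's actual formatter classes):

    import java.text.SimpleDateFormat
    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import scala.util.Try

    // Legacy-style parsing: SimpleDateFormat stops at the end of the pattern
    // and silently ignores trailing text, so a timestamp "matches" a date format.
    val legacy = new SimpleDateFormat("yyyy-MM-dd")
    Try(legacy.parse("2001-09-08 12:30:00")).isSuccess  // true

    // Strict ISO parsing (what inference uses, isInferring = true): the whole
    // input must match, so timestamps are correctly rejected as dates.
    val strict = DateTimeFormatter.ofPattern("yyyy-MM-dd")
    Try(LocalDate.parse("2001-09-08 12:30:00", strict)).isSuccess  // false

This mirrors the isInferring guard added above: under the LEGACY policy, row parsing still gets the lenient legacy formatter, while schema inference keeps the strict Iso8601DateFormatter.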

sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala

Lines changed: 1 addition & 7 deletions
@@ -34,7 +34,7 @@ import org.apache.hadoop.fs.permission.FsPermission
 import org.codehaus.commons.compiler.CompileException
 import org.codehaus.janino.InternalCompilerException

-import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkThrowable, SparkUnsupportedOperationException, SparkUpgradeException}
+import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkUnsupportedOperationException, SparkUpgradeException}
 import org.apache.spark.executor.CommitDeniedException
 import org.apache.spark.launcher.SparkLauncher
 import org.apache.spark.memory.SparkOutOfMemoryError
@@ -529,12 +529,6 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
     """.stripMargin)
   }

-  def inferDateWithLegacyTimeParserError(): Throwable with SparkThrowable = {
-    new SparkIllegalArgumentException(errorClass = "CANNOT_INFER_DATE",
-      messageParameters = Array()
-    )
-  }
-
   def streamedOperatorUnsupportedByDataSourceError(
       className: String, operator: String): Throwable = {
     new UnsupportedOperationException(

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 26 additions & 32 deletions
@@ -2803,41 +2803,35 @@ abstract class CSVSuite
       "inferSchema" -> "true",
       "inferDate" -> "true")

-    // Error should be thrown when attempting to inferDate with Legacy parser
+    // We should still be able to parse legacy formats when the schema is provided
     if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
-      val msg = intercept[IllegalArgumentException] {
-        spark.read
-          .format("csv")
-          .options(options1)
-          .load(testFile(dateInferSchemaFile))
-      }.getMessage
-      assert(msg.contains("CANNOT_INFER_DATE"))
-    } else {
-      // 1. Specify date format and timestamp format
-      // 2. Date inference should work with default date format when dateFormat is not provided
-      Seq(options1, options2).foreach {options =>
-        val results = spark.read
-          .format("csv")
-          .options(options)
-          .load(testFile(dateInferSchemaFile))
-
-        val expectedSchema = StructType(List(StructField("date", DateType),
-          StructField("timestamp-date", TimestampType),
-          StructField("date-timestamp", TimestampType)))
-        assert(results.schema == expectedSchema)
+      val ds = Seq("1500-02-29").toDS()
+      val csv = spark.read.option("header", false).schema("d date").csv(ds)
+      checkAnswer(csv, Row(Date.valueOf("1500-03-01")))
+    }
+    // 1. Specify date format and timestamp format
+    // 2. Date inference should work with default date format when dateFormat is not provided
+    Seq(options1, options2).foreach {options =>
+      val results = spark.read
+        .format("csv")
+        .options(options)
+        .load(testFile(dateInferSchemaFile))

-        val expected =
-          Seq(
-            Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
-              Timestamp.valueOf("1765-03-28 00:00:0.0")),
-            Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
-              Timestamp.valueOf("1423-11-12 23:41:0.0")),
-            Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
-              Timestamp.valueOf("2016-01-28 20:00:00.0"))
-          )
-        assert(results.collect().toSeq.map(_.toSeq) == expected)
-      }
+      val expectedSchema = StructType(List(StructField("date", DateType),
+        StructField("timestamp-date", TimestampType),
+        StructField("date-timestamp", TimestampType)))
+      assert(results.schema == expectedSchema)

+      val expected =
+        Seq(
+          Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
+            Timestamp.valueOf("1765-03-28 00:00:0.0")),
+          Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
+            Timestamp.valueOf("1423-11-12 23:41:0.0")),
+          Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
+            Timestamp.valueOf("2016-01-28 20:00:00.0"))
+        )
+      assert(results.collect().toSeq.map(_.toSeq) == expected)
     }
   }
 }
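
The restored behavior covered by the updated test can also be checked interactively; a minimal sketch (assumes a SparkSession named spark):

    // With the legacy time parser policy active, reading dates with an
    // explicitly provided schema works again; only schema inference stays strict.
    spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

    import spark.implicits._
    val ds = Seq("2001-09-08").toDS()
    spark.read.schema("d date").csv(ds).show()
    // +----------+
    // |         d|
    // +----------+
    // |2001-09-08|
    // +----------+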
