Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Revert "allow legacy parser with inferDate"
This reverts commit e1170d0.
  • Loading branch information
Jonathancui123 committed Jul 19, 2022
commit 1e8f9384163e7f90fed69cc890a2ff6ef6323dab
6 changes: 6 additions & 0 deletions core/src/main/resources/error/error-classes.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@
],
"sqlState" : "22005"
},
"CANNOT_INFER_DATE" : {
"message" : [
"Cannot infer date in schema inference when LegacyTimeParserPolicy is \"LEGACY\". Legacy Date formatter does not support strict date format matching which is required to avoid inferring timestamps and other non-date entries to date."
],
"sqlState" : "22007"
},
"CANNOT_PARSE_DECIMAL" : {
"message" : [
"Cannot parse decimal"
Expand Down
2 changes: 1 addition & 1 deletion docs/sql-data-sources-csv.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ Data source options of CSV can be set via:
<tr>
<td><code>inferDate</code></td>
<td>false</td>
<td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code> and cannot infer from legacy formats. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
<td>Whether or not to infer columns that satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. When <code>false</code>, columns with dates will be inferred as <code>String</code> (or as <code>Timestamp</code> if it fits the <code>timestampFormat</code>).</td>
<td>read</td>
</tr>
<tr>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
options.dateFormatInRead,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
isParsing = true,
isInferring = true)
isParsing = true)

private val decimalParser = if (options.locale == Locale.US) {
// Special handling the default locale for backward compatibility
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,17 @@ class CSVOptions(

/**
* Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
* Disabled by default for performance. When enabled, date entries in timestamp columns
* will be cast to timestamp upon parsing. Cannot infer legacy date formats since the legacy date
* parser will accept extra trailing characters
* Disabled by default for backwards compatibility and performance. When enabled, date entries in
* timestamp columns will be cast to timestamp upon parsing. Not compatible with
* legacyTimeParserPolicy == LEGACY, since the legacy date parser accepts extra trailing characters.
*/
val inferDate = getBool("inferDate")
val inferDate = {
  // Value of the `inferDate` option as read through getBool.
  val requested = getBool("inferDate")
  // Whether the active SQL configuration selects the LEGACY time parser policy.
  val legacyPolicyActive =
    SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY
  // Date inference cannot be combined with the legacy parser, so fail up front
  // instead of returning the flag.
  if (requested && legacyPolicyActive) {
    throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
  }
  requested
}

// Provide a default value for dateFormatInRead when inferDate. This ensures that the
// Iso8601DateFormatter (with strict date parsing) is used for date inference
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,7 @@ class UnivocityParser(
options.dateFormatInRead,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
isParsing = true,
isInferring = false)
isParsing = true)

private val csvFilters = if (SQLConf.get.csvFilterPushDown) {
new OrderedFilters(filters, requiredSchema)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ class JacksonParser(
options.dateFormatInRead,
options.locale,
legacyFormat = FAST_DATE_FORMAT,
isParsing = true,
isInferring = false)
isParsing = true)

/**
* Create a converter which converts the JSON documents held by the `JsonParser`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,8 @@ object DateFormatter {
format: Option[String],
locale: Locale = defaultLocale,
legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
isParsing: Boolean,
isInference: Boolean = false): DateFormatter = {
if (SQLConf.get.legacyTimeParserPolicy == LEGACY && !isInference) {
isParsing: Boolean): DateFormatter = {
if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
getLegacyFormatter(format.getOrElse(defaultPattern), locale, legacyFormat)
} else {
val df = format
Expand All @@ -204,9 +203,8 @@ object DateFormatter {
format: Option[String],
locale: Locale,
legacyFormat: LegacyDateFormat,
isParsing: Boolean,
isInferring: Boolean): DateFormatter = {
getFormatter(format, locale, legacyFormat, isParsing, isInferring)
isParsing: Boolean): DateFormatter = {
getFormatter(format, locale, legacyFormat, isParsing)
}

def apply(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import org.apache.hadoop.fs.permission.FsPermission
import org.codehaus.commons.compiler.CompileException
import org.codehaus.janino.InternalCompilerException

import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkUnsupportedOperationException, SparkUpgradeException}
import org.apache.spark.{Partition, SparkArithmeticException, SparkArrayIndexOutOfBoundsException, SparkClassNotFoundException, SparkConcurrentModificationException, SparkDateTimeException, SparkException, SparkFileAlreadyExistsException, SparkFileNotFoundException, SparkIllegalArgumentException, SparkIndexOutOfBoundsException, SparkNoSuchElementException, SparkNoSuchMethodException, SparkNumberFormatException, SparkRuntimeException, SparkSecurityException, SparkSQLException, SparkSQLFeatureNotSupportedException, SparkThrowable, SparkUnsupportedOperationException, SparkUpgradeException}
import org.apache.spark.executor.CommitDeniedException
import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.memory.SparkOutOfMemoryError
Expand Down Expand Up @@ -529,6 +529,12 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase {
""".stripMargin)
}

/**
 * Builds the error reported when date inference is requested while the LEGACY time parser
 * policy is in effect.
 *
 * @return a [[SparkIllegalArgumentException]] carrying the "CANNOT_INFER_DATE" error class
 *         and an empty list of message parameters
 */
def inferDateWithLegacyTimeParserError(): Throwable with SparkThrowable =
  new SparkIllegalArgumentException(
    errorClass = "CANNOT_INFER_DATE",
    messageParameters = Array())

def streamedOperatorUnsupportedByDataSourceError(
className: String, operator: String): Throwable = {
new UnsupportedOperationException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2803,35 +2803,41 @@ abstract class CSVSuite
"inferSchema" -> "true",
"inferDate" -> "true")

// We should still be able to parse legacy formats when the schema is provided
// Error should be thrown when attempting to inferDate with Legacy parser
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
val ds = Seq("1500-02-29").toDS()
val csv = spark.read.option("header", false).schema("d date").csv(ds)
checkAnswer(csv, Row(Date.valueOf("1500-03-01")))
}
// 1. Specify date format and timestamp format
// 2. Date inference should work with default date format when dateFormat is not provided
Seq(options1, options2).foreach {options =>
val results = spark.read
.format("csv")
.options(options)
.load(testFile(dateInferSchemaFile))
val msg = intercept[IllegalArgumentException] {
spark.read
.format("csv")
.options(options1)
.load(testFile(dateInferSchemaFile))
}.getMessage
assert(msg.contains("CANNOT_INFER_DATE"))
} else {
// 1. Specify date format and timestamp format
// 2. Date inference should work with default date format when dateFormat is not provided
Seq(options1, options2).foreach {options =>
val results = spark.read
.format("csv")
.options(options)
.load(testFile(dateInferSchemaFile))

val expectedSchema = StructType(List(StructField("date", DateType),
StructField("timestamp-date", TimestampType),
StructField("date-timestamp", TimestampType)))
assert(results.schema == expectedSchema)
val expectedSchema = StructType(List(StructField("date", DateType),
StructField("timestamp-date", TimestampType),
StructField("date-timestamp", TimestampType)))
assert(results.schema == expectedSchema)

val expected =
Seq(
Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
Timestamp.valueOf("1765-03-28 00:00:0.0")),
Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
Timestamp.valueOf("1423-11-12 23:41:0.0")),
Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
Timestamp.valueOf("2016-01-28 20:00:00.0"))
)
assert(results.collect().toSeq.map(_.toSeq) == expected)
}

val expected =
Seq(
Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
Timestamp.valueOf("1765-03-28 00:00:0.0")),
Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
Timestamp.valueOf("1423-11-12 23:41:0.0")),
Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
Timestamp.valueOf("2016-01-28 20:00:00.0"))
)
assert(results.collect().toSeq.map(_.toSeq) == expected)
}
}
}
Expand Down