Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix case where dateFormat is not specified
  • Loading branch information
Jonathancui123 committed Jul 12, 2022
commit 41fa8eb2ff25cce402a38be40b5f0a0d3b48c6a8
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,28 @@ class CSVOptions(
// Locale built from the "locale" option, which is expected to be a language tag in
// IETF BCP 47 format; falls back to Locale.US when the option is not set.
val locale: Locale = parameters.get("locale").fold(Locale.US)(Locale.forLanguageTag)

val dateFormatInRead: Option[String] = parameters.get("dateFormat")
/**
 * Whether schema inference should treat columns whose entries are all valid dates as date type
 * (such columns are otherwise inferred as timestamp type). Off by default for backwards
 * compatibility and performance. When on, date entries found in timestamp columns are cast to
 * timestamp upon parsing.
 *
 * The option cannot be combined with legacyTimeParserPolicy == LEGACY, because the legacy date
 * parser accepts extra trailing characters; requesting both raises the error built by
 * `QueryExecutionErrors.inferDateWithLegacyTimeParserError()`.
 */
val inferDate: Boolean = {
  val usesLegacyParser = SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY
  val requested = getBool("inferDate")
  // Reject the combination eagerly, at option-construction time.
  if (usesLegacyParser && requested) {
    throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
  }
  requested
}

// Date pattern used on the read path. Without date inference this is simply the optional
// user-supplied "dateFormat". With inferDate enabled, a missing "dateFormat" falls back to
// DateFormatter.defaultPattern so that the Iso8601DateFormatter (with strict date parsing)
// is used for date inference.
val dateFormatInRead: Option[String] = {
  val userPattern = parameters.get("dateFormat")
  if (!inferDate) {
    userPattern
  } else {
    Option(userPattern.getOrElse(DateFormatter.defaultPattern))
  }
}
// Date pattern used on the write path: the user-supplied "dateFormat" when present,
// otherwise DateFormatter.defaultPattern.
val dateFormatInWrite: String =
  parameters.get("dateFormat").getOrElse(DateFormatter.defaultPattern)

val timestampFormatInRead: Option[String] =
Expand Down Expand Up @@ -195,20 +216,6 @@ class CSVOptions(
*/
val enforceSchema = getBool("enforceSchema", default = true)

/**
* Infer columns with all valid date entries as date type (otherwise inferred as timestamp type).
* Disabled by default for backwards compatibility and performance. When enabled, date entries in
* timestamp columns will be cast to timestamp upon parsing. Not compatible with
* legacyTimeParserPolicy == LEGACY
*/
val inferDate = {
val inferDateFlag = getBool("inferDate")
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY && inferDateFlag) {
throw QueryExecutionErrors.inferDateWithLegacyTimeParserError()
}
inferDateFlag
}

/**
* String representation of an empty value in read and in write.
*/
Expand Down
6 changes: 3 additions & 3 deletions sql/core/src/test/resources/test-data/date-infer-schema.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
date,timestamp-date,date-timestamp
2001-09-08,2014-10-27T18:30,1765-03-28
1941-01-02,2000-09-14T01:01,1423-11-12T23:41
0293-11-07,1995-06-25,2016-01-28T20:00
2001-09-08,2014-10-27T18:30:00,1765-03-28
1941-01-02,2000-09-14T01:01:00,1423-11-12T23:41:00
0293-11-07,1995-06-25,2016-01-28T20:00:00
Original file line number Diff line number Diff line change
Expand Up @@ -2792,40 +2792,52 @@ abstract class CSVSuite
}

test("SPARK-39469: Infer schema for date type") {
val options = Map(
val options1 = Map(
"header" -> "true",
"inferSchema" -> "true",
"timestampFormat" -> "yyyy-MM-dd'T'HH:mm",
"timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss",
"dateFormat" -> "yyyy-MM-dd",
"inferDate" -> "true")
val options2 = Map(
"header" -> "true",
"inferSchema" -> "true",
"inferDate" -> "true")

// Error should be thrown when attempting to inferDate with Legacy parser
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
val msg = intercept[IllegalArgumentException] {
spark.read
.format("csv")
.options(options)
.options(options1)
.load(testFile(dateInferSchemaFile))
}.getMessage
assert(msg.contains("CANNOT_INFER_DATE"))
} else {
val results = spark.read
.format("csv")
.options(options)
.load(testFile(dateInferSchemaFile))
// 1. Specify date format and timestamp format
// 2. Date inference should work with default date format when dateFormat is not provided
Seq(options1, options2).foreach {options =>
val results = spark.read
.format("csv")
.options(options)
.load(testFile(dateInferSchemaFile))

val expectedSchema = StructType(List(StructField("date", DateType),
StructField("timestamp-date", TimestampType), StructField("date-timestamp", TimestampType)))
assert(results.schema == expectedSchema)
val expectedSchema = StructType(List(StructField("date", DateType),
StructField("timestamp-date", TimestampType),
StructField("date-timestamp", TimestampType)))
assert(results.schema == expectedSchema)

val expected =
Seq(
Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
Timestamp.valueOf("1765-03-28 00:00:0.0")),
Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
Timestamp.valueOf("1423-11-12 23:41:0.0")),
Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
Timestamp.valueOf("2016-01-28 20:00:00.0"))
)
assert(results.collect().toSeq.map(_.toSeq) == expected)
}

val expected =
Seq(
Seq(Date.valueOf("2001-9-8"), Timestamp.valueOf("2014-10-27 18:30:0.0"),
Timestamp.valueOf("1765-03-28 00:00:0.0")),
Seq(Date.valueOf("1941-1-2"), Timestamp.valueOf("2000-09-14 01:01:0.0"),
Timestamp.valueOf("1423-11-12 23:41:0.0")),
Seq(Date.valueOf("0293-11-7"), Timestamp.valueOf("1995-06-25 00:00:00.0"),
Timestamp.valueOf("2016-01-28 20:00:00.0"))
)
assert(results.collect().toSeq.map(_.toSeq) == expected)
}
}
}
Expand Down