Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
parse dates from timestamp column
  • Loading branch information
Jonathancui123 committed Jul 12, 2022
commit f16e5e169b99fdeffdc0925a4cbed20b672628ce
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
case _: DecimalType => tryParseDecimal(field)
case DoubleType => tryParseDouble(field)
// Temporary NOTE: DateTimeType is private to [sql] package
case DateType | TimestampNTZType | TimestampType => tryParseDate(field)
case DateType | TimestampNTZType | TimestampType => tryParseDateTime(field)
case BooleanType => tryParseBoolean(field)
case StringType => StringType
case other: DataType =>
Expand Down Expand Up @@ -177,11 +177,11 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
if ((allCatch opt field.toDouble).isDefined || isInfOrNan(field)) {
DoubleType
} else {
tryParseDate(field)
tryParseDateTime(field)
}
}

private def tryParseDate(field: String): DataType = {
private def tryParseDateTime(field: String): DataType = {
if ((allCatch opt dateFormatter.parse(field)).isDefined) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very expensive way

DateType
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,18 @@ class UnivocityParser(

private val decimalParser = ExprUtils.getDecimalParser(options.locale)

private def convertToDate(datum: String): Int = {
try {
dateFormatter.parse(datum)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
DateTimeUtils.stringToDate(str).getOrElse(throw e)
}
}

/**
* Create a converter which converts the string value to a value according to a desired type.
* Currently, we do not support complex types (`ArrayType`, `MapType`, `StructType`).
Expand Down Expand Up @@ -206,28 +218,23 @@ class UnivocityParser(
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(throw e)
DateTimeUtils.stringToTimestamp(str, options.zoneId).getOrElse(convertToDate(datum))
}
}

case _: TimestampNTZType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
timestampNTZFormatter.parseWithoutTimeZone(datum, false)
}

case _: DateType => (d: String) =>
nullSafeDatum(d, name, nullable, options) { datum =>
try {
dateFormatter.parse(datum)
timestampNTZFormatter.parseWithoutTimeZone(datum, false)
} catch {
case NonFatal(e) =>
// If fails to parse, then tries the way used in 2.0 and 1.x for backwards
// compatibility.
val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
DateTimeUtils.stringToDate(str).getOrElse(throw e)
convertToDate(datum)
}
}

case _: DateType => (d: String) =>
nullSafeDatum(d, name, nullable, options)(convertToDate)

case _: StringType => (d: String) =>
nullSafeDatum(d, name, nullable, options)(UTF8String.fromString)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,4 +358,19 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
Map("timestampFormat" -> "invalid", "dateFormat" -> "invalid"), false, "UTC")
check(new UnivocityParser(StructType(Seq.empty), optionsWithPattern))
}

test("dates should be parsed correctly in a timestamp column") {
def checkDate(dataType: DataType): Unit = {
val timestampsOptions =
new CSVOptions(Map("timestampFormat" -> "dd/MM/yyyy HH:mm",
"timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One test we might need would be:

timestampFormat" -> "dd/MM/yyyy HH:mm and dateFormat -> dd/MM/yyyy to make sure timestamps are not parsed as date types without conflicting.

Copy link
Contributor

@bersprockets bersprockets Jun 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to make sure timestamps are not parsed as date types without conflicting.

That's actually what happens:

Before this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: timestamp]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: timestamp (nullable = true)

scala> 

After this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: date]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: date (nullable = true)

scala>

It looks like some tests fail too, like CSVInferSchemaSuite, and CSVv1Suite, possibly others (I ran these two suites on my laptop. For some reason, the github actions didn't run tests for this PR. Maybe @Jonathancui123 needs to turn them on in his fork?).

We should probably 1. add either SQL configuration or an option e.g., infersDate

I think you would need something like that: when set, the date formatter could use the slower, more strict method of parsing (so "2012-01-01 12:00:00" wouldn't parse as a date).

Edit: To do a strict parsing, one might need to use ParsePosition and check that the whole date/time value string was consumed. Even after setting lenient=false SimpleDateFormat.parse didn't complain about extra characters that weren't consumed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I addressed inference mistakes in the following code snippet and comment

false, "UTC")
val dateString = "08_09_2001"
val date = days(2001, 9, 8)
val parser = new UnivocityParser(new StructType(), timestampsOptions)
assert(parser.makeConverter("d", dataType).apply(dateString) == date)
}
checkDate(TimestampType)
checkDate(TimestampNTZType)
}
}