Updated documentation and fixed tests
Jonathancui123 committed Jul 12, 2022
commit 638064bb847396c749b0cf81df33c2ce03d3af46
6 changes: 6 additions & 0 deletions docs/sql-data-sources-csv.md
@@ -108,6 +108,12 @@ Data source options of CSV can be set via:
<td>Infers the input schema automatically from data. It requires one extra pass over the data. CSV built-in functions ignore this option.</td>
<td>read</td>
</tr>
<tr>
<td><code>inferDate</code></td>
<td>false</td>
<td>Whether to infer columns whose values satisfy the <code>dateFormat</code> option as <code>Date</code>. Requires <code>inferSchema</code> to be <code>true</code>. Legacy date formats in <code>Timestamp</code> columns cannot be parsed with this option.</td>
<td>read</td>
</tr>
<tr>
<td><code>enforceSchema</code></td>
<td>true</td>
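As a rough standalone illustration of the documented semantics (not Spark's actual code path; the helper name is invented): a field should only infer as `Date` when the whole value satisfies `dateFormat`, so a timestamp-like value must be rejected.

```scala
import java.time.LocalDate
import java.time.format.{DateTimeFormatter, DateTimeParseException}

// Hypothetical helper mirroring the documented behavior: a field is a
// candidate for Date inference only if it parses completely against the
// dateFormat pattern ("yyyy-MM-dd" is Spark's documented default).
val dateFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd")

def matchesDateFormat(field: String): Boolean =
  try { LocalDate.parse(field, dateFormat); true }
  catch { case _: DateTimeParseException => false }
```

`LocalDate.parse` requires the entire input to be consumed, so `"2001-09-08 12:00:00"` fails while `"2001-09-08"` succeeds.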
@@ -141,8 +141,18 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
case _: DecimalType => tryParseDecimal(field)
case DoubleType => tryParseDouble(field)
case DateType => tryParseDateTime(field)
A Member commented:
It seems changing the method tryParseDouble should be enough

@Jonathancui123 (Contributor, Author) replied on Jun 23, 2022:
I think this change is necessary: consider a column with a TimestampType entry followed by a DateType entry. We would expect this column to be inferred as a TimestampType column.
`typeSoFar` will be Timestamp when `inferField` is called on the second entry, which is DateType, so `inferField` needs logic to try parsing a Date even when `typeSoFar` is Timestamp.
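The widening this comment describes can be sketched standalone with toy types (`DateT`/`TsT` stand in for Spark's `DateType`/`TimestampType`; this is an illustration, not the PR's actual compatibility logic):

```scala
// Toy model: a Date entry seen while typeSoFar is Timestamp widens to
// Timestamp (a date is representable as a midnight timestamp); mismatched
// types fall back to String.
sealed trait T
case object DateT extends T
case object TsT extends T
case object StrT extends T

def merge(typeSoFar: T, entry: T): T = (typeSoFar, entry) match {
  case (a, b) if a == b            => a
  case (TsT, DateT) | (DateT, TsT) => TsT
  case _                           => StrT
}

// A Timestamp entry followed by a Date entry infers Timestamp overall.
val inferred = Seq[T](TsT, DateT).reduce(merge)
```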

case TimestampNTZType => tryParseTimestampNTZ(field)
case TimestampType => tryParseTimestamp(field)
case TimestampNTZType =>
if (options.inferDate) {
tryParseDateTime(field)
} else {
tryParseTimestampNTZ(field)
}
case TimestampType =>
if (options.inferDate) {
tryParseDateTime(field)
} else {
tryParseTimestamp(field)
}
case BooleanType => tryParseBoolean(field)
case StringType => StringType
case other: DataType =>
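The new dispatch above can be approximated standalone: with `inferDate` set, a field is tried as a date first and then as a timestamp (toy formatter patterns; Spark's real implementation goes through its own parsers and options):

```scala
import java.time.{LocalDate, LocalDateTime}
import java.time.format.DateTimeFormatter
import scala.util.Try

sealed trait Inferred
case object DateT extends Inferred
case object TimestampT extends Inferred
case object StringT extends Inferred

val dateFmt = DateTimeFormatter.ofPattern("yyyy-MM-dd")
val tsFmt   = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")

// Approximation of tryParseDateTime: prefer Date, fall back to Timestamp,
// then String, matching the dispatch in the diff when inferDate is true.
def tryParseDateTime(field: String): Inferred =
  if (Try(LocalDate.parse(field, dateFmt)).isSuccess) DateT
  else if (Try(LocalDateTime.parse(field, tsFmt)).isSuccess) TimestampT
  else StringT
```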
@@ -19,12 +19,12 @@ package org.apache.spark.sql.catalyst.csv

import java.math.BigDecimal
import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.time.{ZoneId, ZoneOffset}
import java.time.{ZoneOffset}
import java.util.{Locale, TimeZone}

import org.apache.commons.lang3.time.FastDateFormat
import org.apache.spark.SparkFunSuite

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.catalyst.util.DateTimeConstants._
@@ -367,11 +367,15 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
"timestampNTZFormat" -> "dd-MM-yyyy HH:mm", "dateFormat" -> "dd_MM_yyyy"),
A Member commented:
One test we might need would be:

`"timestampFormat" -> "dd/MM/yyyy HH:mm"` and `"dateFormat" -> "dd/MM/yyyy"`, to make sure timestamps are not parsed as date types without conflicting.

@bersprockets (Contributor) replied on Jun 16, 2022:

> to make sure timestamps are not parsed as date types without conflicting.

That's actually what happens:

Before this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: timestamp]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: timestamp (nullable = true)

scala> 

After this PR:

scala> val csvInput = Seq("0,2012-01-01 12:00:00", "1,2021-07-01 15:00:00").toDS()
csvInput: org.apache.spark.sql.Dataset[String] = [value: string]

scala> val df = spark.read.option("inferSchema", "true").csv(csvInput)
df: org.apache.spark.sql.DataFrame = [_c0: int, _c1: date]

scala> df.printSchema
root
 |-- _c0: integer (nullable = true)
 |-- _c1: date (nullable = true)

scala>

It looks like some tests fail too, like CSVInferSchemaSuite and CSVv1Suite, possibly others (I ran these two suites on my laptop; for some reason, the GitHub Actions didn't run tests for this PR. Maybe @Jonathancui123 needs to turn them on in his fork?).

> We should probably 1. add either SQL configuration or an option e.g., infersDate

I think you would need something like that: when set, the date formatter could use the slower, more strict method of parsing (so "2012-01-01 12:00:00" wouldn't parse as a date).

Edit: To do strict parsing, one might need to use ParsePosition and check that the whole date/time value string was consumed. Even after setting `lenient = false`, `SimpleDateFormat.parse` didn't complain about extra characters that weren't consumed.
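The observation in that edit can be reproduced with a small standalone check: `SimpleDateFormat.parse(String)` stops at the first character it cannot parse and still succeeds, while tracking a `ParsePosition` reveals whether the whole input was consumed.

```scala
import java.text.{ParsePosition, SimpleDateFormat}

val fmt = new SimpleDateFormat("yyyy-MM-dd")
fmt.setLenient(false)

// parse(String) succeeds on "2012-01-01 12:00:00": the trailing time is
// silently ignored, which is why a timestamp could infer as a date.
val loose = fmt.parse("2012-01-01 12:00:00") // no exception thrown

// Strict variant: require the parse to consume the entire input.
def parsesCompletely(s: String): Boolean = {
  val pos = new ParsePosition(0)
  val d = fmt.parse(s, pos)
  d != null && pos.getErrorIndex == -1 && pos.getIndex == s.length
}
```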

@Jonathancui123 (Contributor, Author) replied:
I addressed the inference mistakes in the following code snippet and comment.

false, "UTC")
val dateString = "08_09_2001"
val expected = date(2001, 9, 8, 0, 0, 0, 0, ZoneOffset.UTC)
val expected = dataType match {
case TimestampType | TimestampNTZType => date(2001, 9, 8, 0, 0, 0, 0, ZoneOffset.UTC)
case DateType => days(2001, 9, 8)
}
val parser = new UnivocityParser(new StructType(), timestampsOptions)
assert(parser.makeConverter("d", dataType).apply(dateString) == expected)
}
checkDate(TimestampType)
checkDate(TimestampNTZType)
checkDate(DateType)
}
}
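For context on the expected values in this test: Spark's test helpers `date(...)` and `days(...)` produce an epoch-based timestamp and an epoch-day count respectively. A minimal standalone stand-in for `days` (assuming that semantics; not Spark's actual helper) is:

```scala
import java.time.LocalDate

// Hypothetical stand-in for the days(...) test helper: the number of
// calendar days since the Unix epoch (1970-01-01) for the given date.
def days(year: Int, month: Int, day: Int): Int =
  LocalDate.of(year, month, day).toEpochDay.toInt
```

This is why the expected value differs by `dataType`: a `DateType` result is compared against an epoch-day count, while the timestamp types are compared against a full instant.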