apache · wangyum · Aug 5, 2017 · Sep 10, 2017 · Sep 10, 2017 · Sep 10, 2017
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
@@ -1460,6 +1460,13 @@ that these options will be deprecated in future release as more optimizations ar
       Configures the number of partitions to use when shuffling data for joins or aggregations.
     </td>
   </tr>
+  <tr>
+    <td><code>spark.sql.typeCoercion.mode</code></td>
+    <td><code>default</code></td>
+    <td>
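+        The type coercion mode used when comparing values of different types. Available options are <code>default</code> and <code>hive</code>; <code>hive</code> follows Hive's implicit type conversion rules.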
+    </td>
+  </tr>
 </table>
 
 # Distributed SQL Engine

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala
@@ -115,21 +115,46 @@ object TypeCoercion {
    * is a String and the other is not. It also handles when one op is a Date and the
    * other is a Timestamp by making the target type to be String.
    */
-  val findCommonTypeForBinaryComparison: (DataType, DataType) => Option[DataType] = {
-    // We should cast all relative timestamp/date/string comparison into string comparisons
-    // This behaves as a user would expect because timestamp strings sort lexicographically.
-    // i.e. TimeStamp(2013-01-01 00:00 ...) < "2014" = true
-    case (StringType, DateType) => Some(StringType)
-    case (DateType, StringType) => Some(StringType)
-    case (StringType, TimestampType) => Some(StringType)
-    case (TimestampType, StringType) => Some(StringType)
-    case (TimestampType, DateType) => Some(StringType)
-    case (DateType, TimestampType) => Some(StringType)
-    case (StringType, NullType) => Some(StringType)
-    case (NullType, StringType) => Some(StringType)
-    case (l: StringType, r: AtomicType) if r != StringType => Some(r)
-    case (l: AtomicType, r: StringType) if (l != StringType) => Some(l)
-    case (l, r) => None
+  private def findCommonTypeForBinaryComparison(
+      plan: LogicalPlan,
+      l: DataType,
+      r: DataType): Option[DataType] = {
+    // The common type depends on plan.conf's type coercion mode: the hive mode follows
+    // Hive's implicit conversions, otherwise the original string-based rules are kept.
+    if (plan.conf.isHiveTypeCoercionMode) {
+      // Follow hive's binary comparison action:
+      // https://github.com/apache/hive/blob/rel/storage-release-2.4.0/ql/src/java/
+      // org/apache/hadoop/hive/ql/exec/FunctionRegistry.java#L781
+      (l, r) match {
+        // A string is promoted to the date/timestamp it is compared with, and a date is
+        // promoted to a timestamp.
+        case (StringType, DateType) | (DateType, StringType) => Some(DateType)
+        case (StringType, TimestampType) | (TimestampType, StringType) => Some(TimestampType)
+        case (TimestampType, DateType) | (DateType, TimestampType) => Some(TimestampType)
+        case (StringType, NullType) | (NullType, StringType) => Some(StringType)
+        // Strings and timestamps are compared with any numeric type as doubles.
+        case (StringType | TimestampType, _: NumericType) => Some(DoubleType)
+        case (_: NumericType, StringType | TimestampType) => Some(DoubleType)
+        // A string compared with any remaining atomic type adopts that type.
+        case (_: StringType, other: AtomicType) if other != StringType => Some(other)
+        case (other: AtomicType, _: StringType) if other != StringType => Some(other)
+        case _ => None
+      }
+    } else {
+      (l, r) match {
+        // We should cast all relative timestamp/date/string comparison into string comparisons
+        // This behaves as a user would expect because timestamp strings sort lexicographically.
+        // i.e. TimeStamp(2013-01-01 00:00 ...) < "2014" = true
+        case (StringType, DateType) | (DateType, StringType) => Some(StringType)
+        case (StringType, TimestampType) | (TimestampType, StringType) => Some(StringType)
+        case (TimestampType, DateType) | (DateType, TimestampType) => Some(StringType)
+        case (StringType, NullType) | (NullType, StringType) => Some(StringType)
+        // A string compared with any remaining atomic type adopts that type.
+        case (_: StringType, other: AtomicType) if other != StringType => Some(other)
+        case (other: AtomicType, _: StringType) if other != StringType => Some(other)
+        case _ => None
+      }
+    }
   }
 
   /**
@@ -352,10 +377,9 @@ object TypeCoercion {
         p.makeCopy(Array(Cast(left, TimestampType), right))
       case p @ Equality(left @ TimestampType(), right @ StringType()) =>
         p.makeCopy(Array(left, Cast(right, TimestampType)))
-
       case p @ BinaryComparison(left, right)
-        if findCommonTypeForBinaryComparison(left.dataType, right.dataType).isDefined =>
-        val commonType = findCommonTypeForBinaryComparison(left.dataType, right.dataType).get
+        if findCommonTypeForBinaryComparison(plan, left.dataType, right.dataType).isDefined =>
+        val commonType = findCommonTypeForBinaryComparison(plan, left.dataType, right.dataType).get
         p.makeCopy(Array(castExpr(left, commonType), castExpr(right, commonType)))
 
       case Abs(e @ StringType()) => Abs(Cast(e, DoubleType))
@@ -411,8 +435,9 @@ object TypeCoercion {
         val rhs = sub.output
 
         val commonTypes = lhs.zip(rhs).flatMap { case (l, r) =>
-          findCommonTypeForBinaryComparison(l.dataType, r.dataType)
-            .orElse(findTightestCommonType(l.dataType, r.dataType))
+          findCommonTypeForBinaryComparison(plan, l.dataType, r.dataType)
+              .orElse(findTightestCommonType(l.dataType, r.dataType))
+
         }
 
         // The number of columns/expressions must match between LHS and RHS of an

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -925,6 +925,14 @@ object SQLConf {
       .intConf
       .createWithDefault(10000)
 
+  val typeCoercionMode = buildConf("spark.sql.typeCoercion.mode")
+    .doc("The type coercion mode to use when comparing values of different types. " +
+      "Valid values are 'default' and 'hive'; 'hive' follows Hive's implicit conversion rules.")
+    .stringConf
+    .transform(_.toLowerCase(Locale.ROOT))
+    .checkValues(Set("default", "hive"))
+    .createWithDefault("default")
+
   object Deprecated {
     val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks"
   }
@@ -1203,6 +1211,8 @@ class SQLConf extends Serializable with Logging {
 
   def arrowMaxRecordsPerBatch: Int = getConf(ARROW_EXECUTION_MAX_RECORDS_PER_BATCH)
 
+  def isHiveTypeCoercionMode: Boolean = getConf(SQLConf.typeCoercionMode) == "hive"
+
   /** ********************** SQLConf functionality methods ************ */
 
   /** Set Spark SQL configuration properties. */

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql
 import java.io.File
 import java.math.MathContext
 import java.net.{MalformedURLException, URL}
-import java.sql.Timestamp
+import java.sql.{Date, Timestamp}
 import java.util.concurrent.atomic.AtomicBoolean
 
 import org.apache.spark.{AccumulatorSuite, SparkException}
@@ -2677,4 +2677,142 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       checkAnswer(df, Row(1, 1, 1))
     }
   }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: StringType vs NumericType") {
+    withTempView("v") {
+      // Numeric strings that overflow a Long, overflow an Int, and fit in an Int, respectively.
+      val beyondLong = Long.MaxValue.toString + "1"
+      val beyondInt = Int.MaxValue.toString + "1"
+      val small = "10"
+      Seq(beyondLong, beyondInt, small).toDF("c1").createOrReplaceTempView("v")
+      withSQLConf(SQLConf.typeCoercionMode.key -> "hive") {
+        val everyRow = Seq(Row(beyondLong), Row(beyondInt), Row(small))
+        checkAnswer(sql("SELECT c1 from v where c1 > 0"), everyRow)
+        checkAnswer(sql("SELECT c1 from v where c1 > 0L"), everyRow)
+      }
+      // The default mode is expected to drop the values that overflow the literal's type.
+      withSQLConf(SQLConf.typeCoercionMode.key -> "default") {
+        checkAnswer(sql("SELECT c1 from v where c1 > 0"), Seq(Row(small)))
+        checkAnswer(sql("SELECT c1 from v where c1 > 0L"), Seq(Row(beyondInt), Row(small)))
+      }
+    }
+  }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: DoubleType vs IntegerType") {
+    withTempView("v") {
+      // c1 is a string column; it is compared with integral and fractional literals.
+      Seq(("0", 1), ("-0.4", 2), ("0.6", 3)).toDF("c1", "c2").createOrReplaceTempView("v")
+      withSQLConf(SQLConf.typeCoercionMode.key -> "hive") {
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0"), Row("0") :: Nil)
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0L"), Row("0") :: Nil)
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0.0"), Row("0") :: Nil)
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = -0.4"), Row("-0.4") :: Nil)
+        checkAnswer(sql("SELECT count(*) FROM v WHERE c1 > 0"), Seq(Row(1)))
+      }
+      withSQLConf(SQLConf.typeCoercionMode.key -> "default") {
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0"), Seq(Row("0"), Row("-0.4"), Row("0.6")))
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0L"), Seq(Row("0"), Row("-0.4"), Row("0.6")))
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = 0.0"), Row("0") :: Nil)
+        checkAnswer(sql("SELECT c1 FROM v WHERE c1 = -0.4"), Row("-0.4") :: Nil)
+        checkAnswer(sql("SELECT count(*) FROM v WHERE c1 > 0"), Seq(Row(0)))
+      }
+    }
+  }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: StringType vs DateType") {
+    withTempView("v") {
+      val laterDate = Date.valueOf("2017-09-22")
+      val earlierDate = Date.valueOf("2017-09-09")
+      Seq(laterDate, earlierDate).toDF("c1").createTempView("v")
+      val vsString = "select c1 from v where c1 > '2017-8-1'"
+      val vsDate = "select c1 from v where c1 > cast('2017-8-1' as date)"
+      // Only the string-literal comparison is expected to differ between the two modes.
+      withSQLConf(SQLConf.typeCoercionMode.key -> "hive") {
+        checkAnswer(sql(vsString), Seq(Row(laterDate), Row(earlierDate)))
+        checkAnswer(sql(vsDate), Seq(Row(laterDate), Row(earlierDate)))
+      }
+      withSQLConf(SQLConf.typeCoercionMode.key -> "default") {
+        checkAnswer(sql(vsString), Nil)
+        checkAnswer(sql(vsDate), Seq(Row(laterDate), Row(earlierDate)))
+      }
+    }
+  }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: StringType vs TimestampType") {
+    withTempView("v") {
+      val july = Timestamp.valueOf("2017-07-21 23:42:12.123")
+      val august = Timestamp.valueOf("2017-08-21 23:42:12.123")
+      Seq(july, august).toDF("c1").createTempView("v")
+      val vsString = "select c1 from v where c1 > '2017-8-1'"
+      val vsTimestamp = "select c1 from v where c1 > cast('2017-8-1' as timestamp)"
+      // Only the string-literal comparison is expected to differ between the two modes.
+      withSQLConf(SQLConf.typeCoercionMode.key -> "hive") {
+        checkAnswer(sql(vsString), Seq(Row(august)))
+        checkAnswer(sql(vsTimestamp), Seq(Row(august)))
+      }
+      withSQLConf(SQLConf.typeCoercionMode.key -> "default") {
+        checkAnswer(sql(vsString), Nil)
+        checkAnswer(sql(vsTimestamp), Seq(Row(august)))
+      }
+    }
+  }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: TimestampType vs DateType") {
+    withTempView("v") {
+      val july = Timestamp.valueOf("2017-07-21 23:42:12.123")
+      val august = Timestamp.valueOf("2017-08-21 23:42:12.123")
+      Seq(july, august).toDF("c1").createTempView("v")
+      // Both modes are expected to return the same rows for these comparisons. The mode
+      // names are written capitalized, presumably to exercise the case-insensitive
+      // handling of the conf value.
+      Seq("Hive", "Default").foreach { mode =>
+        withSQLConf(SQLConf.typeCoercionMode.key -> mode) {
+          checkAnswer(sql("select c1 from v where c1 > cast('2017-8-1' as date)"),
+            Seq(Row(august)))
+          checkAnswer(sql("select c1 from v where c1 > cast('2017-8-1' as timestamp)"),
+            Seq(Row(august)))
+        }
+      }
+    }
+  }
+
+  test("SPARK-21646: CommonTypeForBinaryComparison: TimestampType vs NumericType") {
+    withTempView("v") {
+      val july = Timestamp.valueOf("2017-07-21 23:42:12.123")
+      val august = Timestamp.valueOf("2017-08-21 23:42:12.123")
+      Seq(july, august).toDF("c1").createTempView("v")
+      val vsDouble = "select * from v where c1 > cast(cast('2017-08-01' as timestamp) as double)"
+
+      // Asserts that analyzing `query` fails with a data type mismatch.
+      def assertTypeMismatch(query: String): Unit = {
+        val e = intercept[AnalysisException] {
+          sql(query)
+        }
+        assert(e.getMessage.contains("data type mismatch"))
+      }
+
+      // Expectations for the default mode: timestamp vs numeric comparisons are rejected.
+      def checkDefaultMode(): Unit = {
+        assertTypeMismatch("select * from v where c1 > 1")
+        checkAnswer(sql("select c1 from v where c1 > '2017-8-1'"), Nil)
+        checkAnswer(sql("select c1 from v where c1 > '2017-08-01'"), Row(august) :: Nil)
+        assertTypeMismatch(vsDouble)
+      }
+
+      // Expectations for the hive mode: timestamp vs numeric comparisons are accepted.
+      withSQLConf(SQLConf.typeCoercionMode.key -> "hive") {
+        checkAnswer(sql("select c1 from v where c1 > 1"), Row(july) :: Row(august) :: Nil)
+        checkAnswer(sql("select c1 from v where c1 > '2017-8-1'"), Row(august) :: Nil)
+        checkAnswer(sql("select c1 from v where c1 > '2017-08-01'"), Row(august) :: Nil)
+        checkAnswer(sql(vsDouble), Row(august) :: Nil)
+      }
+
+      withSQLConf(SQLConf.typeCoercionMode.key -> "default") {
+        checkDefaultMode()
+      }
+
+      // Repeated outside withSQLConf, i.e. with the session's own (presumably default) mode.
+      checkDefaultMode()
+    }
+  }
 }