[SPARK-22036][SQL] Decimal multiplication with high precision/scale often returns NULL
mgaido91 committed Dec 19, 2017
commit 3037d4aa6afc4d7630d86d29b8dd7d7d724cc990
@@ -93,41 +93,46 @@ object DecimalPrecision extends TypeCoercionRule {
case e: BinaryArithmetic if e.left.isInstanceOf[PromotePrecision] => e

case Add(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
val resultScale = max(s1, s2)
val dt = DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1,
resultScale)
CheckOverflow(Add(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt)

case Subtract(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
val dt = DecimalType.bounded(max(s1, s2) + max(p1 - s1, p2 - s2) + 1, max(s1, s2))
val resultScale = max(s1, s2)
val dt = DecimalType.adjustPrecisionScale(max(p1 - s1, p2 - s2) + resultScale + 1,
resultScale)
CheckOverflow(Subtract(promotePrecision(e1, dt), promotePrecision(e2, dt)), dt)

case Multiply(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
val resultType = DecimalType.bounded(p1 + p2 + 1, s1 + s2)
val resultType = DecimalType.adjustPrecisionScale(p1 + p2 + 1, s1 + s2)
val widerType = widerDecimalType(p1, s1, p2, s2)
CheckOverflow(Multiply(promotePrecision(e1, widerType), promotePrecision(e2, widerType)),
resultType)

case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2)
var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1))
val diff = (intDig + decDig) - DecimalType.MAX_SCALE
if (diff > 0) {
decDig -= diff / 2 + 1
intDig = DecimalType.MAX_SCALE - decDig
}
val resultType = DecimalType.bounded(intDig + decDig, decDig)
// From https://msdn.microsoft.com/en-us/library/ms190476.aspx
// Precision: p1 - s1 + s2 + max(6, s1 + p2 + 1)
// Scale: max(6, s1 + p2 + 1)
val intDig = p1 - s1 + s2
val scale = max(DecimalType.MINIMUM_ADJUSTED_SCALE, s1 + p2 + 1)
val prec = intDig + scale
val resultType = DecimalType.adjustPrecisionScale(prec, scale)
val widerType = widerDecimalType(p1, s1, p2, s2)
CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)),
resultType)
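For illustration, a standalone REPL sketch (plain Scala, not the Spark helpers) applying the division rule above together with the same adjustment arithmetic used by adjustPrecisionScale:

// decimal(38,18) / decimal(38,18), following the rule above:
val (p1, s1, p2, s2) = (38, 18, 38, 18)
val intDig = p1 - s1 + s2                                // 38
val scale = math.max(6, s1 + p2 + 1)                     // max(6, 57) = 57
val prec = intDig + scale                                // 95 > 38, so adjust:
val adjScale = math.max(38 - intDig, math.min(scale, 6)) // max(0, 6) = 6
println((38, adjScale))                                  // (38,6)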

case Remainder(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
val resultType = DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2),
max(s1, s2))
// resultType may have lower precision, so we cast them into wider type first.
val widerType = widerDecimalType(p1, s1, p2, s2)
CheckOverflow(Remainder(promotePrecision(e1, widerType), promotePrecision(e2, widerType)),
resultType)

case Pmod(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) =>
val resultType = DecimalType.bounded(min(p1 - s1, p2 - s2) + max(s1, s2), max(s1, s2))
val resultType = DecimalType.adjustPrecisionScale(min(p1 - s1, p2 - s2) + max(s1, s2),
max(s1, s2))
// resultType may have lower precision, so we cast them into wider type first.
val widerType = widerDecimalType(p1, s1, p2, s2)
CheckOverflow(Pmod(promotePrecision(e1, widerType), promotePrecision(e2, widerType)),
@@ -243,17 +248,43 @@ object DecimalPrecision extends TypeCoercionRule {
// Promote integers inside a binary expression with fixed-precision decimals to decimals,
// and fixed-precision decimals in an expression with floats / doubles to doubles
case b @ BinaryOperator(left, right) if left.dataType != right.dataType =>
(left.dataType, right.dataType) match {
case (t: IntegralType, DecimalType.Fixed(p, s)) =>
Contributor: nit: I feel it's more readable to just put the new cases for literals before these 4 cases.

Contributor (author): Unfortunately this is not really feasible, since we match on different things: here we match on left.dataType and right.dataType, while for literals we match on left and right.

Contributor: we can do

(left, right) match {
  case (l: Literal, r) => ...

  case (DecimalType.Expression(p, s), r @ IntegralType()) => ...
}

b.makeCopy(Array(Cast(left, DecimalType.forType(t)), right))
case (DecimalType.Fixed(p, s), t: IntegralType) =>
b.makeCopy(Array(left, Cast(right, DecimalType.forType(t))))
case (t, DecimalType.Fixed(p, s)) if isFloat(t) =>
b.makeCopy(Array(left, Cast(right, DoubleType)))
case (DecimalType.Fixed(p, s), t) if isFloat(t) =>
b.makeCopy(Array(Cast(left, DoubleType), right))
case _ =>
b
}
nondecimalLiteralAndDecimal(b).lift((left, right)).getOrElse(
nondecimalNonliteralAndDecimal(b).applyOrElse((left.dataType, right.dataType),
(_: (DataType, DataType)) => b))
}

/**
* Type coercion for BinaryOperator in which one side is a non-decimal literal numeric, and the
* other side is a decimal.
*/
private def nondecimalLiteralAndDecimal(
Member: Is this rule newly introduced?

mgaido91 (Contributor, author), Dec 21, 2017: Yes, it is. Without it we get failures in the Hive compatibility tests, because Hive uses the exact precision and scale needed by a literal, whereas before this change we used conservative values for each type. For instance, in select 123.12345 * 3, the literal 3 used to be interpreted as Decimal(10, 0), the type for all integers; after the change it becomes Decimal(1, 0), as in Hive. This avoids requiring more precision than is actually needed.
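A minimal REPL sketch of the idea in plain Scala; decimalTypeForLiteral is an illustrative stand-in for the DecimalType.forLiteral/fromBigDecimal helpers added in this patch:

// Type an integral literal by the digits it actually needs, as Hive does.
def decimalTypeForLiteral(v: Long): (Int, Int) = {
  val d = BigDecimal(v)
  (math.max(d.precision, d.scale), d.scale) // (precision, scale)
}

println(decimalTypeForLiteral(3L))         // (1,0) instead of (10,0), the IntegerType default
println(decimalTypeForLiteral(123456789L)) // (9,0)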

b: BinaryOperator): PartialFunction[(Expression, Expression), Expression] = {
// Promote literal integers inside a binary expression with fixed-precision decimals to
// decimals. The precision and scale are the ones needed by the integer value.
case (l: Literal, r) if r.dataType.isInstanceOf[DecimalType]
&& l.dataType.isInstanceOf[IntegralType] =>
b.makeCopy(Array(Cast(l, DecimalType.forLiteral(l)), r))
Contributor: What if we don't do this? Requiring more precision seems OK, now that we allow precision loss.

mgaido91 (Contributor, author), Jan 16, 2018: If we don't do this, we get many test failures in spark-hive, because Hive does it this way. Moreover, requiring more precision is not OK, since it leads to a needless loss of precision. Think of this example: you multiply a column of type DECIMAL(38, 18) by 2. Without this change, 2 is considered a DECIMAL(10, 0); according to the rules, the result should be DECIMAL(38 + 10 + 1, 18), which is out of range and therefore becomes DECIMAL(38, 7), potentially losing 11 digits of the fractional part. With this change, 2 is a DECIMAL(1, 0), the result is DECIMAL(38 + 1 + 1, 18), and it becomes DECIMAL(38, 16), with a much smaller precision loss.
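The arithmetic can be checked with a standalone REPL sketch; adjust below is a plain-Scala approximation of the adjustPrecisionScale rule, not the Spark code itself:

// Clamp to MAX_PRECISION = 38, keeping at least min(scale, 6) fractional digits.
def adjust(precision: Int, scale: Int): (Int, Int) =
  if (precision <= 38) (precision, scale)
  else {
    val intDigits = precision - scale
    (38, math.max(38 - intDigits, math.min(scale, 6)))
  }

println(adjust(38 + 10 + 1, 18)) // (38,7): 2 typed as DECIMAL(10,0), 11 fractional digits lost
println(adjust(38 + 1 + 1, 18))  // (38,16): 2 typed as DECIMAL(1,0), only 2 lost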

Contributor: makes sense

Member: Add this example as the code comment?

Member: Hive is also doing this?

Contributor (author): Yes, Hive does this too; that is the reason why I introduced the change (without it, we would have test failures in spark-hive). I will add this to the comment.

case (l, r: Literal) if l.dataType.isInstanceOf[DecimalType]
&& r.dataType.isInstanceOf[IntegralType] =>
b.makeCopy(Array(l, Cast(r, DecimalType.forLiteral(r))))
}

/**
* Type coercion for BinaryOperator in which one side is a non-decimal non-literal numeric, and
* the other side is a decimal.
*/
private def nondecimalNonliteralAndDecimal(
b: BinaryOperator): PartialFunction[(DataType, DataType), Expression] = {
// Promote integers inside a binary expression with fixed-precision decimals to decimals,
// and fixed-precision decimals in an expression with floats / doubles to doubles
case (t: IntegralType, DecimalType.Fixed(p, s)) =>
b.makeCopy(Array(Cast(b.left, DecimalType.forType(t)), b.right))
case (DecimalType.Fixed(_, _), t: IntegralType) =>
b.makeCopy(Array(b.left, Cast(b.right, DecimalType.forType(t))))
case (t, DecimalType.Fixed(_, _)) if isFloat(t) =>
b.makeCopy(Array(b.left, Cast(b.right, DoubleType)))
case (DecimalType.Fixed(_, _), t) if isFloat(t) =>
b.makeCopy(Array(Cast(b.left, DoubleType), b.right))
}

}
@@ -58,7 +58,7 @@ object Literal {
case s: Short => Literal(s, ShortType)
case s: String => Literal(UTF8String.fromString(s), StringType)
case b: Boolean => Literal(b, BooleanType)
case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale))
case d: BigDecimal => Literal(Decimal(d), DecimalType.fromBigDecimal(d))
case d: JavaBigDecimal =>
Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale()))
case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale))
@@ -23,7 +23,7 @@ import scala.reflect.runtime.universe.typeTag

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.{Expression, Literal}


/**
@@ -117,6 +117,7 @@ object DecimalType extends AbstractDataType {
val MAX_SCALE = 38
val SYSTEM_DEFAULT: DecimalType = DecimalType(MAX_PRECISION, 18)
val USER_DEFAULT: DecimalType = DecimalType(10, 0)
val MINIMUM_ADJUSTED_SCALE = 6
Member: Before naming a conf, I need to understand the rule you are following: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql. SQL Server applies MINIMUM_ADJUSTED_SCALE only to multiplication and division; in your implementation, however, you are using it for all the BinaryArithmetic operators?

Contributor (author): Yes, I followed Hive's implementation, which applies this 6-digit minimum to all operations. This means that in those cases SQL Server may round away more digits than we do: we guarantee at least 6 digits of scale, while SQL Server doesn't.
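For instance, a standalone REPL sketch (same adjustment arithmetic as adjustPrecisionScale, not the Spark helpers) showing the 6-digit floor binding for addition too:

// decimal(38,3) + decimal(38,7): result scale is max(3,7) = 7, and the ideal
// precision is max(38-3, 38-7) + 7 + 1 = 43 > 38, so the scale is adjusted:
val (precision, scale) = (43, 7)
val intDigits = precision - scale                           // 36
val adjScale = math.max(38 - intDigits, math.min(scale, 6)) // max(2, 6) = 6
println((38, adjScale)) // (38,6): floored at 6 even for Add, unlike SQL Server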

Contributor (author): @gatorsmile what about spark.sql.decimalOperations.mode, defaulting to native and also accepting hive (and, in the future, sql2011 for throwing an exception instead of returning NULL)?

Contributor: how about spark.sql.decimalOperations.allowTruncate? Let's leave the mode stuff to the type coercion mode.

Contributor: We should make it an internal conf and remove it after some releases.

Contributor (author): ok, I'll go with that, thanks @cloud-fan.


// The decimal types compatible with other numeric types
private[sql] val ByteDecimal = DecimalType(3, 0)
@@ -136,10 +137,54 @@
case DoubleType => DoubleDecimal
}

private[sql] def forLiteral(literal: Literal): DecimalType = literal.value match {
Member: Is this different from forType applied to Literal.dataType?

Contributor (author): Yes, please see my comment above for an example. Thanks.

Contributor: fromLiteral?

Contributor (author): Since we have forType, I used forLiteral to be consistent with the naming.

Contributor: but we also have fromDecimal...

case v: Short => fromBigDecimal(BigDecimal(v))
Member: Can't we just use ShortDecimal, IntDecimal...?

Contributor (author): No, please see my comments above.

case v: Int => fromBigDecimal(BigDecimal(v))
case v: Long => fromBigDecimal(BigDecimal(v))
case _ => forType(literal.dataType)
Member:

  private[sql] def forType(dataType: DataType): DecimalType = dataType match {
    case ByteType => ByteDecimal
    case ShortType => ShortDecimal
    case IntegerType => IntDecimal
    case LongType => LongDecimal
    case FloatType => FloatDecimal
    case DoubleType => DoubleDecimal
  }

This list is incomplete. Is it possible that the input literal is Literal(null, NullType)?

Contributor (author): This problem was present before this PR. Should we fix it here? Is the fix needed? I guess that if it were a problem, it would already have been reported.

}

private[sql] def fromBigDecimal(d: BigDecimal): DecimalType = {
DecimalType(Math.max(d.precision, d.scale), d.scale)
}

private[sql] def bounded(precision: Int, scale: Int): DecimalType = {
DecimalType(min(precision, MAX_PRECISION), min(scale, MAX_SCALE))
}

// scalastyle:off line.size.limit
/**
* Decimal implementation is based on Hive's, which is itself inspired by SQL Server's.
* In particular, when a result precision is greater than {@link #MAX_PRECISION}, the
* corresponding scale is reduced to prevent the integral part of a result from being truncated.
*
* For further reference, please see
* https://blogs.msdn.microsoft.com/sqlprogrammability/2006/03/29/multiplication-and-division-with-numerics/.
Member: Not sure this blog link will stay available for long.

Member: Please remove the web link to the commercial products.

*
* @param precision
* @param scale
* @return
Contributor: remove the above 3 lines

*/
// scalastyle:on line.size.limit
private[sql] def adjustPrecisionScale(precision: Int, scale: Int): DecimalType = {
Member: The logic in this adjustment function also differs from the MS SQL Server docs:

"In multiplication and division operations we need precision - scale places to store the integral part of the result. The scale might be reduced using the following rules:
- The resulting scale is reduced to min(scale, 38 - (precision - scale)) if the integral part is less than 32, because it cannot be greater than 38 - (precision - scale). The result might be rounded in this case.
- The scale will not be changed if it is less than 6 and the integral part is greater than 32. In this case, an overflow error might be raised if the result cannot fit into decimal(38, scale).
- The scale will be set to 6 if it is greater than 6 and the integral part is greater than 32. In this case, both the integral part and the scale are reduced and the resulting type is decimal(38, 6). The result might be rounded to 6 decimal places, or an overflow error is thrown if the integral part cannot fit into 32 digits."

Contributor (author): Sorry, but I think this is exactly what is described there. The implementation might seem to do different things, but the result is the same: both take the min between 6 and the desired scale when the precision is not enough to represent the whole scale.

Contributor: So the rule in the document is

val resultPrecision = 38
if (intDigits < 32) { // this means scale > 6, as intDigits = precision - scale and precision > 38
  val maxScale = 38 - intDigits
  val resultScale = min(scale, maxScale)
} else {
  if (scale < 6) {
    // can't round, as the scale is already small
    val resultScale = scale
  } else {
    val resultScale = 6
  }
}

I think this is a little different from the current rule

val minScaleValue = Math.min(scale, 6)
val resultScale = max(38 - intDigits, minScaleValue)

Think about the case intDigits < 32: SQL Server takes min(scale, 38 - intDigits), while we take 38 - intDigits.

mgaido91 (Contributor, author): @cloud-fan yes, but keep in mind that we only do this when precision > 38. With some simple math (given intDigits = precision - scale), SQL Server's expression is min(scale, scale + 38 - precision). Since we perform this operation only when precision is greater than 38, the second operand is always the minimum, which means that in such a case SQL Server behaves like us, i.e. it always takes 38 - intDigits. When precision is at most 38, we return the input precision and scale, as SQL Server does. We are just using the precision instead of intDigits in the if.
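A quick standalone check of this equivalence argument, in plain Scala with 38 standing in for MAX_PRECISION:

// When precision > 38: 38 - intDigits = scale + (38 - precision) < scale,
// so min(scale, 38 - intDigits) always picks 38 - intDigits.
for (precision <- 39 to 77; scale <- 0 to precision) {
  val intDigits = precision - scale
  assert(math.min(scale, 38 - intDigits) == 38 - intDigits)
}
println("equivalent for every precision > 38")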

Contributor: ah I see, makes sense

Member: Yeah, this part is consistent.

// Assumptions:
// precision >= scale
// scale >= 0
Member: Use assert to make sure of the assumptions?

Contributor (author): I can add it even though it is not needed: there is no way we can violate those constraints. If you believe it is better to use assert, I will do that.

Contributor: use assert for assumptions, not comments.

if (precision <= MAX_PRECISION) {
// Adjustment only needed when we exceed max precision
DecimalType(precision, scale)
Member: Shouldn't we also prevent scale > MAX_SCALE?

Contributor (author): That is prevented outside this function.

} else {
// Precision/scale exceed maximum precision. Result must be adjusted to MAX_PRECISION.
val intDigits = precision - scale
// If original scale less than MINIMUM_ADJUSTED_SCALE, use original scale value; otherwise
// preserve at least MINIMUM_ADJUSTED_SCALE fractional digits
val minScaleValue = Math.min(scale, MINIMUM_ADJUSTED_SCALE)
Member: Sounds like MAXIMUM_ADJUSTED_SCALE instead of MINIMUM_ADJUSTED_SCALE.

mgaido91 (Contributor, author), Dec 21, 2017: It is the MINIMUM_ADJUSTED_SCALE: we can't get a scale lower than that, even when we would need one to avoid losing precision. Please see the comments above.

Member: "We can't have a scale lower than that..." But don't you get a scale lower than MINIMUM_ADJUSTED_SCALE from Math.min(scale, MINIMUM_ADJUSTED_SCALE)?

Contributor (author): Yes, sorry, my answer was poorly worded; let me rephrase. scale is the scale we need to represent the values without any precision loss. Here we are computing the lower bound for the result scale: either the scale needed to represent the value exactly or MINIMUM_ADJUSTED_SCALE, whichever is smaller. In the line below, the scale we actually use is the max between this minScaleValue and the digits of precision left over after the integral part: i.e., even when the digits needed to the left of the dot would force a scale lower than MINIMUM_ADJUSTED_SCALE, we still keep at least MINIMUM_ADJUSTED_SCALE. We don't let the scale drop below this threshold, even though doing so would be needed to guarantee that we don't lose digits to the left of the dot. Please refer to the blog post linked in the comment above for a further (hopefully better) explanation.
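A worked REPL sketch of that two-step clamp, mirroring the lines above in plain Scala (adjustedScaleOf is an illustrative name):

val MAX_PRECISION = 38
val MINIMUM_ADJUSTED_SCALE = 6

def adjustedScaleOf(precision: Int, scale: Int): Int = {
  val intDigits = precision - scale
  val minScaleValue = math.min(scale, MINIMUM_ADJUSTED_SCALE)
  math.max(MAX_PRECISION - intDigits, minScaleValue)
}

// decimal(38,18) * decimal(38,18): ideal decimal(77,36); intDigits = 41 leaves
// 38 - 41 < 0 digits for the scale, so the 6-digit floor wins:
println(adjustedScaleOf(77, 36)) // 6  -> decimal(38,6)
// decimal(38,18) + decimal(38,18): ideal decimal(39,18); intDigits = 21 leaves room for 17:
println(adjustedScaleOf(39, 18)) // 17 -> decimal(38,17)

These match the updated expectations in the DecimalPrecisionSuite changes below.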

val adjustedScale = Math.max(MAX_PRECISION - intDigits, minScaleValue)
Member: Sounds like Math.min?

Contributor (author): It is max because we take either the scale that preserves the room needed for intDigits, i.e. the part to the left of the dot, or minScaleValue, the scale we guarantee to provide at least.

Contributor: This line needs some comments.


DecimalType(MAX_PRECISION, adjustedScale)
}
}

override private[sql] def defaultConcreteType: DataType = SYSTEM_DEFAULT

override private[sql] def acceptsType(other: DataType): Boolean = {
@@ -408,8 +408,8 @@ class AnalysisSuite extends AnalysisTest with Matchers {
assertExpressionType(sum(Divide(1.0, 2.0)), DoubleType)
assertExpressionType(sum(Divide(1, 2.0f)), DoubleType)
assertExpressionType(sum(Divide(1.0f, 2)), DoubleType)
assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(31, 11))
assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(31, 11))
assertExpressionType(sum(Divide(1, Decimal(2))), DecimalType(22, 11))
assertExpressionType(sum(Divide(Decimal(1), 2)), DecimalType(26, 6))
assertExpressionType(sum(Divide(Decimal(1), 2.0)), DoubleType)
assertExpressionType(sum(Divide(1.0, Decimal(2.0))), DoubleType)
}
@@ -136,19 +136,19 @@ class DecimalPrecisionSuite extends AnalysisTest with BeforeAndAfter {

test("maximum decimals") {
for (expr <- Seq(d1, d2, i, u)) {
checkType(Add(expr, u), DecimalType.SYSTEM_DEFAULT)
checkType(Subtract(expr, u), DecimalType.SYSTEM_DEFAULT)
checkType(Add(expr, u), DecimalType(38, 17))
checkType(Subtract(expr, u), DecimalType(38, 17))
}

checkType(Multiply(d1, u), DecimalType(38, 19))
checkType(Multiply(d2, u), DecimalType(38, 20))
checkType(Multiply(i, u), DecimalType(38, 18))
checkType(Multiply(u, u), DecimalType(38, 36))
checkType(Multiply(d1, u), DecimalType(38, 16))
checkType(Multiply(d2, u), DecimalType(38, 14))
checkType(Multiply(i, u), DecimalType(38, 7))
checkType(Multiply(u, u), DecimalType(38, 6))

checkType(Divide(u, d1), DecimalType(38, 18))
checkType(Divide(u, d2), DecimalType(38, 19))
checkType(Divide(u, i), DecimalType(38, 23))
checkType(Divide(u, u), DecimalType(38, 18))
checkType(Divide(u, d1), DecimalType(38, 17))
checkType(Divide(u, d2), DecimalType(38, 16))
checkType(Divide(u, i), DecimalType(38, 18))
checkType(Divide(u, u), DecimalType(38, 6))

checkType(Remainder(d1, u), DecimalType(19, 18))
checkType(Remainder(d2, u), DecimalType(21, 18))
16 changes: 16 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/decimals.sql
@@ -0,0 +1,16 @@
-- tests for decimals handling in operations
Contributor: Why create a new test file instead of adding more cases to decimalArithmeticOperations.sql?

Contributor (author): Because that file was meant for the typeCoercion modes (e.g. if we introduce a sql2011 mode which throws an exception instead of returning NULL), while this one is more generally about the behavior of arithmetic operations.

Contributor: That file is under .../typeCoercion/native/, which is meant for the default behavior (the native mode of type coercion). If we introduce a sql2011 mode, we will put the same file under .../typeCoercion/sql2011/.

Contributor (author): I see. I'll merge this file into the other one then. Thanks.

-- Spark draws its inspiration from the Hive implementation
dongjoon-hyun (Member), Dec 20, 2017: The hyperlinks in the PR came from Microsoft, and the primary purpose is SQL compliance. Can we remove this line?

create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet;

insert into decimals_test values(1, 100.0, 999.0);
insert into decimals_test values(2, 12345.123, 12345.123);
insert into decimals_test values(3, 0.1234567891011, 1234.1);
insert into decimals_test values(4, 123456789123456789.0, 1.123456789123456789);
Member: nit. How about making it a single SQL statement?

insert into decimals_test values (1, 100.0, 999.0), (2, 12345.123, 12345.123), (3, 0.1234567891011, 1234.1), (4, 123456789123456789.0, 1.123456789123456789)


-- test decimal operations
select id, a+b, a-b, a*b, a/b from decimals_test order by id;

-- test operations between decimals and constants
select id, a*10, b/10 from decimals_test order by id;

drop table decimals_test;
72 changes: 72 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/decimals.sql.out
@@ -0,0 +1,72 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 8


-- !query 0
create table decimals_test(id int, a decimal(38,18), b decimal(38,18)) using parquet
-- !query 0 schema
struct<>
-- !query 0 output



-- !query 1
insert into decimals_test values(1, 100.0, 999.0)
-- !query 1 schema
struct<>
-- !query 1 output



-- !query 2
insert into decimals_test values(2, 12345.123, 12345.123)
-- !query 2 schema
struct<>
-- !query 2 output



-- !query 3
insert into decimals_test values(3, 0.1234567891011, 1234.1)
-- !query 3 schema
struct<>
-- !query 3 output



-- !query 4
insert into decimals_test values(4, 123456789123456789.0, 1.123456789123456789)
-- !query 4 schema
struct<>
-- !query 4 output



-- !query 5
select id, a+b, a-b, a*b, a/b from decimals_test order by id
-- !query 5 schema
struct<id:int,(a + b):decimal(38,17),(a - b):decimal(38,17),(a * b):decimal(38,6),(a / b):decimal(38,6)>
-- !query 5 output
1 1099 -899 99900 0.1001
2 24690.246 0 152402061.885129 1
3 1234.2234567891011 -1233.9765432108989 152.358023 0.0001
4 123456789123456790.12345678912345679 123456789123456787.87654321087654321 138698367904130467.515623 109890109097814272.043109


-- !query 6
select id, a*10, b/10 from decimals_test order by id
-- !query 6 schema
struct<id:int,(CAST(a AS DECIMAL(38,18)) * CAST(CAST(10 AS DECIMAL(2,0)) AS DECIMAL(38,18))):decimal(38,15),(CAST(b AS DECIMAL(38,18)) / CAST(CAST(10 AS DECIMAL(2,0)) AS DECIMAL(38,18))):decimal(38,18)>
-- !query 6 output
1 1000 99.9
2 123451.23 1234.5123
3 1.234567891011 123.41
4 1234567891234567890 0.112345678912345679


-- !query 7
drop table decimals_test
-- !query 7 schema
struct<>
-- !query 7 output

@@ -1526,15 +1526,15 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
checkAnswer(sql("select 10.300000000000000000 * 3.000000000000000000"),
Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38))))
checkAnswer(sql("select 10.300000000000000000 * 3.0000000000000000000"),
Row(null))
Member: Two cases (2 and 3) were mentioned in the email. If this is the only NULL-returning test case from the previous behavior, can we have another test case?

Currently, Spark behaves as follows:

   1. It follows some rules taken from the initial Hive implementation;
   2. it returns NULL;
   3. it returns NULL.

Contributor (author): The third case, going outside the representable range of values, is never exercised in the current codebase. I haven't added a test for it because I was waiting for feedback from the community about how to handle that case, and I focused this PR only on points 1 and 2. But I can add a test case for it and eventually change it in a future PR to address the 3rd point in the e-mail. Thanks.

Row(BigDecimal("30.900000000000000000000000000000000000", new MathContext(38))))

checkAnswer(sql("select 10.3 / 3.0"), Row(BigDecimal("3.433333")))
checkAnswer(sql("select 10.3000 / 3.0"), Row(BigDecimal("3.4333333")))
checkAnswer(sql("select 10.30000 / 30.0"), Row(BigDecimal("0.343333333")))
checkAnswer(sql("select 10.300000000000000000 / 3.00000000000000000"),
Row(BigDecimal("3.433333333333333333333333333", new MathContext(38))))
Row(BigDecimal("3.4333333333333333333", new MathContext(38))))
checkAnswer(sql("select 10.3000000000000000000 / 3.00000000000000000"),
Row(BigDecimal("3.4333333333333333333333333333", new MathContext(38))))
Row(BigDecimal("3.4333333333333333333", new MathContext(38))))
}

test("SPARK-10215 Div of Decimal returns null") {