[SPARK-36702][SQL] ArrayUnion handle duplicated Double.NaN and Float.Nan

apache · AngersZhuuuu · Sep 10, 2021 · Sep 10, 2021 · Sep 10, 2021 · Sep 10, 2021
commit 8579c9769df6bfe4f59dda612661d402938867a3
diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -34,6 +34,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils._
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
+import org.apache.spark.sql.util.SQLOpenHashSet
 import org.apache.spark.unsafe.UTF8StringBuilder
 import org.apache.spark.unsafe.array.ByteArrayMethods
 import org.apache.spark.unsafe.array.ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH
@@ -3575,17 +3576,11 @@ case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLi
     if (TypeUtils.typeWithProperEquals(elementType)) {
       (array1, array2) =>
         val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any]
-        val hs = new OpenHashSet[Any]
+        val hs = new SQLOpenHashSet[Any]
         var foundNullElement = false
         Seq(array1, array2).foreach { array =>
           var i = 0
           while (i < array.numElements()) {
-            if (array.isNullAt(i)) {
-              if (!foundNullElement) {
-                arrayBuffer += null
-                foundNullElement = true
-              }
-            } else {
               val elem = array.get(i, elementType)
               if (!hs.contains(elem)) {
                 if (arrayBuffer.size > ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH) {
@@ -3594,7 +3589,6 @@ case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLi
                 arrayBuffer += elem
                 hs.add(elem)
               }
-            }
             i += 1
           }
         }
@@ -3649,61 +3643,37 @@ case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLi
       val ptName = CodeGenerator.primitiveTypeName(jt)
 
       nullSafeCodeGen(ctx, ev, (array1, array2) => {
-        val foundNullElement = ctx.freshName("foundNullElement")
         val nullElementIndex = ctx.freshName("nullElementIndex")
         val builder = ctx.freshName("builder")
         val array = ctx.freshName("array")
         val arrays = ctx.freshName("arrays")
         val arrayDataIdx = ctx.freshName("arrayDataIdx")
-        val openHashSet = classOf[OpenHashSet[_]].getName
+        val openHashSet = classOf[SQLOpenHashSet[_]].getName
         val classTag = s"scala.reflect.ClassTag$$.MODULE$$.$hsTypeName()"
         val hashSet = ctx.freshName("hashSet")
         val arrayBuilder = classOf[mutable.ArrayBuilder[_]].getName
         val arrayBuilderClass = s"$arrayBuilder$$of$ptName"
 
-        def withArrayNullAssignment(body: String) =
-          if (dataType.asInstanceOf[ArrayType].containsNull) {
-            s"""
-               |if ($array.isNullAt($i)) {
-               |  if (!$foundNullElement) {
-               |    $nullElementIndex = $size;
-               |    $foundNullElement = true;
-               |    $size++;
-               |    $builder.$$plus$$eq($nullValueHolder);
-               |  }
-               |} else {
-               |  $body
-               |}
-             """.stripMargin
-          } else {
-            body
-          }
-
-        val processArray = withArrayNullAssignment(
+        val processArray =
           s"""
              |$jt $value = ${genGetValue(array, i)};
              |if (!$hashSet.contains($hsValueCast$value)) {
              |  if (++$size > ${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}) {
              |    break;
              |  }
              |  $hashSet.add$hsPostFix($hsValueCast$value);
-             |  $builder.$$plus$$eq($value);
+             |  if ($array.isNullAt($i)) {
+             |    $nullElementIndex = $size - 1;
+             |    $builder.$$plus$$eq($nullValueHolder);
+             |  } else {
+             |    $builder.$$plus$$eq($value);
+             |  }
              |}
-           """.stripMargin)
-
-        // Only need to track null element index when result array's element is nullable.
-        val declareNullTrackVariables = if (dataType.asInstanceOf[ArrayType].containsNull) {
-          s"""
-             |boolean $foundNullElement = false;
-             |int $nullElementIndex = -1;
            """.stripMargin
-        } else {
-          ""
-        }
 
         s"""
            |$openHashSet $hashSet = new $openHashSet$hsPostFix($classTag);
-           |$declareNullTrackVariables
+           |int $nullElementIndex = -1;
            |int $size = 0;
            |$arrayBuilderClass $builder = new $arrayBuilderClass();
            |ArrayData[] $arrays = new ArrayData[]{$array1, $array2};

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SQLOpenHashSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/util/SQLOpenHashSet.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.util
+
+import scala.reflect._
+
+import org.apache.spark.annotation.Private
+import org.apache.spark.util.collection.OpenHashSet
+
+/**
+ * A wrap of [[OpenHashSet]] that can handle null, Double.NaN and Float.NaN.
+ */
+@Private
+class SQLOpenHashSet[@specialized(Long, Int, Double, Float) T: ClassTag](
+    initialCapacity: Int,
+    loadFactor: Double) {
+
+  def this(initialCapacity: Int) = this(initialCapacity, 0.7)
+
+  def this() = this(64)
+
+  private val hashSet = new OpenHashSet[T](initialCapacity, loadFactor)
+
+  private var containsNull = false
+  private var containsDoubleNaN = false
+  private var containsFloatNaN = false
+
+  def add(k: T): Unit = {
+    if (k == null) {
+      containsNull = true
+    } else if (Double.NaN.equals(k)) {
+      containsDoubleNaN = true
+    } else if (Float.NaN.equals(k)) {
+      containsFloatNaN = true
+    } else {
+      hashSet.add(k)
+    }
+  }
+
+  def contains(k: T): Boolean = {
+    if (k == null) {
+      containsNull
+    } else if (Double.NaN.equals(k)) {
+      containsDoubleNaN
+    } else if (Float.NaN.equals(k)) {
+      containsFloatNaN
+    } else {
+      hashSet.contains(k)
+    }
+  }
+}
diff --git a/...src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/...src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -2309,4 +2309,21 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
       }
     }
   }
+
+  test("SPARK-36702: ArrayUnion should handle duplicated Double.NaN and Float.Nan") {
+    checkEvaluation(ArrayUnion(
+      Literal.apply(Array(Double.NaN, Double.NaN)), Literal.apply(Array(1d))),
+      Seq(Double.NaN, 1d))
+    checkEvaluation(ArrayUnion(
+      Literal.create(Seq(Double.NaN, null), ArrayType(DoubleType)),
+      Literal.create(Seq(Double.NaN, null, 1d), ArrayType(DoubleType))),
+      Seq(Double.NaN, null, 1d))
+    checkEvaluation(ArrayUnion(
+      Literal.apply(Array(Float.NaN, Float.NaN)), Literal.apply(Array(1f))),
+      Seq(Float.NaN, 1f))
+    checkEvaluation(ArrayUnion(
+      Literal.create(Seq(Float.NaN, null), ArrayType(FloatType)),
+      Literal.create(Seq(Float.NaN, null, 1f), ArrayType(FloatType))),
+      Seq(Float.NaN, null, 1f))
+  }
 }