[Closed] Changes from 1 commit
Commits (34)
dc9d6f0  initial commit (kiszk, Apr 13, 2018)
3019840  update description (kiszk, Apr 13, 2018)
8cee6cf  fix test failure (kiszk, Apr 13, 2018)
2041ec4  address review comments (kiszk, Apr 17, 2018)
8c2280b  introduce ArraySetUtils to reuse code among array_union/array_interse… (kiszk, Apr 17, 2018)
b3a3132  fix python test failure (kiszk, Apr 18, 2018)
a2c7dd1  fix python test failure (kiszk, Apr 18, 2018)
5313680  simplification (kiszk, Apr 18, 2018)
98f8d1f  fix pyspark test failure (kiszk, Apr 19, 2018)
30ee7fc  address review comments (kiszk, Apr 20, 2018)
cd347e9  add new tests based on review comment (kiszk, Apr 20, 2018)
d2eaee3  fix mistakes in rebase (kiszk, Apr 20, 2018)
2ddeb06  fix unexpected changes (kiszk, Apr 20, 2018)
71b31f0  merge changes in #21103 (kiszk, Apr 20, 2018)
7e71340  use GenericArrayData if UnsafeArrayData cannot be used (kiszk, May 4, 2018)
04c97c3  use BinaryArrayExpressionWithImplicitCast (kiszk, May 4, 2018)
401ca7a  update test cases (kiszk, May 4, 2018)
15b953b  rebase with master (kiszk, May 17, 2018)
f050922  support complex types (kiszk, May 18, 2018)
8a27667  add test cases with duplication in an array (kiszk, May 19, 2018)
e50bc55  rebase with master (kiszk, Jun 1, 2018)
7e3f2ef  address review comments (kiszk, Jun 1, 2018)
e5401e7  address review comment (kiszk, Jun 1, 2018)
3e21e48  keep the order of input array elements (kiszk, Jun 10, 2018)
3c39506  address review comments (kiszk, Jun 20, 2018)
6654742  fix scala style error (kiszk, Jun 20, 2018)
be9f331  address review comment (kiszk, Jun 20, 2018)
90e84b3  address review comments (kiszk, Jun 22, 2018)
6f721f0  address review comments (kiszk, Jun 22, 2018)
0c0d3ba  address review comments (kiszk, Jul 8, 2018)
4a217bc  cleanup (kiszk, Jul 8, 2018)
f5ebbe8  eliminate duplicated code (kiszk, Jul 8, 2018)
763a1f8  address review comments (kiszk, Jul 9, 2018)
7b51564  address review comment (kiszk, Jul 11, 2018)
address review comments
kiszk committed Jun 27, 2018
commit 2041ec45efdcb2b3ae9dfc7c5b7c6dc26c0091ea
7 changes: 4 additions & 3 deletions python/pyspark/sql/functions.py
@@ -1942,6 +1942,7 @@ def concat(*cols):
     return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column)))
 
 
+@ignore_unicode_prefix
 @since(2.4)
 def array_position(col, value):
     """
@@ -2017,16 +2018,16 @@ def array_distinct(col):
 @since(2.4)
 def array_union(col1, col2):
     """
-    Collection function: Returns an array of the elements in the union of col1 and col2,
-    without duplicates
+    Collection function: returns an array of the elements in the union of col1 and col2,
@viirya (Member) commented on Jul 9, 2018:

If the array of col1 itself contains duplicate elements, what does it do? De-duplicate them too?

E.g.,

df = spark.createDataFrame([Row(c1=["b", "a", "c", "c"], c2=["c", "d", "a", "f"])])
df.select(array_union(df.c1, df.c2)).collect()

Member:

After reading the code, it seems it de-duplicates all elements from the two arrays. Is this behavior the same as Presto?

Member Author (@kiszk):

I will add tests for duplication.
Yes, this will de-duplicate. I think it is the same behavior as Presto.

+    without duplicates. The order of elements in the result is not determined.
 
     :param col1: name of column containing array
     :param col2: name of column containing array
 
     >>> from pyspark.sql import Row
     >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])
     >>> df.select(array_union(df.c1, df.c2)).collect()
-    [Row(array_union(c1, c2)=[u'b', u'a', u'c', u'd', u'f'])]
+    [Row(array_union(c1, c2)=[u'b', u'c', u'd', u'a', u'f'])]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.array_union(_to_java_column(col1), _to_java_column(col2)))
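The de-duplication question in the thread above is easy to check directly. Here is a minimal sketch (not part of this PR) using the Scala API, assuming a local SparkSession: duplicates inside a single input array are removed too, and element order is not guaranteed.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.array_union

val spark = SparkSession.builder().master("local[1]").getOrCreate()
import spark.implicits._

// "c" appears twice in c1; the result still contains it only once.
val df = Seq((Seq("b", "a", "c", "c"), Seq("c", "d", "a", "f"))).toDF("c1", "c2")
df.select(array_union($"c1", $"c2")).show(truncate = false)
// e.g. [b, a, c, d, f]; the exact order is not determined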
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -2876,7 +2876,7 @@ case class ArrayRepeat(left: Expression, right: Expression)
   """,
   since = "2.4.0")
 case class ArrayUnion(left: Expression, right: Expression)
-  extends BinaryExpression with ExpectsInputTypes with CodegenFallback {
+  extends BinaryExpression with ExpectsInputTypes {
 
   override def inputTypes: Seq[AbstractDataType] = Seq(ArrayType, ArrayType)
 
@@ -2893,46 +2893,106 @@ case class ArrayUnion(left: Expression, right: Expression)
 
   override def dataType: DataType = left.dataType
 
+  private def elementType = dataType.asInstanceOf[ArrayType].elementType
+  private def cnLeft = left.dataType.asInstanceOf[ArrayType].containsNull
+  private def cnRight = right.dataType.asInstanceOf[ArrayType].containsNull
+
   override def nullSafeEval(linput: Any, rinput: Any): Any = {
-    val elementType = dataType.asInstanceOf[ArrayType].elementType
-    val cnl = left.dataType.asInstanceOf[ArrayType].containsNull
-    val cnr = right.dataType.asInstanceOf[ArrayType].containsNull
     val larray = linput.asInstanceOf[ArrayData]
     val rarray = rinput.asInstanceOf[ArrayData]
 
-    if (!cnl && !cnr && elementType == IntegerType) {
-      // avoid boxing primitive int array elements
-      val hs = new OpenHashSet[Int]
-      var i = 0
-      while (i < larray.numElements()) {
-        hs.add(larray.getInt(i))
-        i += 1
-      }
-      i = 0
-      while (i < rarray.numElements()) {
-        hs.add(rarray.getInt(i))
-        i += 1
-      }
-      UnsafeArrayData.fromPrimitiveArray(hs.iterator.toArray)
-    } else if (!cnl && !cnr && elementType == LongType) {
-      // avoid boxing of primitive long array elements
-      val hs = new OpenHashSet[Long]
-      var i = 0
-      while (i < larray.numElements()) {
-        hs.add(larray.getLong(i))
-        i += 1
-      }
-      i = 0
-      while (i < rarray.numElements()) {
-        hs.add(rarray.getLong(i))
-        i += 1
-      }
-      UnsafeArrayData.fromPrimitiveArray(hs.iterator.toArray)
-    } else {
-      new GenericArrayData(
-        (larray.toArray[AnyRef](elementType) union rarray.toArray[AnyRef](elementType))
-          .distinct.asInstanceOf[Array[Any]])
+    if (!cnLeft && !cnRight) {
+      elementType match {
+        case IntegerType =>
+          // avoid boxing of primitive int array elements
+          val hs = new OpenHashSet[Int]
+          var i = 0
+          while (i < larray.numElements()) {
+            hs.add(larray.getInt(i))
+            i += 1
+          }
+          i = 0
+          while (i < rarray.numElements()) {
+            hs.add(rarray.getInt(i))
+            i += 1
+          }
+          UnsafeArrayData.fromPrimitiveArray(hs.iterator.toArray)
+        case LongType =>
+          // avoid boxing of primitive long array elements
+          val hs = new OpenHashSet[Long]
+          var i = 0
+          while (i < larray.numElements()) {
+            hs.add(larray.getLong(i))
+            i += 1
+          }
+          i = 0
+          while (i < rarray.numElements()) {
+            hs.add(rarray.getLong(i))
+            i += 1
+          }
+          UnsafeArrayData.fromPrimitiveArray(hs.iterator.toArray)
+        case _ =>
+          val hs = new OpenHashSet[Any]
+          var i = 0
+          while (i < larray.numElements()) {
+            hs.add(larray.get(i, elementType))
+            i += 1
+          }
+          i = 0
+          while (i < rarray.numElements()) {
+            hs.add(rarray.get(i, elementType))
+            i += 1
+          }
+          new GenericArrayData(hs.iterator.toArray)
+      }
+    } else {
+      CollectionOperations.arrayUnion(larray, rarray, elementType)
     }
   }
 
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    val hs = ctx.freshName("hs")
+    val i = ctx.freshName("i")
+    val collectionOperations = "org.apache.spark.sql.catalyst.expressions.CollectionOperations"
+    val genericArrayData = classOf[GenericArrayData].getName
+    val unsafeArrayData = classOf[UnsafeArrayData].getName
+    val openHashSet = classOf[OpenHashSet[_]].getName
+    val ot = "org.apache.spark.sql.types.ObjectType$.MODULE$.apply(Object.class)"
+    val (postFix, classTag, getter, arrayBuilder, castType) = if (!cnLeft && !cnRight) {
+      val ptName = CodeGenerator.primitiveTypeName(elementType)
+      elementType match {
+        case ByteType | ShortType | IntegerType =>
+          (s"$$mcI$$sp", s"scala.reflect.ClassTag$$.MODULE$$.$ptName()", s"get$ptName($i)",
+            s"$unsafeArrayData.fromPrimitiveArray", CodeGenerator.javaType(elementType))
+        case LongType =>
+          (s"$$mcJ$$sp", s"scala.reflect.ClassTag$$.MODULE$$.$ptName()", s"get$ptName($i)",
+            s"$unsafeArrayData.fromPrimitiveArray", "long")
+        case _ =>
+          ("", s"scala.reflect.ClassTag$$.MODULE$$.Object()", s"get($i, $ot)",
+            s"new $genericArrayData", "Object")
+      }
+    } else {
+      ("", "", "", "", "")
+    }
+
+    nullSafeCodeGen(ctx, ev, (larray, rarray) => {
+      if (classTag != "") {
+        s"""
+           |$openHashSet $hs = new $openHashSet$postFix($classTag);
+           |for (int $i = 0; $i < $larray.numElements(); $i++) {
+           |  $hs.add$postFix($larray.$getter);
+           |}
+           |for (int $i = 0; $i < $rarray.numElements(); $i++) {
+           |  $hs.add$postFix($rarray.$getter);
+           |}
+           |${ev.value} = $arrayBuilder(
+           |  ($castType[]) $hs.iterator().toArray($classTag));
+         """.stripMargin
+      } else {
+        val dt = "org.apache.spark.sql.types.ObjectType$.MODULE$.apply(Object.class)"
+        s"${ev.value} = $collectionOperations$$.MODULE$$.arrayUnion($larray, $rarray, $ot);"
+      }
+    })
+  }
+
   override def prettyName: String = "array_union"
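A note on the generated code above: Spark's OpenHashSet is specialized, which is why the generated Java appends the $mcI$sp / $mcJ$sp suffixes when calling the int and long variants; this keeps the hot loop free of boxing. The interpreted IntegerType path amounts to the following sketch (a rough equivalent, not part of the PR, assuming spark-core on the classpath; the input values are illustrative):

import org.apache.spark.util.collection.OpenHashSet

// Add every element of both arrays to a primitive-specialized hash set,
// then dump the set back out as an int array.
val hs = new OpenHashSet[Int]
Array(1, 2, 2, 3).foreach(hs.add)
Array(3, 4).foreach(hs.add)
val union: Array[Int] = hs.iterator.toArray
// The result order is the set's iteration order, not the input order,
// which is why the docs say the order is "not determined".
println(union.mkString(", "))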
@@ -3338,3 +3398,10 @@ case class ArrayDistinct(child: Expression)
 
   override def prettyName: String = "array_distinct"
 }
+
+object CollectionOperations {
+  def arrayUnion(larray: ArrayData, rarray: ArrayData, et: DataType): ArrayData = {
+    new GenericArrayData(larray.toArray[AnyRef](et).union(rarray.toArray[AnyRef](et))
+      .distinct.asInstanceOf[Array[Any]])
+  }
+}
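For nullable or non-primitive element types, ArrayUnion delegates to CollectionOperations.arrayUnion above, which is concatenate-then-distinct. A dependency-free sketch of the same semantics (ArrayUnionSketch and its method are hypothetical names; plain Scala collections stand in for ArrayData):

object ArrayUnionSketch {
  // Mirrors CollectionOperations.arrayUnion: concatenate both inputs,
  // then de-duplicate. Seq#distinct keeps the first occurrence of each
  // element, so this path preserves encounter order and keeps nulls.
  def arrayUnion[T](left: Seq[T], right: Seq[T]): Seq[T] =
    (left ++ right).distinct

  def main(args: Array[String]): Unit = {
    // Matches one of the checkEvaluation cases in the suite below.
    println(arrayUnion(Seq("b", "a", "c"), Seq("b", null, "a", "g")))
    // List(b, a, c, null, g)
  }
}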
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -1185,6 +1185,8 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
     val a20 = Literal.create(Seq("b", "a", "c"), ArrayType(StringType))
     val a21 = Literal.create(Seq("c", "d", "a", "f"), ArrayType(StringType))
     val a22 = Literal.create(Seq("b", null, "a", "g"), ArrayType(StringType))
+    val a23 = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
+    val a24 = Literal.create(Seq("c", "d", "a", "f"), ArrayType(StringType, false))
 
     val a30 = Literal.create(Seq(null, null), ArrayType(NullType))
 
@@ -1201,6 +1203,7 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
 
     checkEvaluation(ArrayUnion(a20, a21), Seq("b", "a", "c", "d", "f"))
     checkEvaluation(ArrayUnion(a20, a22), Seq("b", "a", "c", null, "g"))
+    checkEvaluation(ArrayUnion(a23, a24), Seq("b", "c", "d", "a", "f"))
 
     checkEvaluation(ArrayUnion(a30, a30), Seq(null))
Member:

What if one of the two arguments is null?

Member Author (@kiszk):

Good question. I cannot see such a test case in Presto.
Let me think.

Member Author (@kiszk):

Umm, when only one of the arguments is null, an unexpected TreeNodeException occurs.

checkEvaluation(ArrayUnion(a20, a30), Seq("b", "a", "c", null))
After applying rule org.apache.spark.sql.catalyst.optimizer.EliminateDistinct in batch Eliminate Distinct, the structural integrity of the plan is broken., tree:
'Project [array_union([b,a,c], [null,null]) AS Optimized(array_union([b,a,c], [null,null]))#71]
+- OneRowRelation

org.apache.spark.sql.catalyst.errors.package$TreeNodeException: After applying rule org.apache.spark.sql.catalyst.optimizer.EliminateDistinct in batch Eliminate Distinct, the structural integrity of the plan is broken., tree:
'Project [array_union([b,a,c], [null,null]) AS Optimized(array_union([b,a,c], [null,null]))#71]
+- OneRowRelation


	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:106)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:84)
	at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
	at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
	at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:84)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:76)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:76)
	at org.apache.spark.sql.catalyst.expressions.ExpressionEvalHelper$class.checkEvaluationWithOptimization(ExpressionEvalHelper.scala:252)
...

Member:

I'm not sure of the reason (maybe because the element types are different?), but I meant something like:

checkEvaluation(ArrayUnion(a20, Literal.create(null, ArrayType(StringType))), ...?)

Member Author (@kiszk):

Ah, I see. Thanks. Your example returns null. Since the following test throws an exception, I think it makes sense that your example returns null. WDYT?

    val df8 = Seq((null, Array("a"))).toDF("a", "b")
    intercept[AnalysisException] {
      df8.select(array_union($"a", $"b"))
    }

Member:

Returning null sounds good, but what do you mean by "Since the following test throws an exception"? What exception is the test throwing?

Member Author (@kiszk):

The following error occurs. Looking at other tests, this does not seem unusual; it happens because null carries no type information.

cannot resolve 'array_union(NULL, `b`)' due to data type mismatch: Element type in both arrays must be the same;;
'Project [array_union(null, b#118) AS array_union(a, b)#121]
+- AnalysisBarrier
      +- Project [_1#114 AS a#117, _2#115 AS b#118]
         +- LocalRelation [_1#114, _2#115]

org.apache.spark.sql.AnalysisException: cannot resolve 'array_union(NULL, `b`)' due to data type mismatch: Element type in both arrays must be the same;;
'Project [array_union(null, b#118) AS array_union(a, b)#121]
+- AnalysisBarrier
      +- Project [_1#114 AS a#117, _2#115 AS b#118]
         +- LocalRelation [_1#114, _2#115]

	at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:93)
	at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:85)
...

Member:

Ah, I see. Maybe the purpose of the test is not what I thought.
Seems like what I wanted is included in the latest updates.

}
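To summarize the thread above: a typed null array argument makes array_union return null (nullSafeEval is skipped for null inputs), while an untyped NULL literal fails analysis because it carries no element type. A sketch of both behaviors, assuming the SparkSession and implicits from the earlier sketch (dfNull is a hypothetical name):

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.functions.{array_union, lit}

// Typed null array: the expression is null-safe and yields null.
val dfNull = Seq((Seq("a", "b"), Option.empty[Seq[String]])).toDF("a", "b")
dfNull.select(array_union($"a", $"b")).show()  // single row, value null

// Untyped NULL literal: no element type information, so analysis fails
// with "cannot resolve 'array_union(NULL, `b`)' due to data type mismatch".
try {
  dfNull.select(array_union(lit(null), $"b")).collect()
} catch {
  case e: AnalysisException => println(e.getMessage)
}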
sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -3199,6 +3199,7 @@ object functions {
 
   /**
    * Returns an array of the elements in the union of the given two arrays, without duplicates.
+   * The order of elements in the result is not determined.
    *
    * @group collection_funcs
    * @since 2.4.0