
Commit d3108ab

fix review comments
1 parent 0fdc1ea commit d3108ab


2 files changed (+104 additions, -71 deletions)


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala

Lines changed: 14 additions & 42 deletions
@@ -415,8 +415,8 @@ abstract class DeclarativeAggregate
  * 2. Upon each input row, the framework calls
  *    `update(buffer: T, input: InternalRow): Unit` to update the aggregation buffer object T.
  * 3. After processing all rows of current group (group by key), the framework will serialize
- *    aggregation buffer object T to SparkSQL internally supported underlying storage format, and
- *    persist the serializable format to disk if needed.
+ *    aggregation buffer object T to storage format (Array[Byte]) and persist the Array[Byte]
+ *    to disk if needed.
  * 4. The framework moves on to next group, until all groups have been processed.
  *
  * Shuffling exchange data to Reducer tasks...
@@ -426,7 +426,7 @@ abstract class DeclarativeAggregate
  * 1. The framework calls `createAggregationBuffer(): T` to create an empty internal aggregation
  *    buffer object (type T) for merging.
  * 2. For each aggregation output of Stage 1, The framework de-serializes the storage
- *    format and generates one input aggregation object (type T).
+ *    format (Array[Byte]) and produces one input aggregation object (type T).
  * 3. For each input aggregation object, the framework calls `merge(buffer: T, input: T): Unit`
  *    to merge the input aggregation object into aggregation buffer object.
  * 4. After processing all input aggregation objects of current group (group by key), the framework
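For orientation, here is a minimal sketch of the Mapper/Reducer lifecycle the comment above describes. SimpleTypedAgg and the two driver functions below are illustrative stand-ins, not the real TypedImperativeAggregate[T] interface or framework code; only the buffer-object flow and the Array[Byte] hand-off are modeled.

import org.apache.spark.sql.catalyst.InternalRow

object LifecycleSketch {
  // Simplified stand-in for TypedImperativeAggregate[T]: just the methods the comment names.
  trait SimpleTypedAgg[T] {
    def createAggregationBuffer(): T
    def update(buffer: T, input: InternalRow): Unit
    def merge(buffer: T, input: T): Unit
    def serialize(buffer: T): Array[Byte]
    def deserialize(bytes: Array[Byte]): T
    def eval(buffer: T): Any
  }

  // Mapper side, steps 1-3: build one buffer object per group, ship it as Array[Byte].
  def mapSide[T](agg: SimpleTypedAgg[T], rowsOfGroup: Iterator[InternalRow]): Array[Byte] = {
    val buffer = agg.createAggregationBuffer()           // 1. empty aggregation buffer object T
    rowsOfGroup.foreach(row => agg.update(buffer, row))  // 2. update once per input row
    agg.serialize(buffer)                                // 3. serialize to Array[Byte] for shuffle/spill
  }

  // Reducer side, steps 1-3: deserialize each partial result, merge, then evaluate the group.
  def reduceSide[T](agg: SimpleTypedAgg[T], partials: Iterator[Array[Byte]]): Any = {
    val merged = agg.createAggregationBuffer()
    partials.foreach(bytes => agg.merge(merged, agg.deserialize(bytes)))
    agg.eval(merged)
  }
}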
@@ -474,39 +474,11 @@ abstract class TypedImperativeAggregate[T] extends ImperativeAggregate {
   /** Returns the class of aggregation buffer object */
   def aggregationBufferClass: Class[T]
 
-  /** Serializes the aggregation buffer object T to Spark-sql internally supported storage format */
-  def serialize(buffer: T): Any
+  /** Serializes the aggregation buffer object T to Array[Byte] */
+  def serialize(buffer: T): Array[Byte]
 
-  /** De-serializes the storage format, and produces aggregation buffer object T */
-  def deserialize(storageFormat: Any): T
-
-  /**
-   * Returns the aggregation-buffer-object storage format's Sql type.
-   *
-   * Here is a list of supported storage format and corresponding Sql type:
-   *
-   * {{{
-   *   aggregation buffer object's Storage format     | storage format's Sql type
-   *   ------------------------------------------------------------------------------------------
-   *   Array[Byte] (*)                                | BinaryType (*)
-   *   Null                                           | NullType
-   *   Boolean                                        | BooleanType
-   *   Byte                                           | ByteType
-   *   Short                                          | ShortType
-   *   Int                                            | IntegerType
-   *   Long                                           | LongType
-   *   Float                                          | FloatType
-   *   Double                                         | DoubleType
-   *   org.apache.spark.sql.types.Decimal             | DecimalType
-   *   org.apache.spark.unsafe.types.UTF8String       | StringType
-   *   org.apache.spark.unsafe.types.CalendarInterval | CalendarIntervalType
-   *   org.apache.spark.sql.catalyst.util.MapData     | MapType
-   *   org.apache.spark.sql.catalyst.util.ArrayData   | ArrayType
-   *   org.apache.spark.sql.catalyst.InternalRow      |
-   * }}}
-   *
-   */
-  def aggregationBufferStorageFormatSqlType: DataType
+  /** De-serializes the serialized format Array[Byte], and produces aggregation buffer object T */
+  def deserialize(storageFormat: Array[Byte]): T
 
   final override def initialize(buffer: MutableRow): Unit = {
     val bufferObject = createAggregationBuffer()
@@ -519,29 +491,29 @@ abstract class TypedImperativeAggregate[T] extends ImperativeAggregate {
   }
 
   final override def merge(buffer: MutableRow, inputBuffer: InternalRow): Unit = {
-    val bufferObject = field(buffer, mutableAggBufferOffset).asInstanceOf[T]
-    val inputObject = deserialize(field(inputBuffer, inputAggBufferOffset))
+    val bufferObject = field[T](buffer, mutableAggBufferOffset)
+    val inputObject = deserialize(field[Array[Byte]](inputBuffer, inputAggBufferOffset))
     merge(bufferObject, inputObject)
   }
 
   final override def eval(buffer: InternalRow): Any = {
-    val bufferObject = field(buffer, mutableAggBufferOffset)
+    val bufferObject = field[AnyRef](buffer, mutableAggBufferOffset)
     if (bufferObject.getClass == aggregationBufferClass) {
       // When used in Window frame aggregation, eval(buffer: InternalRow) is called directly
       // on the object aggregation buffer without intermediate serializing/de-serializing.
      eval(bufferObject.asInstanceOf[T])
    } else {
-      eval(deserialize(bufferObject))
+      eval(deserialize(bufferObject.asInstanceOf[Array[Byte]]))
    }
  }
 
-  private def field(input: InternalRow, offset: Int): AnyRef = {
-    input.get(offset, null)
+  private def field[U](input: InternalRow, fieldIndex: Int): U = {
+    input.get(fieldIndex, null).asInstanceOf[U]
  }
 
   final override lazy val aggBufferAttributes: Seq[AttributeReference] = {
     // Underlying storage type for the aggregation buffer object
-    Seq(AttributeReference("buf", aggregationBufferStorageFormatSqlType)())
+    Seq(AttributeReference("buf", BinaryType)())
  }
 
   final override lazy val inputAggBufferAttributes: Seq[AttributeReference] =
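With the storage format fixed to Array[Byte], the single buffer attribute is always BinaryType, so the per-type mapping that aggregationBufferStorageFormatSqlType used to provide is no longer needed. As a rough illustration of what the revised contract asks of an implementor (the names below are hypothetical, not from this commit), a buffer holding a running Long could be encoded with java.nio.ByteBuffer:

import java.nio.ByteBuffer

object SerializationSketch {
  // Hypothetical aggregation buffer object, used only for illustration.
  class LongSumBuffer(var value: Long)

  // serialize: buffer object -> Array[Byte] (what ends up in the BinaryType buffer column)
  def serialize(buffer: LongSumBuffer): Array[Byte] =
    ByteBuffer.allocate(java.lang.Long.BYTES).putLong(buffer.value).array()

  // deserialize: Array[Byte] -> buffer object (what merge() and eval() work on)
  def deserialize(storageFormat: Array[Byte]): LongSumBuffer =
    new LongSumBuffer(ByteBuffer.wrap(storageFormat).getLong)
}

The TypedMax aggregate in the test suite below does the same for an Int via Guava's Ints.toByteArray and Ints.fromByteArray.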

sql/core/src/test/scala/org/apache/spark/sql/TypedImperativeAggregateSuite.scala

Lines changed: 90 additions & 29 deletions
@@ -17,21 +17,27 @@
 
 package org.apache.spark.sql
 
+import com.google.common.primitives.Ints
+
 import org.apache.spark.sql.TypedImperativeAggregateSuite.TypedMax
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, UnsafeRow}
-import org.apache.spark.sql.catalyst.expressions.aggregate.{TypedImperativeAggregate}
+import org.apache.spark.sql.catalyst.expressions.{BoundReference, Expression, GenericMutableRow, SpecificMutableRow, UnsafeRow}
+import org.apache.spark.sql.catalyst.expressions.aggregate.TypedImperativeAggregate
 import org.apache.spark.sql.execution.aggregate.SortAggregateExec
+import org.apache.spark.sql.expressions.Window
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.test.SharedSQLContext
-import org.apache.spark.sql.types.{AbstractDataType, DataType, IntegerType}
+import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, IntegerType, LongType}
 
 class TypedImperativeAggregateSuite extends QueryTest with SharedSQLContext {
 
   import testImplicits._
 
-  private val data = Seq((1, 0), (3, 1), (2, 0), (6, 3), (3, 1), (4, 1), (5, 0))
+  private val random = new java.util.Random()
 
+  private val data = (0 until 1000).map { _ =>
+    (random.nextInt(10), random.nextInt(100))
+  }
   test("aggregate with object aggregate buffer") {
     val agg = new TypedMax(BoundReference(0, IntegerType, nullable = false))
 
@@ -55,37 +61,66 @@ class TypedImperativeAggregateSuite extends QueryTest with SharedSQLContext {
 
     assert(mergeBuffer.value == data.map(_._1).max)
     assert(agg.eval(mergeBuffer) == data.map(_._1).max)
+
+    // Tests the low-level eval(row: InternalRow) API.
+    val array: Array[Any] = Array(mergeBuffer)
+    val row = new GenericMutableRow(array)
+
+    // Evaluates directly on a row consisting of the aggregation buffer object.
+    assert(agg.eval(row) == data.map(_._1).max)
+
+    // Serializes the aggregation buffer object and then evaluates.
+    agg.serializeAggregateBufferInPlace(row)
+    assert(agg.eval(row) == data.map(_._1).max)
+  }
+
+  test("supports SpecificMutableRow as mutable row") {
+    val aggregationBufferSchema = Seq(IntegerType, LongType, BinaryType, IntegerType)
+    val aggBufferOffset = 2
+    val inputBufferObject = 1
+    val buffer = new SpecificMutableRow(aggregationBufferSchema)
+    val agg = new TypedMax(BoundReference(inputBufferObject, IntegerType, nullable = false))
+      .withNewMutableAggBufferOffset(aggBufferOffset)
+      .withNewInputAggBufferOffset(inputBufferObject)
+
+    agg.initialize(buffer)
+    data.foreach { kv =>
+      val input = InternalRow(kv._1, kv._2)
+      agg.update(buffer, input)
+    }
+    assert(agg.eval(buffer) == data.map(_._2).max)
   }
 
   test("dataframe aggregate with object aggregate buffer, should not use HashAggregate") {
     val df = data.toDF("a", "b")
     val max = new TypedMax($"a".expr)
 
-    // Always use SortAggregateExec instead of HashAggregateExec for planning even if the aggregate
-    // buffer attributes are mutable fields (every field can be mutated inline like int, long...)
-    val allFieldsMutable = max.aggBufferSchema.map(_.dataType).forall(UnsafeRow.isMutable)
+    // Always uses SortAggregateExec
     val sparkPlan = df.select(Column(max.toAggregateExpression())).queryExecution.sparkPlan
-    assert(allFieldsMutable == true && sparkPlan.isInstanceOf[SortAggregateExec])
+    assert(sparkPlan.isInstanceOf[SortAggregateExec])
   }
 
   test("dataframe aggregate with object aggregate buffer, no group by") {
-    val df = data.toDF("a", "b").coalesce(2)
-    checkAnswer(
-      df.select(typedMax($"a"), count($"a"), typedMax($"b"), count($"b")),
-      Seq(Row(6, 7, 3, 7))
-    )
+    val df = data.toDF("key", "value").coalesce(2)
+    val query = df.select(typedMax($"key"), count($"key"), typedMax($"value"), count($"value"))
+    val maxKey = data.map(_._1).max
+    val countKey = data.size
+    val maxValue = data.map(_._2).max
+    val countValue = data.size
+    val expected = Seq(Row(maxKey, countKey, maxValue, countValue))
+    checkAnswer(query, expected)
   }
 
   test("dataframe aggregate with object aggregate buffer, with group by") {
-    val df = data.toDF("a", "b").coalesce(2)
-    checkAnswer(
-      df.groupBy($"b").agg(typedMax($"a"), count($"a"), typedMax($"a")),
-      Seq(
-        Row(0, 5, 3, 5),
-        Row(1, 4, 3, 4),
-        Row(3, 6, 1, 6)
-      )
-    )
+    val df = data.toDF("value", "key").coalesce(2)
+    val query = df.groupBy($"key").agg(typedMax($"value"), count($"value"), typedMax($"value"))
+    val expected = data.groupBy(_._2).toSeq.map { group =>
+      val (key, values) = group
+      val valueMax = values.map(_._1).max
+      val countValue = values.size
+      Row(key, valueMax, countValue, valueMax)
+    }
+    checkAnswer(query, expected)
   }
 
   test("dataframe aggregate with object aggregate buffer, empty inputs, no group by") {
@@ -102,6 +137,36 @@ class TypedImperativeAggregateSuite extends QueryTest with SharedSQLContext {
       Seq.empty[Row])
   }
 
+  test("TypedImperativeAggregate should not break Window function") {
+    val df = data.toDF("key", "value")
+    // OVER (PARTITION BY key ORDER BY value ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+    val w = Window.orderBy("value").partitionBy("key").rowsBetween(Long.MinValue, 0)
+
+    val query = df.select(sum($"key").over(w), typedMax($"key").over(w), sum($"value").over(w),
+      typedMax($"value").over(w))
+
+    val expected = data.groupBy(_._1).toSeq.flatMap { group =>
+      val (key, values) = group
+      val sortedValues = values.map(_._2).sorted
+
+      var outputRows = Seq.empty[Row]
+      var i = 0
+      while (i < sortedValues.size) {
+        val unboundedPrecedingAndCurrent = sortedValues.slice(0, i + 1)
+        val sumKey = key * unboundedPrecedingAndCurrent.size
+        val maxKey = key
+        val sumValue = unboundedPrecedingAndCurrent.sum
+        val maxValue = unboundedPrecedingAndCurrent.max
+
+        outputRows :+= Row(sumKey, maxKey, sumValue, maxValue)
+        i += 1
+      }
+
+      outputRows
+    }
+    checkAnswer(query, expected)
+  }
+
   private def typedMax(column: Column): Column = {
     val max = TypedMax(column.expr)
     Column(max.toAggregateExpression())
@@ -159,14 +224,10 @@ object TypedImperativeAggregateSuite {
 
     override def aggregationBufferClass: Class[MaxValue] = classOf[MaxValue]
 
-    override def serialize(buffer: MaxValue): Any = buffer.value
+    override def serialize(buffer: MaxValue): Array[Byte] = Ints.toByteArray(buffer.value)
 
-    override def aggregationBufferStorageFormatSqlType: DataType = IntegerType
-
-    override def deserialize(storageFormat: Any): MaxValue = {
-      storageFormat match {
-        case i: Int => new MaxValue(i)
-      }
+    override def deserialize(storageFormat: Array[Byte]): MaxValue = {
+      new MaxValue(Ints.fromByteArray(storageFormat))
     }
   }
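A quick round-trip check (illustrative only, not part of the commit) of the property the new serialize/deserialize pair relies on: deserialization must invert serialization so that merge() and eval() see the same buffer value that was written out.

import com.google.common.primitives.Ints

val bytes: Array[Byte] = Ints.toByteArray(42)  // what TypedMax.serialize produces for MaxValue(42)
assert(Ints.fromByteArray(bytes) == 42)        // what TypedMax.deserialize recovers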
