Move to Native Spark UDAFs for window processing.
hvanhovell committed Aug 10, 2015
commit 84401e79d6567004771df6340e40c8e9807d78ec
@@ -76,6 +76,7 @@ class Analyzer(
ResolveGenerate ::
ResolveFunctions ::
ResolveAliases ::
+ ResolveWindowFrame ::
ExtractWindowExpressions ::
GlobalAggregates ::
UnresolvedHavingClauseAttributes ::
@@ -557,11 +558,18 @@ class Analyzer(
}

def containsAggregates(exprs: Seq[Expression]): Boolean = {
- exprs.foreach(_.foreach {
-   case agg: AggregateExpression => return true
-   case _ =>
- })
- false
+ // Collect all windowed aggregate expressions.
+ val blacklist = exprs.flatMap { expr =>
+   expr.collect {
+     case WindowExpression(ae: AggregateExpression, _) => ae
+   }
+ }.toSet
+
+ // Find the first aggregate expression that is not windowed.
+ exprs.exists(_.collectFirst {
+   case ae: AggregateExpression if !blacklist.contains(ae) => ae
+ }.isDefined)
}
}
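
The new implementation avoids the non-local `return` and, more importantly, no longer counts aggregates that appear only inside a window expression. A minimal, self-contained sketch of the same two-pass idea — the `Expr`/`Agg`/`Windowed` classes below are toy stand-ins for Catalyst expressions, not the real ones:

// Toy stand-ins for Catalyst expressions; illustration only.
sealed trait Expr {
  def children: Seq[Expr]
  // Pre-order collect, like TreeNode.collect.
  def collect[B](pf: PartialFunction[Expr, B]): Seq[B] =
    (if (pf.isDefinedAt(this)) Seq(pf(this)) else Nil) ++ children.flatMap(_.collect(pf))
}
case class Agg(name: String) extends Expr { val children: Seq[Expr] = Seq.empty }
case class Windowed(agg: Agg) extends Expr { val children: Seq[Expr] = Seq(agg) }
case class Minus(l: Expr, r: Expr) extends Expr { val children: Seq[Expr] = Seq(l, r) }

def containsAggregates(exprs: Seq[Expr]): Boolean = {
  // Pass 1: every aggregate wrapped in a window goes on the blacklist.
  val blacklist = exprs.flatMap(_.collect { case Windowed(a) => a }).toSet
  // Pass 2: report true only if some aggregate occurs outside that set.
  exprs.exists(_.collect { case a: Agg if !blacklist.contains(a) => a }.nonEmpty)
}

// SUM(x) - SUM(y) OVER (...): SUM(y) is windowed, SUM(x) is not, so this
// still counts as an aggregate query.
assert(containsAggregates(Seq(Minus(Agg("sum_x"), Windowed(Agg("sum_y"))))))
// A query with only windowed aggregates does not.
assert(!containsAggregates(Seq(Windowed(Agg("sum_y")))))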

@@ -763,26 +771,38 @@ class Analyzer(

// Now, we extract regular expressions from expressionsWithWindowFunctions
// by using extractExpr.
+ val seenWindowAggregates = new ArrayBuffer[AggregateExpression]
val newExpressionsWithWindowFunctions = expressionsWithWindowFunctions.map {
_.transform {
// Extracts children expressions of a WindowFunction (input parameters of
// a WindowFunction).
case wf: WindowFunction =>
- val newChildren = wf.children.map(extractExpr(_))
+ val newChildren = wf.children.map(extractExpr)
wf.withNewChildren(newChildren)

+ case wf: WindowFunction2 =>
+ val newChildren = wf.children.map(extractExpr)
+ wf.withNewChildren(newChildren)
+
// Extracts expressions from the partition spec and order spec.
case wsc @ WindowSpecDefinition(partitionSpec, orderSpec, _) =>
- val newPartitionSpec = partitionSpec.map(extractExpr(_))
+ val newPartitionSpec = partitionSpec.map(extractExpr)
val newOrderSpec = orderSpec.map { so =>
val newChild = extractExpr(so.child)
so.copy(child = newChild)
}
wsc.copy(partitionSpec = newPartitionSpec, orderSpec = newOrderSpec)

+ // Extracts windowed AggregateExpressions.
+ case we @ WindowExpression(agg: AggregateExpression, spec: WindowSpecDefinition) =>
+ val newAggChildren = agg.children.map(extractExpr)
+ val newAgg = agg.withNewChildren(newAggChildren)
+ seenWindowAggregates += newAgg
+ WindowExpression(newAgg, spec)
+
// Extracts AggregateExpression. For example, for SUM(x) - SUM(y) OVER (...),
// we need to extract SUM(x).
- case agg: AggregateExpression =>
+ case agg: AggregateExpression if !seenWindowAggregates.contains(agg) =>
val withName = Alias(agg, s"_w${extractedExprBuffer.length}")()
extractedExprBuffer += withName
withName.toAttribute
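
The `seenWindowAggregates` bookkeeping is needed because `transform` rewrites top-down: after the `WindowExpression` case produces `WindowExpression(newAgg, spec)`, the rule is applied to the rewritten children as well, so `newAgg` would otherwise match the bare `AggregateExpression` case and be extracted out of its window. A sketch of that revisiting behaviour, reusing the toy `Expr` classes from the sketch above (illustration only, not Catalyst's `transform`):

// A toy top-down transform: apply the rule to a node, then recurse into the
// children of the *result* - which is exactly why a just-rewritten aggregate
// is seen again.
def transformDown(e: Expr)(rule: PartialFunction[Expr, Expr]): Expr = {
  val rewritten = if (rule.isDefinedAt(e)) rule(e) else e
  rewritten match {
    case Minus(l, r) => Minus(transformDown(l)(rule), transformDown(r)(rule))
    case Windowed(a) => Windowed(transformDown(a)(rule).asInstanceOf[Agg])
    case leaf        => leaf
  }
}

var seen = List.empty[Expr]
transformDown(Windowed(Agg("sum_y"))) { case a: Agg => seen ::= a; a }
// seen == List(Agg("sum_y")): the aggregate inside the window was still
// visited, so without a guard it would be extracted a second time.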
@@ -957,6 +977,85 @@ class Analyzer(
Project(p.output, newPlan.withNewChildren(newChild :: Nil))
}
}

/**
 * Removes ordering expressions that still need evaluation from a [[Sort]], using an inner
 * Project to materialize them; an outer Project then projects them away again, keeping the
 * result unchanged. This ensures we only ever sort by [[AttributeReference]]s.
 *
 * As an example,
 * {{{
 *   Sort('a, 'b + 1,
 *     Relation('a, 'b))
 * }}}
 * will be turned into:
 * {{{
 *   Project('a, 'b,
 *     Sort('a, '_sortCondition,
 *       Project('a, 'b, ('b + 1).as("_sortCondition"),
 *         Relation('a, 'b))))
 * }}}
 */
object RemoveEvaluationFromSort extends Rule[LogicalPlan] {
private def hasAlias(expr: Expression) = {
expr.find {
case _: Alias => true
case _ => false
}.isDefined
}

override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
// The ordering expressions have no effect on the output schema of `Sort`,
// so `Alias`es in ordering expressions are unnecessary and should be removed.
case s @ Sort(ordering, _, _) if ordering.exists(hasAlias) =>
val newOrdering = ordering.map(_.transformUp {
case Alias(child, _) => child
}.asInstanceOf[SortOrder])
s.copy(order = newOrdering)

case s @ Sort(ordering, global, child)
if s.expressions.forall(_.resolved) && s.childrenResolved && !s.hasNoEvaluation =>

val (ref, needEval) = ordering.partition(_.child.isInstanceOf[AttributeReference])

val namedExpr = needEval.map(_.child match {
case n: NamedExpression => n
case e => Alias(e, "_sortCondition")()
})

val newOrdering = ref ++ needEval.zip(namedExpr).map { case (order, ne) =>
order.copy(child = ne.toAttribute)
}

// Materialize the ordering expressions that still need evaluation in an inner
// Project, and project them away again after the Sort.
Project(child.output,
Sort(newOrdering, global,
Project(child.output ++ namedExpr, child)))
}
}
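
Note that the guard above relies on a `hasNoEvaluation` predicate on `Sort` that is not part of this hunk; presumably it checks whether every ordering expression is already a bare attribute, roughly:

// Hypothetical shape of Sort.hasNoEvaluation (assumed, not shown in this diff):
// true when nothing in the ordering still needs evaluation, so the rule can skip it.
def hasNoEvaluation: Boolean = order.forall(_.child.isInstanceOf[AttributeReference])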

/**
 * Checks and adds proper window frames for all window functions.
 */
object ResolveWindowFrame extends Rule[LogicalPlan] {
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case logical: LogicalPlan => logical.transformExpressionsDown {
case WindowExpression(wf: WindowFunction2,
WindowSpecDefinition(_, _, f: SpecifiedWindowFrame))
if wf.frame != UnspecifiedFrame && wf.frame != f =>
failAnalysis(s"The frame of the window '$f' does not match the required frame " +
s"'${wf.frame}'")
case WindowExpression(wf: WindowFunction2,
s @ WindowSpecDefinition(_, o, UnspecifiedFrame))
if wf.frame != UnspecifiedFrame =>
WindowExpression(wf, s.copy(frameSpecification = wf.frame))
case we @ WindowExpression(e, s @ WindowSpecDefinition(_, o, UnspecifiedFrame)) =>
val frame = SpecifiedWindowFrame.defaultWindowFrame(o.nonEmpty, false)
we.copy(windowSpec = s.copy(frameSpecification = frame))
}
}
}
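
For reference, the default chosen by `SpecifiedWindowFrame.defaultWindowFrame` should match the SQL-standard behaviour: with an ORDER BY but no explicit frame, the frame runs from the start of the partition up to the current row; without an ORDER BY, it covers the whole partition. A sketch of that rule (assumed semantics, not the actual implementation):

// SQL-standard default frames (a sketch of the assumed semantics):
def defaultFrame(hasOrderSpec: Boolean): String =
  if (hasOrderSpec) "RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW"
  else "ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING"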
}

/**
@@ -250,7 +250,17 @@ object FunctionRegistry {
expression[Sha1]("sha1"),
expression[Sha2]("sha2"),
expression[SparkPartitionID]("spark_partition_id"),
- expression[InputFileName]("input_file_name")
+ expression[InputFileName]("input_file_name"),
+
+ // window functions
+ expression[Lead]("lead"),
+ expression[Lag]("lag"),
+ expression[RowNumber]("row_number"),
+ expression[CumeDist]("cume_dist"),
+ expression[NTile]("ntile"),
+ expression[Rank]("rank"),
+ expression[DenseRank]("dense_rank"),
+ expression[PercentRank]("percent_rank")
)
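
With these registrations, window functions resolve through Spark's own function registry instead of requiring Hive. A typical call, assuming a hypothetical table `t` registered on a 1.5-era `SQLContext`:

// Hypothetical usage; `t` and `sqlContext` are assumed to exist.
val ranked = sqlContext.sql(
  "SELECT a, b, rank() OVER (PARTITION BY a ORDER BY b) AS r FROM t")
ranked.show()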

val builtin: FunctionRegistry = {
@@ -0,0 +1,108 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.collection.OpenHashSet


/** Reduce a set using an algebraic expression. */
case class ReduceSetAlgebraic(left: Expression, right: AlgebraicAggregate)
extends BinaryExpression with CodegenFallback {

override def dataType: DataType = right.dataType

private[this] val single = right.children.size == 1
private[this] val singleValueOrdinal = right.bufferSchema.length

// This might be taking reuse too far...
@transient private[this] lazy val buffer = {
val singleSize = if (single) 1 else 0
new GenericMutableRow(singleValueOrdinal + singleSize)
}

@transient private[this] lazy val initial =
InterpretedMutableProjection(right.initialValues).target(buffer)

@transient private[this] lazy val update = {
val schema = right.bufferAttributes ++ right.children.map { child =>
AttributeReference("child", child.dataType, child.nullable)()
}
new InterpretedMutableProjection(right.updateExpressions, schema).target(buffer)
}

@transient private[this] lazy val evaluate =
BindReferences.bindReference(right.evaluateExpression, right.bufferSchema.toAttributes)

@transient private[this] lazy val joinRow = new JoinedRow4

override def eval(input: InternalRow): Any = {
val result = left.eval(input).asInstanceOf[OpenHashSet[Any]]
if (result != null) {
initial(EmptyRow)
val iterator = result.iterator
// Avoid a branch inside the loop.
if (single) {
while (iterator.hasNext) {
buffer.update(singleValueOrdinal, iterator.next())
update(buffer)
}
} else {
while (iterator.hasNext) {
joinRow(buffer, iterator.next().asInstanceOf[InternalRow])
update(joinRow)
}
}
evaluate.eval(buffer)
} else null
}
}
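
The single-input fast path above works by making the aggregation buffer one slot wider than `bufferSchema` and writing each set element into that extra slot, so the interpreted update projection reads its input right next to the buffer fields. A plain-Scala sketch of that layout trick (an average over a set of doubles; illustration only):

// The buffer and the current input value share one mutable row.
val bufferFields = 2                        // e.g. sum and count for an average
val singleValueOrdinal = bufferFields       // the input lives in the last slot
val row = new Array[Any](bufferFields + 1)

row(0) = 0.0                                // initialValues
row(1) = 0L
for (v <- Set(1.0, 2.0, 4.0)) {             // iterate the OpenHashSet analogue
  row(singleValueOrdinal) = v
  row(0) = row(0).asInstanceOf[Double] + v  // updateExpressions read row(2)
  row(1) = row(1).asInstanceOf[Long] + 1L
}
val avg = row(0).asInstanceOf[Double] / row(1).asInstanceOf[Long]  // evaluateExpression
// avg == 7.0 / 3
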
/** Reduce a set using an AggregateFunction2. */
case class ReduceSetAggregate(left: Expression, right: AggregateFunction2)
extends BinaryExpression with CodegenFallback {

// The reducer evaluates the function over its own dedicated buffer, so the
// function's buffer offset starts at 0.
right.bufferOffset = 0

override def dataType: DataType = right.dataType

private[this] val single = right.children.size == 1
@transient private[this] lazy val buffer = new GenericMutableRow(right.bufferSchema.size)
@transient private[this] lazy val singleValueInput = new GenericMutableRow(1)

override def eval(input: InternalRow): Any = {
val result = left.eval(input).asInstanceOf[OpenHashSet[Any]]
if (result != null) {
right.initialize(buffer)
val iterator = result.iterator
if (single) {
while (iterator.hasNext) {
singleValueInput.update(0, iterator.next())
right.update(buffer, singleValueInput)
}
} else {
while (iterator.hasNext) {
right.update(buffer, iterator.next().asInstanceOf[InternalRow])
}
}
right.eval(buffer)
} else null
}
}
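
The two reducers mirror the two UDAF styles: ReduceSetAlgebraic drives an expression-based AlgebraicAggregate through interpreted projections, while ReduceSetAggregate calls the imperative initialize/update/eval contract directly. A toy version of that imperative contract (toy trait, not the real AggregateFunction2):

// A stripped-down imperative aggregate contract, mirroring initialize/update/eval.
trait ImperativeAgg {
  def initialize(buf: Array[Any]): Unit
  def update(buf: Array[Any], input: Any): Unit
  def eval(buf: Array[Any]): Any
}

object CountAgg extends ImperativeAgg {
  def initialize(buf: Array[Any]): Unit = buf(0) = 0L
  def update(buf: Array[Any], input: Any): Unit =
    buf(0) = buf(0).asInstanceOf[Long] + 1L
  def eval(buf: Array[Any]): Any = buf(0)
}

// Reduce a materialized set with the aggregate, as ReduceSetAggregate does
// with an OpenHashSet.
def reduceSet(values: Iterator[Any], agg: ImperativeAgg): Any = {
  val buf = new Array[Any](1)
  agg.initialize(buf)
  values.foreach(agg.update(buf, _))
  agg.eval(buf)
}

reduceSet(Iterator("a", "b", "c"), CountAgg)  // 3L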