Add a superclass for *AggregateExec
maropu committed Aug 22, 2016
commit e37ef6afd47e9dd325a7f9e6d0826a3cb66c8e2e
@@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.aggregate
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
+import org.apache.spark.sql.execution.aggregate.{Aggregate => AggregateExec}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.streaming.{StateStoreRestoreExec, StateStoreSaveExec}
 
@@ -27,20 +28,11 @@ import org.apache.spark.sql.execution.streaming.{StateStoreRestoreExec, StateStoreSaveExec}
  */
 object AggUtils {
 
-  private[execution] def isAggregate(operator: SparkPlan): Boolean = {
-    operator.isInstanceOf[HashAggregateExec] || operator.isInstanceOf[SortAggregateExec]
-  }
-
-  private[execution] def supportPartialAggregate(operator: SparkPlan): Boolean = {
-    assert(isAggregate(operator))
-    def supportPartial(exprs: Seq[AggregateExpression]) =
-      exprs.map(_.aggregateFunction).forall(_.supportsPartial)
-    operator match {
-      case agg @ HashAggregateExec(_, _, aggregateExpressions, _, _, _, _) =>
-        supportPartial(aggregateExpressions)
-      case agg @ SortAggregateExec(_, _, aggregateExpressions, _, _, _, _) =>
-        supportPartial(aggregateExpressions)
-    }
+  private[execution] def supportPartialAggregate(operator: SparkPlan): Boolean = operator match {
+    case agg: AggregateExec =>
+      agg.aggregateExpressions.map(_.aggregateFunction).forall(_.supportsPartial)
+    case _ =>
+      false
   }
 
   private def createPartialAggregateExec(
@@ -86,23 +78,18 @@ object AggUtils {
 
   private[execution] def createPartialAggregate(operator: SparkPlan)
Contributor: A lot of duplication here. It would be nice if we had a parent for the *AggregateExec nodes.

Member Author: How about this change?

Contributor: Much better.

Contributor: Could we make this public instead of private[execution]? We just opened up a lot of similar APIs.

Contributor: Small: the name of the function is also quite misleading. It returns a map-side and merge aggregate pair, so createMapMergeAggregatePair? Please also add a little bit of documentation.

Member Author: Fixed.
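
To make the suggestion concrete, here is a minimal sketch of the proposed rename plus documentation. The name createMapMergeAggregatePair comes from the review; the doc wording is illustrative, and the body simply delegates to the existing method so the sketch stays self-contained:

  /**
   * Splits an aggregate operator into a pair of (merge-side aggregate, map-side aggregate).
   * The merge aggregate is not a complete plan on its own: callers such as EnsureRequirements
   * are expected to insert a shuffle between the two operators.
   */
  private[execution] def createMapMergeAggregatePair(operator: SparkPlan): (SparkPlan, SparkPlan) =
    createPartialAggregate(operator)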

     : (SparkPlan, SparkPlan) = operator match {
-    case agg @ HashAggregateExec(_, groupingExpressions, aggregateExpressions, _, _, _, child) =>
-      val mapSideAgg = createPartialAggregateExec(
-        groupingExpressions, aggregateExpressions, child)
-      val mergeAgg = agg.copy(
-        groupingExpressions = groupingExpressions.map(_.toAttribute),
-        aggregateExpressions = updateMergeAggregateMode(aggregateExpressions),
-        initialInputBufferOffset = groupingExpressions.length)
-
-      (mergeAgg, mapSideAgg)
-
-    case agg @ SortAggregateExec(_, groupingExpressions, aggregateExpressions, _, _, _, child) =>
+    case agg: Aggregate =>
       val mapSideAgg = createPartialAggregateExec(
-        groupingExpressions, aggregateExpressions, child)
-      val mergeAgg = agg.copy(
-        groupingExpressions = groupingExpressions.map(_.toAttribute),
-        aggregateExpressions = updateMergeAggregateMode(aggregateExpressions),
-        initialInputBufferOffset = groupingExpressions.length)
+        agg.groupingExpressions, agg.aggregateExpressions, agg.child)
+      val mergeAgg = createAggregateExec(
+        requiredChildDistributionExpressions = agg.requiredChildDistributionExpressions,
+        groupingExpressions = agg.groupingExpressions.map(_.toAttribute),
+        aggregateExpressions = updateMergeAggregateMode(agg.aggregateExpressions),
+        aggregateAttributes = agg.aggregateAttributes,
+        initialInputBufferOffset = agg.groupingExpressions.length,
+        resultExpressions = agg.resultExpressions,
+        child = agg.child
Contributor: mapSideAgg?

Member Author: In fact, the final plan is [MergeAgg] <- [Shuffle] <- [MapSideAgg]. So this function just returns the two aggregations separately, and the plan is built in EnsureRequirements. Is this a bad idea?

Contributor: It violates the principle of least surprise. The mergeAgg is not usable without the mapSideAgg. This is fine for usage in EnsureRequirements because it gets straightened out anyway, but it can be very surprising if someone uses it in a different way.

Member Author (@maropu, Aug 25, 2016): I fixed it. Is this fix okay?

+      )
 
       (mergeAgg, mapSideAgg)
   }
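
Given the discussion above, a hedged caller-side sketch of how the returned pair is meant to be assembled (it mirrors the EnsureRequirements change below; requiredDistribution stands in for the distribution the caller computed, and createPartitioning/defaultNumPreShufflePartitions are helpers visible in that rule):

  // Sketch only: the merge aggregate becomes executable once a shuffle over the
  // map-side aggregate is attached as its child: [MergeAgg] <- [Shuffle] <- [MapSideAgg].
  val (mergeAgg, mapSideAgg) = AggUtils.createPartialAggregate(operator)
  val shuffle = ShuffleExchange(
    createPartitioning(requiredDistribution, defaultNumPreShufflePartitions), mapSideAgg)
  val assembled = mergeAgg.withNewChildren(shuffle :: Nil)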
@@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.aggregate

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.execution.SparkPlan

/**
 * A base trait for aggregate implementations.
 */
trait Aggregate {
Contributor: Why a trait and not a superclass?

Member Author: I just used a trait, in line with the HashJoin trait. Is a superclass better?

Contributor: Well, I think a superclass makes a bit more sense. A trait to me is a way to bolt on functionality. The Aggregate contains core functionality for both the hash- and sort-based versions, and is the natural parent class of both.

I do have to admit that this is more a personal preference.

  self: SparkPlan =>

  val requiredChildDistributionExpressions: Option[Seq[Expression]]
  val groupingExpressions: Seq[NamedExpression]
  val aggregateExpressions: Seq[AggregateExpression]
  val aggregateAttributes: Seq[Attribute]
  val initialInputBufferOffset: Int
  val resultExpressions: Seq[NamedExpression]
  val child: SparkPlan

  protected[this] val aggregateBufferAttributes = {
    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
  }

  override def producedAttributes: AttributeSet =
    AttributeSet(aggregateAttributes) ++
    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
    AttributeSet(aggregateBufferAttributes)

  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)

  override def requiredChildDistribution: List[Distribution] = {
    requiredChildDistributionExpressions match {
      case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
      case Some(exprs) if exprs.nonEmpty => ClusteredDistribution(exprs) :: Nil
      case None => UnspecifiedDistribution :: Nil
    }
  }
}
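
For comparison with the trait-versus-superclass discussion above, a minimal sketch of the shape the reviewer prefers (hypothetical, not part of this commit; the name AggregateExecBase is invented here, and child would come from UnaryExecNode):

  // The same contract as the Aggregate trait, expressed as an abstract parent
  // node instead of a mixin. Case-class fields of subclasses implement the defs.
  abstract class AggregateExecBase extends UnaryExecNode {
    def requiredChildDistributionExpressions: Option[Seq[Expression]]
    def groupingExpressions: Seq[NamedExpression]
    def aggregateExpressions: Seq[AggregateExpression]
    def aggregateAttributes: Seq[Attribute]
    def initialInputBufferOffset: Int
    def resultExpressions: Seq[NamedExpression]

    protected lazy val aggregateBufferAttributes: Seq[Attribute] =
      aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)

    override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
  }

  // Subclasses would then extend it directly, e.g.:
  //   case class HashAggregateExec(...) extends AggregateExecBase with CodegenSupport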
@@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
 import org.apache.spark.sql.catalyst.expressions.codegen._
-import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.types.{DecimalType, StringType, StructType}
@@ -42,11 +41,7 @@ case class HashAggregateExec(
     initialInputBufferOffset: Int,
     resultExpressions: Seq[NamedExpression],
     child: SparkPlan)
-  extends UnaryExecNode with CodegenSupport {
-
-  private[this] val aggregateBufferAttributes = {
-    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
-  }
+  extends UnaryExecNode with Aggregate with CodegenSupport {
 
   require(HashAggregateExec.supportsAggregate(aggregateBufferAttributes))
 
@@ -60,21 +55,6 @@ case class HashAggregateExec(
     "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"),
     "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "aggregate time"))
 
-  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
-
-  override def producedAttributes: AttributeSet =
-    AttributeSet(aggregateAttributes) ++
-    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
-    AttributeSet(aggregateBufferAttributes)
-
-  override def requiredChildDistribution: List[Distribution] = {
-    requiredChildDistributionExpressions match {
-      case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
-      case Some(exprs) if exprs.nonEmpty => ClusteredDistribution(exprs) :: Nil
-      case None => UnspecifiedDistribution :: Nil
-    }
-  }
 
   // This is for testing. We force TungstenAggregationIterator to fall back to the unsafe row hash
   // map and/or the sort-based aggregation once it has processed a given number of input rows.
   private val testFallbackStartsAt: Option[(Int, Int)] = {
@@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.errors._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
-import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, ClusteredDistribution, Distribution, UnspecifiedDistribution}
 import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.util.Utils
@@ -38,30 +37,11 @@ case class SortAggregateExec(
     initialInputBufferOffset: Int,
     resultExpressions: Seq[NamedExpression],
     child: SparkPlan)
-  extends UnaryExecNode {
-
-  private[this] val aggregateBufferAttributes = {
-    aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)
-  }
-
-  override def producedAttributes: AttributeSet =
-    AttributeSet(aggregateAttributes) ++
-    AttributeSet(resultExpressions.diff(groupingExpressions).map(_.toAttribute)) ++
-    AttributeSet(aggregateBufferAttributes)
+  extends UnaryExecNode with Aggregate {
 
   override lazy val metrics = Map(
     "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))
 
-  override def output: Seq[Attribute] = resultExpressions.map(_.toAttribute)
-
-  override def requiredChildDistribution: List[Distribution] = {
-    requiredChildDistributionExpressions match {
-      case Some(exprs) if exprs.isEmpty => AllTuples :: Nil
-      case Some(exprs) if exprs.nonEmpty => ClusteredDistribution(exprs) :: Nil
-      case None => UnspecifiedDistribution :: Nil
-    }
-  }
-
   override def requiredChildOrdering: Seq[Seq[SortOrder]] = {
     groupingExpressions.map(SortOrder(_, Ascending)) :: Nil
   }
@@ -21,7 +21,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.physical._
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.aggregate.AggUtils
+import org.apache.spark.sql.execution.aggregate.{Aggregate, AggUtils}
 import org.apache.spark.sql.internal.SQLConf
 
 /**
@@ -155,36 +155,28 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
     assert(requiredChildDistributions.length == operator.children.length)
     assert(requiredChildOrderings.length == operator.children.length)
 
-    // Ensure that the operator's children satisfy their output distribution requirements:
-    val childrenWithDist = operator.children.zip(requiredChildDistributions)
-
     def createShuffleExchange(dist: Distribution, child: SparkPlan) =
       ShuffleExchange(createPartitioning(dist, defaultNumPreShufflePartitions), child)
 
-    var (parent, children) = if (!AggUtils.isAggregate(operator)) {
-      val newChildren = childrenWithDist.map {
-        case (child, distribution) if child.outputPartitioning.satisfies(distribution) =>
-          child
-        case (child, BroadcastDistribution(mode)) =>
-          BroadcastExchangeExec(mode, child)
-        case (child, distribution) =>
-          createShuffleExchange(distribution, child)
-      }
-      (operator, newChildren)
-    } else {
-      val (child, distribution) = childrenWithDist.head
-      if (!child.outputPartitioning.satisfies(distribution)) {
-        if (AggUtils.supportPartialAggregate(operator)) {
-          // If an aggregation needs a shuffle and supports partial aggregation, a map-side
-          // partial aggregation and a shuffle are added as children.
-          val (mergeAgg, mapSideAgg) = AggUtils.createPartialAggregate(operator)
-          (mergeAgg, createShuffleExchange(distribution, mapSideAgg) :: Nil)
-        } else {
-          (operator, createShuffleExchange(distribution, child) :: Nil)
+    var (parent, children) = operator match {
+      case agg if AggUtils.supportPartialAggregate(agg) &&
+          !operator.outputPartitioning.satisfies(requiredChildDistributions.head) =>
+        // If an aggregation needs a shuffle and supports partial aggregation, a map-side
+        // partial aggregation and a shuffle are added as children.
+        val (mergeAgg, mapSideAgg) = AggUtils.createPartialAggregate(operator)
+        (mergeAgg, createShuffleExchange(requiredChildDistributions.head, mapSideAgg) :: Nil)
+      case _ =>
+        // Ensure that the operator's children satisfy their output distribution requirements:
+        val childrenWithDist = operator.children.zip(requiredChildDistributions)
+        val newChildren = childrenWithDist.map {
+          case (child, distribution) if child.outputPartitioning.satisfies(distribution) =>
+            child
+          case (child, BroadcastDistribution(mode)) =>
+            BroadcastExchangeExec(mode, child)
+          case (child, distribution) =>
+            createShuffleExchange(distribution, child)
         }
-      } else {
-        (operator, child :: Nil)
-      }
+        (operator, newChildren)
     }
 
     // If the operator has multiple children and specifies child output distributions (e.g. join),
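
As a concrete illustration (not from the PR itself), for a partial-aggregatable query such as SELECT k, count(*) FROM t GROUP BY k whose child is not already partitioned on k, the rewritten rule now yields a plan of roughly this shape:

  HashAggregateExec(Final)               <- mergeAgg, returned as `parent`
  +- ShuffleExchange(hashpartitioning(k))
     +- HashAggregateExec(Partial)       <- mapSideAgg
        +- Scan t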