address comments

apache · davies · Feb 26, 2016 · Mar 4, 2016 · Mar 4, 2016 · Mar 8, 2016
commit 7df43ca78846966b0af8045b924d646d97505925
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -239,9 +239,9 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT
   override def innerChildren: Seq[PlanType] = subqueries
 
   /**
-   * Cleaned copy of this query plan.
+   * Canonicalized copy of this query plan.
    */
-  protected lazy val cleaned: PlanType = this
+  protected lazy val canonicalized: PlanType = this
 
   /**
    * Returns true when the given query plan will return the same results as this query plan.
@@ -257,8 +257,8 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanT
    * can do better should override this function.
    */
   def sameResult(plan: PlanType): Boolean = {
-    val cleanLeft = this.cleaned
-    val cleanRight = plan.cleaned
+    val cleanLeft = this.canonicalized
+    val cleanRight = plan.canonicalized
     cleanLeft.getClass == cleanRight.getClass &&
       cleanLeft.children.size == cleanRight.children.size &&
       cleanLeft.cleanArgs == cleanRight.cleanArgs &&

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -114,7 +114,7 @@ abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {
    */
   def childrenResolved: Boolean = children.forall(_.resolved)
 
-  override lazy val cleaned: LogicalPlan = EliminateSubqueryAliases(this)
+  override lazy val canonicalized: LogicalPlan = EliminateSubqueryAliases(this)
 
   /**
    * Optionally resolves the given strings to a [[NamedExpression]] using the input from all child

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala
@@ -35,6 +35,8 @@ class SparkPlanInfo(
     val metrics: Seq[SQLMetricInfo]) {
 
   override def hashCode(): Int = {
+    // Hashing simpleString alone should be good enough to tell apart the plans that appear
+    // within a single plan tree.
     simpleString.hashCode
   }
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/Exchange.scala
@@ -37,8 +37,9 @@ abstract class Exchange extends UnaryNode {
 }
 
 /**
- * A wrapper for reused exchange to have different output, which is required to resolve the
- * attributes in following plans.
+ * A wrapper that lets a reused exchange expose a different output. Two exchanges that produce
+ * logically identical output still have distinct sets of output attribute ids, so the original
+ * ids must be preserved because they are what downstream operators expect.
  */
 case class ReusedExchange(override val output: Seq[Attribute], child: Exchange) extends LeafNode {
 
@@ -73,15 +74,15 @@ private[sql] case class ReuseExchange(sqlContext: SQLContext) extends Rule[Spark
     val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
     plan.transformUp {
       case exchange: Exchange =>
+        // Exchanges with the same result usually also have the same schema (same column names).
         val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
         val samePlan = sameSchema.find { e =>
           exchange.sameResult(e)
         }
         if (samePlan.isDefined) {
           // Keep the output of this exchange, the following plans require that to resolve
           // attributes.
-          val reused = ReusedExchange(exchange.output, samePlan.get)
-          reused
+          ReusedExchange(exchange.output, samePlan.get)
         } else {
           sameSchema += exchange
           exchange

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/ShuffleExchange.scala
@@ -104,12 +104,12 @@ case class ShuffleExchange(
   /**
    * Caches the created ShuffleRowRDD so we can reuse that.
    */
-  private var shuffleRDD: ShuffledRowRDD = null
+  private var cachedShuffleRDD: ShuffledRowRDD = null
 
   protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") {
     // Returns the same ShuffleRowRDD if this plan is used by multiple plans.
-    if (shuffleRDD == null) {
-      shuffleRDD = coordinator match {
+    if (cachedShuffleRDD == null) {
+      cachedShuffleRDD = coordinator match {
         case Some(exchangeCoordinator) =>
           val shuffleRDD = exchangeCoordinator.postShuffleRDD(this)
           assert(shuffleRDD.partitions.length == newPartitioning.numPartitions)
@@ -119,7 +119,7 @@ case class ShuffleExchange(
           preparePostShuffleRDD(shuffleDependency)
       }
     }
-    shuffleRDD
+    cachedShuffleRDD
   }
 }
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala
@@ -104,8 +104,7 @@ private[sql] object SparkPlanGraph {
         } else {
           subgraph.nodes += node
         }
-        // ShuffleExchange or BroadcastExchange
-        if (name.endsWith("Exchange")) {
+        if (name == "ShuffleExchange" || name == "BroadcastExchange") {
           exchanges += planInfo -> node
         }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -1341,6 +1341,7 @@ class DataFrameSuite extends QueryTest with SharedSQLContext {
     val df = sqlContext.range(100)
     val agg1 = df.groupBy().count()
     val agg2 = df.groupBy().count()
+    // Two aggregates that differ only in the ExprIds within them should have the same result.
     agg1.queryExecution.executedPlan.sameResult(agg2.queryExecution.executedPlan)
   }