optimize for full outer join
chenghao-intel committed Jul 2, 2015
commit 491a89042c41058653e9b41297c14bb1da856f4d
@@ -50,8 +50,19 @@ case object AllTuples extends Distribution
* [[Expression Expressions]] will be co-located. Based on the context, this
* can mean such tuples are either co-located in the same partition or they will be contiguous
* within a single partition.
* There is also another constraint: a `clustering` value that contains null is
* considered valid only if `nullKeysSensitive` == true.
*
* For example:
* JOIN KEYS: values that contain null are considered invalid, which means
(Review thread on the line above)

Contributor: Does "values" here mean the original values of the table, or the intermediate values of the join? Is a null in the original data of a table also considered invalid?

Contributor Author: It should be the input value (either the original data from the table or an intermediate result, e.g. join output). Whether a null in the original table is valid depends on the semantics: in a join it should also be invalid, but it is valid for GROUP BY. Handling join keys that contain null would be a further optimization for repartitioning.

* the tuples could be in different partitions.
* GROUP BY KEYS: values that contain null are considered valid, which means
* the tuples should be in the same partition.
*/
-case class ClusteredDistribution(clustering: Seq[Expression]) extends Distribution {
+case class ClusteredDistribution(
+    clustering: Seq[Expression],
+    nullKeysSensitive: Boolean) extends Distribution {

require(
clustering != Nil,
"The clustering expressions of a ClusteredDistribution should not be Nil. " +
@@ -157,7 +168,7 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)

override def satisfies(required: Distribution): Boolean = required match {
case UnspecifiedDistribution => true
-case ClusteredDistribution(requiredClustering) =>
+case ClusteredDistribution(requiredClustering, false) =>
clusteringSet.subsetOf(requiredClustering.toSet)
case _ => false
}
@@ -201,7 +212,7 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int)
case OrderedDistribution(requiredOrdering) =>
val minSize = Seq(requiredOrdering.size, ordering.size).min
requiredOrdering.take(minSize) == ordering.take(minSize)
-case ClusteredDistribution(requiredClustering) =>
+case ClusteredDistribution(requiredClustering, false) =>
clusteringSet.subsetOf(requiredClustering.toSet)
case _ => false
}
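
Under this match, an existing hash (or range) partitioning can only satisfy a null-insensitive requirement; a null-sensitive one always falls through to `case _ => false` and forces a reshuffle. A hedged sketch of the consequence, reusing the simplified model from the previous sketch:

// Simplified mirror of the satisfies check above (illustration only).
def satisfies(clusteringSet: Set[Expression], required: Distribution): Boolean =
  required match {
    case UnspecifiedDistribution => true
    case ClusteredDistribution(requiredClustering, false) =>
      clusteringSet.subsetOf(requiredClustering.toSet)
    case _ => false // includes every nullKeysSensitive = true requirement
  }

satisfies(Set("key"), ClusteredDistribution(Seq("key"), nullKeysSensitive = false)) // true
satisfies(Set("key"), ClusteredDistribution(Seq("key"), nullKeysSensitive = true))  // false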
@@ -51,17 +51,17 @@ class DistributionSuite extends SparkFunSuite {

checkSatisfied(
HashPartitioning(Seq('a, 'b, 'c), 10),
-ClusteredDistribution(Seq('a, 'b, 'c)),
+ClusteredDistribution(Seq('a, 'b, 'c), false),
true)

checkSatisfied(
HashPartitioning(Seq('b, 'c), 10),
-ClusteredDistribution(Seq('a, 'b, 'c)),
+ClusteredDistribution(Seq('a, 'b, 'c), false),
true)

checkSatisfied(
SinglePartition,
-ClusteredDistribution(Seq('a, 'b, 'c)),
+ClusteredDistribution(Seq('a, 'b, 'c), false),
true)

checkSatisfied(
@@ -72,12 +72,12 @@ class DistributionSuite extends SparkFunSuite {
// Cases which need an exchange between two data properties.
checkSatisfied(
HashPartitioning(Seq('a, 'b, 'c), 10),
-ClusteredDistribution(Seq('b, 'c)),
+ClusteredDistribution(Seq('b, 'c), false),
false)

checkSatisfied(
HashPartitioning(Seq('a, 'b, 'c), 10),
-ClusteredDistribution(Seq('d, 'e)),
+ClusteredDistribution(Seq('d, 'e), false),
false)

checkSatisfied(
@@ -128,17 +128,17 @@ class DistributionSuite extends SparkFunSuite {

checkSatisfied(
RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10),
-ClusteredDistribution(Seq('a, 'b, 'c)),
+ClusteredDistribution(Seq('a, 'b, 'c), false),
true)

checkSatisfied(
RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10),
-ClusteredDistribution(Seq('c, 'b, 'a)),
+ClusteredDistribution(Seq('c, 'b, 'a), false),
true)

checkSatisfied(
RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10),
-ClusteredDistribution(Seq('b, 'c, 'a, 'd)),
+ClusteredDistribution(Seq('b, 'c, 'a, 'd), false),
true)

// Cases which need an exchange between two data properties.
@@ -158,12 +158,12 @@ class DistributionSuite extends SparkFunSuite {

checkSatisfied(
RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10),
-ClusteredDistribution(Seq('a, 'b)),
+ClusteredDistribution(Seq('a, 'b), false),
false)

checkSatisfied(
RangePartitioning(Seq('a.asc, 'b.asc, 'c.asc), 10),
-ClusteredDistribution(Seq('c, 'd)),
+ClusteredDistribution(Seq('c, 'd), false),
false)

checkSatisfied(
@@ -51,7 +51,7 @@ case class Aggregate(
if (groupingExpressions == Nil) {
AllTuples :: Nil
} else {
-ClusteredDistribution(groupingExpressions) :: Nil
+ClusteredDistribution(groupingExpressions, true) :: Nil
}
}
}
@@ -291,9 +291,11 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
def addOperatorsIfNecessary(
partitioning: Partitioning,
rowOrdering: Seq[SortOrder],
-child: SparkPlan): SparkPlan = {
+child: SparkPlan,
+alwaysShuffle: Boolean = false): SparkPlan = {
val needSort = rowOrdering.nonEmpty && child.outputOrdering != rowOrdering
-val needsShuffle = child.outputPartitioning != partitioning
+val needsShuffle = (child.outputPartitioning != partitioning) || alwaysShuffle

val withShuffle = if (needsShuffle) {
Exchange(partitioning, Nil, child)
@@ -326,8 +328,8 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[
val fixedChildren = requirements.zipped.map {
case (AllTuples, rowOrdering, child) =>
addOperatorsIfNecessary(SinglePartition, rowOrdering, child)
-case (ClusteredDistribution(clustering), rowOrdering, child) =>
-  addOperatorsIfNecessary(HashPartitioning(clustering, numPartitions), rowOrdering, child)
+case (ClusteredDistribution(clustering, nullKeySensitive), rowOrdering, child) =>
+  addOperatorsIfNecessary(HashPartitioning(clustering, numPartitions), rowOrdering, child, nullKeySensitive)
case (OrderedDistribution(ordering), rowOrdering, child) =>
addOperatorsIfNecessary(RangePartitioning(ordering, numPartitions), rowOrdering, child)
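
In effect, a child whose output partitioning already matches the required one is still reshuffled when the requirement is null-key sensitive. A trivial condensation of the decision (the helper name is hypothetical, not from the patch):

// Hypothetical condensation of the needsShuffle logic above.
def needsExchange(partitioningMatches: Boolean, alwaysShuffle: Boolean): Boolean =
  !partitioningMatches || alwaysShuffle

needsExchange(partitioningMatches = true, alwaysShuffle = false) // false: layout reused (joins)
needsExchange(partitioningMatches = true, alwaysShuffle = true)  // true: forced shuffle (GROUP BY)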

@@ -60,7 +60,7 @@ case class GeneratedAggregate(
if (groupingExpressions == Nil) {
AllTuples :: Nil
} else {
-ClusteredDistribution(groupingExpressions) :: Nil
+ClusteredDistribution(groupingExpressions, false) :: Nil
}
}

@@ -45,7 +45,7 @@ case class Window(
// This operator will be very expensive.
AllTuples :: Nil
} else {
-ClusteredDistribution(windowSpec.partitionSpec) :: Nil
+ClusteredDistribution(windowSpec.partitionSpec, true) :: Nil
}

// Since window functions are adding columns to the input rows, the child's outputPartitioning
@@ -245,6 +245,37 @@ case class ExternalSort(
override def outputOrdering: Seq[SortOrder] = sortOrder
}

/**
* :: DeveloperApi ::
* Computes the set of distinct input rows using a HashSet.
* @param partial when true the distinct operation is performed partially, per partition, without
* shuffling the data.
* @param child the input query plan.
*/
@DeveloperApi
case class Distinct(partial: Boolean, child: SparkPlan) extends UnaryNode {
override def output: Seq[Attribute] = child.output

override def requiredChildDistribution: Seq[Distribution] =
if (partial) UnspecifiedDistribution :: Nil else ClusteredDistribution(child.output, true) :: Nil

protected override def doExecute(): RDD[Row] = {
child.execute().mapPartitions { iter =>
val hashSet = new scala.collection.mutable.HashSet[Row]()

var currentRow: Row = null
while (iter.hasNext) {
currentRow = iter.next()
if (!hashSet.contains(currentRow)) {
hashSet.add(currentRow.copy())
}
}

hashSet.iterator
}
}
}
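
For context, a sketch of how a planner would typically pair the two modes (an assumed usage pattern, not shown in this diff): the partial pass deduplicates within each partition, shrinking the data before the exchange that the final pass's required distribution triggers.

// child is any input SparkPlan. EnsureRequirements inserts an Exchange between the
// two operators to meet the final pass's ClusteredDistribution(child.output, true).
def planDistinct(child: SparkPlan): SparkPlan =
  Distinct(partial = false,
    Distinct(partial = true, child))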

/**
* :: DeveloperApi ::
* Return a new RDD that has exactly `numPartitions` partitions.
@@ -53,7 +53,7 @@ case class HashOuterJoin(
}

override def requiredChildDistribution: Seq[ClusteredDistribution] =
-ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+ClusteredDistribution(leftKeys, false) :: ClusteredDistribution(rightKeys, false) :: Nil

override def output: Seq[Attribute] = {
joinType match {
@@ -38,7 +38,7 @@ case class LeftSemiJoinHash(
override val buildSide: BuildSide = BuildRight

override def requiredChildDistribution: Seq[ClusteredDistribution] =
-ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+ClusteredDistribution(leftKeys, false) :: ClusteredDistribution(rightKeys, false) :: Nil

override def output: Seq[Attribute] = left.output

@@ -41,7 +41,7 @@ case class ShuffledHashJoin(
override def outputPartitioning: Partitioning = left.outputPartitioning

override def requiredChildDistribution: Seq[ClusteredDistribution] =
-ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+ClusteredDistribution(leftKeys, false) :: ClusteredDistribution(rightKeys, false) :: Nil

protected override def doExecute(): RDD[InternalRow] = {
buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) =>
@@ -42,7 +42,7 @@ case class SortMergeJoin(
override def outputPartitioning: Partitioning = left.outputPartitioning

override def requiredChildDistribution: Seq[Distribution] =
-ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+ClusteredDistribution(leftKeys, false) :: ClusteredDistribution(rightKeys, false) :: Nil

// this is to manually construct an ordering that can be used to compare keys from both sides
private val keyOrdering: RowOrdering = RowOrdering.forSchema(leftKeys.map(_.dataType))
@@ -30,6 +30,26 @@ import org.apache.spark.sql.{Row, SQLConf, execution}


class PlannerSuite extends SparkFunSuite {
test("multiway full outer join") {
val planned = testData
.join(testData2, testData("key") === testData2("a"), "outer")
.join(testData3, testData("key") === testData3("a"), "outer")
.queryExecution.executedPlan
val exchanges = planned.collect { case n: Exchange => n }
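// Three exchanges expected (a reading of this assertion, not stated in the test): one
// per input of the first join, plus one for testData3; the second join can reuse the
// first join's hash partitioning because join keys are not null-key sensitive.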

assert(exchanges.size === 3)
(Review thread on the assertion above)

Contributor: Don't these changes also affect

  testData
    .join(testData2, testData("key") === testData2("a"), "outer")
    .join(testData2, testData("a") === testData3("a"), "outer")

?

Contributor Author: This requires some further change, I think; @yhuai should have some idea on this.

}

test("full outer join followed by aggregation") {
val planned = testData
.join(testData2, testData("key") === testData2("a"), "outer") // join key testData('key)
.groupBy(testData("key")).agg(testData("key"), count("a")) // group by key testData('key)
.queryExecution.executedPlan
val exchanges = planned.collect { case n: Exchange => n }
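// Three exchanges expected: two for the join inputs, plus one forced for the
// aggregation, since group-by keys are null-key sensitive and the outer join's
// output partitioning (null keys scattered across partitions) cannot be reused.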

assert(exchanges.size === 3)
}

test("unions are collapsed") {
val query = testData.unionAll(testData).unionAll(testData).logicalPlan
val planned = BasicOperators(query).head