Changes from 1 commit
more changes
imback82 committed Sep 5, 2020
commit 7fa91cadd7d8788a6ac855292edb9cfe7b415486
@@ -2698,6 +2698,16 @@ object SQLConf {
.checkValue(_ >= 0, "The value must be non-negative.")
.createWithDefault(8)

val OPTIMIZE_SORT_MERGE_JOIN_WITH_PARTIAL_HASH_DISTRIBUTION =
buildConf("spark.sql.execution.sortMergeJoin.optimizePartialHashDistribution.enabled")
.internal()
.doc("Optimizes sort merge join if both side of join have partial hash distributions - " +
"the output partitioning is HashPartitioning and its expressions are a subset of join " +
"keys on the respective side - by eliminating the shuffle.")
.version("3.1.0")
.booleanConf
.createWithDefault(false)
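A small usage sketch (not part of the patch): the flag above defaults to false, so it has to be enabled per session; the JoinSuite test below toggles it the same way via withSQLConf.

spark.conf.set("spark.sql.execution.sortMergeJoin.optimizePartialHashDistribution.enabled", "true")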

val OPTIMIZE_NULL_AWARE_ANTI_JOIN =
buildConf("spark.sql.optimizeNullAwareAntiJoin")
.internal()
@@ -3357,6 +3367,9 @@ class SQLConf extends Serializable with Logging {
def coalesceBucketsInJoinMaxBucketRatio: Int =
getConf(SQLConf.COALESCE_BUCKETS_IN_JOIN_MAX_BUCKET_RATIO)

def optimizeSortMergeJoinWithPartialHashDistribution: Boolean =
getConf(SQLConf.OPTIMIZE_SORT_MERGE_JOIN_WITH_PARTIAL_HASH_DISTRIBUTION)

def optimizeNullAwareAntiJoin: Boolean =
getConf(SQLConf.OPTIMIZE_NULL_AWARE_ANTI_JOIN)

@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.util.truncatedString
import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
import org.apache.spark.sql.execution.bucketing.CoalesceBucketsInJoin
import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
-import org.apache.spark.sql.execution.exchange.{EnsureRequirements, RemoveShuffleExchangeForSortMergeJoin, ReuseExchange}
+import org.apache.spark.sql.execution.exchange.{EnsureRequirements, OptimizeSortMergeJoinWithPartialHashDistribution, ReuseExchange}
import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.streaming.OutputMode
@@ -344,7 +344,7 @@ object QueryExecution {
PlanSubqueries(sparkSession),
RemoveRedundantProjects(sparkSession.sessionState.conf),
EnsureRequirements(sparkSession.sessionState.conf),
-      RemoveShuffleExchangeForSortMergeJoin(sparkSession.sessionState.conf),
+      OptimizeSortMergeJoinWithPartialHashDistribution(sparkSession.sessionState.conf),
ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf,
sparkSession.sessionState.columnarRules),
CollapseCodegenStages(sparkSession.sessionState.conf),
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.execution.exchange

import scala.collection.mutable

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{SortExec, SparkPlan}
import org.apache.spark.sql.execution.joins.SortMergeJoinExec
import org.apache.spark.sql.internal.SQLConf

/**
 * This rule removes the shuffles for a sort merge join if the following conditions are met:
 *  - The children of both ShuffleExchangeExec nodes have HashPartitioning with the same
 *    number of partitions.
 *  - Each child's output partitioning expressions are a subset of the join keys on the
 *    respective join side.
 *
 * If the above conditions are met, the shuffles can be eliminated for the sort merge join
 * because rows are still sorted before the join logic is applied.
 */
case class OptimizeSortMergeJoinWithPartialHashDistribution(conf: SQLConf) extends Rule[SparkPlan] {
Member commented:
Can't we implement this optimization in EnsureRequirements instead? Any reason to apply this rule after EnsureRequirements inserts shuffles?

Member commented:
Also, could you add fine-grained tests for this rule?

Contributor (author) replied:
To do this inside EnsureRequirements.ensureDistributionAndOrdering, it would require a new Partitioning and Distribution that know both sides of the join, so I didn't go that route. Doing it outside seemed less intrusive. But please let me know if doing this inside EnsureRequirements makes more sense. Thanks.

This is done after EnsureRequirements since reordering the join keys there may already eliminate the shuffles, in which case this rule is not applied.
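To make the two cases concrete, a minimal sketch (not part of the patch), assuming DataFrames t1 and t2 read from bucketed tables like those created in the JoinSuite test below (8 buckets on i1 and i2 respectively):

// Bucket columns (i1, i2) are only a subset of the join keys, so EnsureRequirements
// still inserts a shuffle on (i1, j1) and (i2, j2); this rule then removes both
// shuffles because each side is already hash-partitioned by a subset of its join keys.
val partial = t1.join(t2, t1("i1") === t2("i2") && t1("j1") === t2("j2"))

// If the join keys already line up with the bucketing (possibly after EnsureRequirements
// reorders the keys), no shuffle is inserted in the first place and this rule is a no-op.
val exact = t1.join(t2, t1("i1") === t2("i2"))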

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.optimizeSortMergeJoinWithPartialHashDistribution) {
      return plan
    }

    plan.transformUp {
      case s @ SortMergeJoinExec(_, _, _, _,
          lSort @ SortExec(_, _,
            ExtractShuffleExchangeExecChild(
              lChild,
              lChildOutputPartitioning: HashPartitioning),
            _),
Contributor commented on lines +48 to +51:
nit: why can't we just pattern match ShuffleExchangeExec(_, leftChild, _) here? It seems simpler to me.

          rSort @ SortExec(_, _,
            ExtractShuffleExchangeExecChild(
              rChild,
              rChildOutputPartitioning: HashPartitioning),
            _),
          false) if isPartialHashDistribution(
            s.leftKeys, lChildOutputPartitioning, s.rightKeys, rChildOutputPartitioning) =>
        // Remove ShuffleExchangeExec.
        s.copy(left = lSort.copy(child = lChild), right = rSort.copy(child = rChild))
      case other => other
    }
  }

  /*
   * Returns true if both HashPartitionings have the same number of partitions and
   * their partitioning expressions are a subset of their respective join keys.
   */
  private def isPartialHashDistribution(
      leftKeys: Seq[Expression],
      leftPartitioning: HashPartitioning,
      rightKeys: Seq[Expression],
      rightPartitioning: HashPartitioning): Boolean = {
    val mapping = leftKeyToRightKeyMapping(leftKeys, rightKeys)
    (leftPartitioning.numPartitions == rightPartitioning.numPartitions) &&
      leftPartitioning.expressions.zip(rightPartitioning.expressions)
        .forall {
          case (le, re) => mapping.get(le.canonicalized)
            .map(_.exists(_.semanticEquals(re)))
            .getOrElse(false)
        }
Contributor commented on lines +74 to +81:
Sorry if I miss anything, but I feel this might not be correct. We should make sure that leftPartitioning.expressions and rightPartitioning.expressions have the same size, and the order of the expressions matters, right?

The expression sizes are different, so we should not remove the shuffle:

t1 has 1024 buckets on column (a)
t2 has 1024 buckets on columns (a, b)

SELECT *
FROM t1
JOIN t2
ON t1.a = t2.a AND t1.b = t2.b

The expression sizes are the same, but the order is wrong, so we should not remove the shuffle:

t1 has 1024 buckets on columns (a, b)
t2 has 1024 buckets on columns (b, a)

SELECT *
FROM t1
JOIN t2
ON t1.a = t2.a AND t1.a = t2.b AND t1.b = t2.a AND t1.b = t2.b

Contributor (author) replied:
Thanks. I agree with your concerns for both cases. But, for the first example, only one side will be shuffled, so the rule should not kick in. For the second example, we have t1.a = t2.b AND t1.b = t2.a, which matches the bucket ordering, so this should also be fine.

Contributor replied:
Sorry if I miss anything:

> But, for the first example, only one side will be shuffled, so the rule should not kick in.

If the number of buckets for t1 is less than the number of shuffle partitions, shouldn't it shuffle both sides (in EnsureRequirements)? So the rule kicks in here and removes both shuffles, but we shouldn't remove any shuffle here.

> For the second example, we have t1.a = t2.b AND t1.b = t2.a which matches the bucket ordering, so this should be also fine.

I think it's unsafe if we do not shuffle both sides. HashPartitioning(Seq(a, b)) and HashPartitioning(Seq(b, a)) are not the same thing, e.g. the tuple (a: 1, b: 2) will be assigned to different buckets given the current Murmur3Hash implementation.

imback82 (author) replied on Sep 13, 2020:

> If the number of buckets for t1 is less than the number of shuffle partitions, shouldn't it shuffle both sides (in EnsureRequirements)? So the rule kicks in here and removes both shuffles, but we shouldn't remove any shuffle here.

You are right. Thanks for the catch!

> I think it's unsafe if we do not shuffle both sides. HashPartitioning(Seq(a, b)) and HashPartitioning(Seq(b, a)) are not the same thing, e.g. the tuple (a: 1, b: 2) will be assigned to different buckets given the current Murmur3Hash implementation.

Yes, I understand they produce different hash values. However, the join condition here is t1.a = t2.b AND t1.b = t2.a. On the other hand, this rule will not be applied if the condition were t1.a = t2.a AND t1.b = t2.b. Please let me know if I missed something. Thanks!
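For reference, a minimal sketch of the ordering point above (not part of the patch; it assumes catalyst's Murmur3Hash expression and its default seed of 42):

import org.apache.spark.sql.catalyst.expressions.{Literal, Murmur3Hash}

// The row hash folds column hashes left to right, so hashing the tuple as (a, b)
// versus (b, a) generally yields different values; pmod(hash, numPartitions) then
// sends the same row to different partitions under the two partitionings.
val hashAB = Murmur3Hash(Seq(Literal(1), Literal(2)), 42).eval()
val hashBA = Murmur3Hash(Seq(Literal(2), Literal(1)), 42).eval()
println(s"$hashAB vs $hashBA")  // expected to differ for this tuple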

  }

  /*
   * Returns a mapping from each canonicalized left key to all of the right keys
   * it is joined with.
   */
  private def leftKeyToRightKeyMapping(
      leftKeys: Seq[Expression],
      rightKeys: Seq[Expression]): Map[Expression, Seq[Expression]] = {
    assert(leftKeys.length == rightKeys.length)
    val mapping = mutable.Map.empty[Expression, Seq[Expression]]
    leftKeys.zip(rightKeys).foreach {
      case (leftKey, rightKey) =>
        val key = leftKey.canonicalized
        mapping.get(key) match {
          case Some(v) => mapping.put(key, v :+ rightKey)
          case None => mapping.put(key, Seq(rightKey))
        }
    }
    mapping.toMap
  }
}
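To illustrate the mapping, a minimal sketch (not part of the patch; the attribute names are hypothetical and mirror the one-to-many case exercised in the JoinSuite test below):

import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.types.IntegerType

// Hypothetical attributes standing in for t1.i1, t2.i2 and t2.j2.
val i1 = AttributeReference("i1", IntegerType)()
val i2 = AttributeReference("i2", IntegerType)()
val j2 = AttributeReference("j2", IntegerType)()

// For the join condition t1.i1 = t2.i2 AND t1.i1 = t2.j2 the keys are
//   leftKeys  = Seq(i1, i1)
//   rightKeys = Seq(i2, j2)
// so the mapping collapses the duplicate left key into a single entry:
//   Map(i1.canonicalized -> Seq(i2, j2))
// and isPartialHashDistribution accepts a right-side partitioning expression that
// semantically equals either i2 or j2 as a match for i1.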

/**
 * An extractor that extracts the child of ShuffleExchangeExec and the child's
 * output partitioning.
 */
object ExtractShuffleExchangeExecChild {
  def unapply(plan: SparkPlan): Option[(SparkPlan, Partitioning)] = {
    plan match {
      case s: ShuffleExchangeExec => Some((s.child, s.child.outputPartitioning))
      case _ => None
    }
  }
}

This file was deleted.

55 changes: 55 additions & 0 deletions sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala
@@ -1314,4 +1314,59 @@ class JoinSuite extends QueryTest with SharedSparkSession with AdaptiveSparkPlan
}
}
}

test("SPARK-XXXXX: Optimize sort merge join with partial hash distribution") {
withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") {
withTable("t1", "t2") {
val df1 = (0 until 100).map(i => (i % 5, i % 13, i.toString)).toDF("i1", "j1", "k1")
val df2 = (0 until 100).map(i => (i % 3, i % 17, i.toString)).toDF("i2", "j2", "k2")
df1.write.format("parquet").bucketBy(8, "i1").saveAsTable("t1")
df2.write.format("parquet").bucketBy(8, "i2").saveAsTable("t2")
val t1 = spark.table("t1")
val t2 = spark.table("t2")

def verify(
f: => DataFrame,
numShufflesWithoutOptimization: Int,
numShufflesWithOptimization: Int): Unit = {
withSQLConf(
SQLConf.OPTIMIZE_SORT_MERGE_JOIN_WITH_PARTIAL_HASH_DISTRIBUTION.key -> "false") {
val dfWithoutOptimization = f
assert(dfWithoutOptimization.queryExecution.executedPlan.collect {
case s: ShuffleExchangeExec => s }.length == numShufflesWithoutOptimization)

withSQLConf(
SQLConf.OPTIMIZE_SORT_MERGE_JOIN_WITH_PARTIAL_HASH_DISTRIBUTION.key -> "true") {
val dfWithOptimization = f
assert(dfWithOptimization.queryExecution.executedPlan.collect {
case s: ShuffleExchangeExec => s }.length == numShufflesWithOptimization)
checkAnswer(dfWithOptimization, dfWithoutOptimization)
}
}
}

def verifyShuffleRemoved(f: => DataFrame): Unit = verify(f, 2, 0)
def verifyShuffleNotRemoved(f: => DataFrame): Unit = verify(f, 2, 2)

// Partial hash distribution by i1 and i2.
verifyShuffleRemoved(t1.join(t2, t1("i1") === t2("i2") && t1("j1") === t2("j2")))
verifyShuffleRemoved(t1.join(t2, t1("i1") === t2("i2") && t1("j1") + 1 === t2("j2")))
verifyShuffleRemoved(
t1.join(t2, t1("i1") === t2("i2") && t1("j1") === t2("j2") && t1("k1") === t2("k2")))
// Partial hash distribution by i1 and i2, but different join key orders.
verifyShuffleRemoved(t1.join(t2, t1("j1") === t2("j2") && t1("i1") === t2("i2")))
verifyShuffleRemoved(
t1.join(t2, t1("j1") === t2("j2") && t1("i1") === t2("i2") && t1("k1") === t2("k2")))
// Many-to-one mapping for join keys (right to left)
verifyShuffleRemoved(t1.join(t2, t1("i1") === t2("i2") && t1("j1") === t2("i2")))
// One-to-many mapping for join keys (left to right)
verifyShuffleRemoved(t1.join(t2, t1("i1") === t2("i2") && t1("i1") === t2("j2")))

// Join keys are not a subset of distribution.
verifyShuffleNotRemoved(t1.join(t2, t1("j1") === t2("j2")))
// The join key (i1 + 1) doesn't match with distribution expression.
verifyShuffleNotRemoved(t1.join(t2, t1("i1") + 1 === t2("i2") && t1("j1") === t2("j2")))
}
}
}
}
@@ -1012,37 +1012,4 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils {
}
}
}

test("terry") {
withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") {
withTable("t1", "t2") {
val df1 = (0 until 100).map(i => (i % 5, i % 13, i.toString)).toDF("i1", "j1", "k1")
val df2 = (0 until 100).map(i => (i % 5, i % 13, i.toString)).toDF("i2", "j2", "k2")
df1.write.format("parquet").bucketBy(8, "i1").saveAsTable("t1")
df2.write.format("parquet").bucketBy(8, "i2").saveAsTable("t2")
val t1 = spark.table("t1")
val t2 = spark.table("t2")
val joined = t1.join(t2, t1("i1") === t2("i2") && t1("j1") === t2("j2"))
joined.explain(true)
}
}
}

test("terry + 1") {
withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") {
withTable("t1", "t2") {
val df1 = (0 until 5).map(i => (i % 5, i % 13, i.toString)).toDF("i1", "j1", "k1")
val df2 = (0 until 5).map(i => (i % 3, i % 13, i.toString)).toDF("i2", "j2", "k2")
df1.write.format("parquet").bucketBy(8, "i1").saveAsTable("t1")
df2.write.format("parquet").bucketBy(8, "i2").saveAsTable("t2")
val t1 = spark.table("t1")
val t2 = spark.table("t2")
val joined = t1.join(t2, t1("i1") === t2("i2") && t1("j1") + 1 === t2("j2"))
joined.explain(true)
df1.show
df2.show
joined.show
}
}
}
}