[SPARK-21417][SQL] Infer join conditions using propagated constraints

## What changes were proposed in this pull request?

This PR adds an optimization rule that infers join conditions using propagated constraints.

For instance, if there is a join where the left relation has 'a = 1' and the right relation has 'b = 1', then the rule infers 'a = b' as a join predicate. Only semantically new predicates are appended to the existing join condition.

Refer to the corresponding ticket and tests for more details.

## How was this patch tested?

This patch comes with a new test suite to cover the implemented logic.

Author: aokolnychyi <[email protected]>

Closes apache#18692 from aokolnychyi/spark-21417.
apache-spark-on-k8s · liyinan926 · Nov 30, 2017 · Nov 30, 2017 · Nov 30, 2017 · Nov 30, 2017
commit 6ac57fd0d1c82b834eb4bf0dd57596b92a99d6de
diff --git a/...st/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala b/...st/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import scala.collection.mutable
+
+import org.apache.spark.sql.catalyst.expressions.EquivalentExpressionMap.SemanticallyEqualExpr
+
+/**
+ * Maps an expression to the set of expressions that were registered as equivalent to it. Keys are
+ * compared by their semantic meaning, ignoring cosmetic differences. The values are represented
+ * as [[ExpressionSet]]s.
+ *
+ * Key identity is defined by the [[Expression.semanticHash]] and [[Expression.semanticEquals]]
+ * methods.
+ *
+ * {{{
+ *   val map = new EquivalentExpressionMap()
+ *
+ *   map.put(1 + 2, a)
+ *   map.put(rand(), b)
+ *
+ *   map.get(2 + 1) => Set(a) // 1 + 2 and 2 + 1 are semantically equivalent
+ *   map.get(1 + 2) => Set(a) // the key that was originally registered is found as well
+ *   map.get(rand()) => Set() // non-deterministic expressions are not equivalent
+ * }}}
+ */
+class EquivalentExpressionMap {
+
+  private val equivalenceMap = mutable.HashMap.empty[SemanticallyEqualExpr, ExpressionSet]
+
+  /**
+   * Registers `equivalentExpression` as being equivalent to `expression`.
+   *
+   * Non-deterministic keys are ignored. Such an expression is never semantically equal to any
+   * expression, itself included, so an entry stored under it could never be returned by [[get]]
+   * and would only make the underlying map grow on every call.
+   *
+   * @param expression the key; matched semantically on later lookups
+   * @param equivalentExpression the expression to add to the set associated with the key
+   */
+  def put(expression: Expression, equivalentExpression: Expression): Unit = {
+    if (expression.deterministic) {
+      // Wrap the key once so that the lookup and the update share the same wrapper.
+      val key: SemanticallyEqualExpr = expression
+      val known = equivalenceMap.getOrElse(key, ExpressionSet.empty)
+      equivalenceMap(key) = known + equivalentExpression
+    }
+  }
+
+  /**
+   * Returns the expressions registered for a key that is semantically equal to `expression`.
+   *
+   * @param expression the key to look up
+   * @return the registered equivalent expressions, or an empty set if there is no such key
+   */
+  def get(expression: Expression): Set[Expression] =
+    equivalenceMap.getOrElse(expression, ExpressionSet.empty)
+}
+
+object EquivalentExpressionMap {
+
+  /**
+   * Wrapper that gives an [[Expression]] semantic identity when it is used as a hash map key:
+   * the hash code comes from `semanticHash` and equality from `semanticEquals`. Because the
+   * class is implicit, a plain expression is wrapped automatically wherever a key is expected.
+   */
+  private implicit class SemanticallyEqualExpr(val expr: Expression) {
+    override def hashCode: Int = expr.semanticHash()
+
+    // Only another wrapper can be equal; the decision is delegated to the wrapped expressions.
+    override def equals(obj: Any): Boolean = obj match {
+      case that: SemanticallyEqualExpr => expr.semanticEquals(that.expr)
+      case _ => false
+    }
+  }
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala
@@ -27,6 +27,8 @@ object ExpressionSet {
     expressions.foreach(set.add)
     set
   }
+
+  val empty: ExpressionSet = ExpressionSet(Nil) // shared set with no elements, created once
 }
 
 /**

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -87,6 +87,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog)
       PushProjectionThroughUnion,
       ReorderJoin,
       EliminateOuterJoin,
+      EliminateCrossJoin,
       InferFiltersFromConstraints,
       BooleanSimplification,
       PushPredicateThroughJoin,

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import scala.annotation.tailrec
+import scala.collection.mutable
 
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins
@@ -152,3 +153,62 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper {
       if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType))
   }
 }
+
+/**
+ * A rule that turns condition-less CROSS joins into INNER joins whenever a join condition can be
+ * inferred from the constraints propagated from both children.
+ *
+ * Only CROSS joins are rewritten. For other join types, an additional inferred condition could
+ * make the children's existing partitioning no longer satisfy the requirements of the JOIN node,
+ * which would introduce a shuffle that is not needed today.
+ *
+ * For instance, given a CROSS join with the constraint 'a = 1' from the left child and the
+ * constraint 'b = 1' from the right child, this rule infers the join predicate 'a = b' and
+ * converts the join to an Inner join.
+ */
+object EliminateCrossJoin extends Rule[LogicalPlan] with PredicateHelper {
+
+  /** Applies the rewrite only when constraint propagation is enabled in the active conf. */
+  def apply(plan: LogicalPlan): LogicalPlan =
+    if (SQLConf.get.constraintPropagationEnabled) eliminateCrossJoin(plan) else plan
+
+  /** Rewrites every CROSS join without a condition for which a predicate can be inferred. */
+  private def eliminateCrossJoin(plan: LogicalPlan): LogicalPlan = plan transform {
+    case join @ Join(leftPlan, rightPlan, Cross, None) =>
+      // Split the join's constraints by the side whose output they exclusively refer to.
+      val leftConstraints = join.constraints.filter(_.references.subsetOf(leftPlan.outputSet))
+      val rightConstraints = join.constraints.filter(_.references.subsetOf(rightPlan.outputSet))
+      inferJoinPredicates(leftConstraints, rightConstraints).reduceOption(And) match {
+        case Some(condition) => Join(leftPlan, rightPlan, Inner, Some(condition))
+        case None => join
+      }
+  }
+
+  /**
+   * Decomposes an equality that has an attribute on at least one side into that attribute and
+   * the expression on the other side. The left-hand side is checked first.
+   */
+  private def attributeEquality(constraint: Expression): Option[(Attribute, Expression)] =
+    constraint match {
+      case EqualTo(attr: Attribute, expr: Expression) => Some((attr, expr))
+      case EqualTo(expr: Expression, attr: Attribute) => Some((attr, expr))
+      case _ => None
+    }
+
+  /**
+   * Infers 'rightAttr = leftAttr' predicates for every pair of attributes that are constrained
+   * to be equal to semantically equal expressions on their respective sides.
+   */
+  private def inferJoinPredicates(
+      leftConstraints: Set[Expression],
+      rightConstraints: Set[Expression]): mutable.Set[EqualTo] = {
+
+    // Index the left side: expression -> left attributes known to be equal to it.
+    val leftAttrsByExpr = new EquivalentExpressionMap()
+    leftConstraints.foreach { constraint =>
+      attributeEquality(constraint).foreach { case (leftAttr, expr) =>
+        leftAttrsByExpr.put(expr, leftAttr)
+      }
+    }
+
+    // Probe the index with the right side and pair up the matching attributes.
+    val inferred = mutable.Set.empty[EqualTo]
+    rightConstraints.foreach { constraint =>
+      attributeEquality(constraint).foreach { case (rightAttr, expr) =>
+        inferred ++= leftAttrsByExpr.get(expr).map(EqualTo(rightAttr, _))
+      }
+    }
+
+    inferred
+  }
+
+}
diff --git a/...c/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala b/...c/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.expressions
+
+import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.catalyst.dsl.expressions._
+
+class EquivalentExpressionMapSuite extends SparkFunSuite {
+
+  private val onePlusTwo = Literal(1) + Literal(2)
+  private val twoPlusOne = Literal(2) + Literal(1)
+  private val rand = Rand(10)
+
+  test("behaviour of the equivalent expression map") {
+    val exprMap = new EquivalentExpressionMap()
+    exprMap.put(onePlusTwo, 'a)
+    exprMap.put(Literal(1) + Literal(3), 'b)
+    exprMap.put(rand, 'c)
+
+    // a lookup by 2 + 1 must find what was registered under 1 + 2
+    assertResult(ExpressionSet(Seq('a)))(exprMap.get(twoPlusOne))
+    // a non-deterministic key never matches, not even itself
+    assertResult(ExpressionSet.empty)(exprMap.get(rand))
+
+    // registering the same (key, value) pair again must not produce duplicates
+    exprMap.put(onePlusTwo, 'a)
+    exprMap.put(twoPlusOne, 'a)
+    assertResult(ExpressionSet(Seq('a)))(exprMap.get(twoPlusOne))
+
+    // a second value under the same key is accumulated next to the first one
+    exprMap.put(onePlusTwo, 'e)
+    val attrsForSum = exprMap.get(onePlusTwo)
+    assertResult(ExpressionSet(Seq('a, 'e)))(attrsForSum)
+    assertResult(2)(attrsForSum.size)
+
+    // adding more values under a non-deterministic key still yields nothing on lookup
+    exprMap.put(rand, 'd)
+    val attrsForRand = exprMap.get(rand)
+    assertResult(ExpressionSet.empty)(attrsForRand)
+    assertResult(0)(attrsForRand.size)
+  }
+
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -27,6 +27,8 @@ object ExpressionSet { @@
         expressions.foreach(set.add)
         set
       }
+      val empty: ExpressionSet = ExpressionSet(Nil)
     }
     /**
@@ Expand Down @@