StreamingSymmetricHashJoinExec should require HashClusteredPartitioning from children
apache · cloud-fan · Jun 18, 2018 · Jun 19, 2018 · Jun 20, 2018 · Jun 21, 2018
commit d102da370babba06cfa1a349a98bbe56dda3d056
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -68,50 +68,42 @@ case object AllTuples extends Distribution {
   }
 }
 
-/**
- * Represents data where tuples that share the same values for the `clustering`
- * [[Expression Expressions]] will be co-located. Based on the context, this
- * can mean such tuples are either co-located in the same partition or they will be contiguous
- * within a single partition.
- */
-case class ClusteredDistribution(
-    clustering: Seq[Expression],
-    requiredNumPartitions: Option[Int] = None) extends Distribution {
+abstract class ClusteredDistributionBase(exprs: Seq[Expression]) extends Distribution {
   require(
-    clustering != Nil,
-    "The clustering expressions of a ClusteredDistribution should not be Nil. " +
+    exprs.nonEmpty,
+    s"The clustering expressions of a ${getClass.getSimpleName} should not be empty. " +
       "An AllTuples should be used to represent a distribution that only has " +
       "a single partition.")
 
   override def createPartitioning(numPartitions: Int): Partitioning = {
     assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions,
-      s"This ClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " +
+      s"This ${getClass.getSimpleName} requires ${requiredNumPartitions.get} partitions, but " +
         s"the actual number of partitions is $numPartitions.")
-    HashPartitioning(clustering, numPartitions)
+    HashPartitioning(exprs, numPartitions)
   }
 }
 
 /**
- * Represents data where tuples have been clustered according to the hash of the given
- * `expressions`. The hash function is defined as `HashPartitioning.partitionIdExpression`, so only
+ * Represents data where tuples that share the same values for the `clustering`
+ * [[Expression Expressions]] will be co-located. Based on the context, this
+ * can mean such tuples are either co-located in the same partition or they will be contiguous
+ * within a single partition.
+ */
+case class ClusteredDistribution(
+    clustering: Seq[Expression],
+    requiredNumPartitions: Option[Int] = None) extends ClusteredDistributionBase(clustering)
+
+/**
+ * Represents data where tuples have been clustered according to the hash of the given expressions.
+ * The hash function is defined as [[HashPartitioning.partitionIdExpression]], so only
  * [[HashPartitioning]] can satisfy this distribution.
  *
  * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the
  * number of partitions, this distribution strictly requires which partition the tuple should be in.
  */
-case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution {
-  require(
-    expressions != Nil,
-    "The expressions for hash of a HashPartitionedDistribution should not be Nil. " +
-      "An AllTuples should be used to represent a distribution that only has " +
-      "a single partition.")
-
-  override def requiredNumPartitions: Option[Int] = None
-
-  override def createPartitioning(numPartitions: Int): Partitioning = {
-    HashPartitioning(expressions, numPartitions)
-  }
-}
+case class HashClusteredDistribution(
+    hashExprs: Seq[Expression],
+    requiredNumPartitions: Option[Int] = None) extends ClusteredDistributionBase(hashExprs)
 
 /**
  * Represents data where tuples have been ordered according to the `ordering`
@@ -207,15 +199,18 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
 
   override def satisfies(required: Distribution): Boolean = {
     super.satisfies(required) || {
-      required match {
-        case h: HashClusteredDistribution =>
-          expressions.length == h.expressions.length && expressions.zip(h.expressions).forall {
-            case (l, r) => l.semanticEquals(r)
-          }
-        case ClusteredDistribution(requiredClustering, requiredNumPartitions) =>
-          expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) &&
-            (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions)
-        case _ => false
+      val satisfyNumPartitions = required.requiredNumPartitions.isEmpty ||
+        required.requiredNumPartitions.get == numPartitions
+      satisfyNumPartitions && {
+        required match {
+          case h: HashClusteredDistribution =>
+            expressions.length == h.hashExprs.length && expressions.zip(h.hashExprs).forall {
+              case (l, r) => l.semanticEquals(r)
+            }
+          case c: ClusteredDistribution =>
+            expressions.forall(x => c.clustering.exists(_.semanticEquals(x)))
+          case _ => false
+        }
       }
     }
   }

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala
@@ -73,7 +73,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] {
         // these children may not be partitioned in the same way.
         // Please see the comment in withCoordinator for more details.
         val supportsDistribution = requiredChildDistributions.forall { dist =>
-          dist.isInstanceOf[ClusteredDistribution] || dist.isInstanceOf[HashClusteredDistribution]
+          dist.isInstanceOf[ClusteredDistributionBase]
         }
         children.length > 1 && supportsDistribution
       }

diff --git a/.../main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/.../main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala
@@ -167,8 +167,8 @@ case class StreamingSymmetricHashJoinExec(
   val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length)
 
   override def requiredChildDistribution: Seq[Distribution] =
-    ClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) ::
-      ClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil
+    HashClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) ::
+      HashClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil
 
   override def output: Seq[Attribute] = joinType match {
     case _: InnerLike => left.output ++ right.output

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala
@@ -404,6 +404,20 @@ class StreamingInnerJoinSuite extends StreamTest with StateStoreMetricsTest with
       AddData(input3, 5, 10),
       CheckNewAnswer((5, 10, 5, 15, 5, 25)))
   }
+
+  test("streaming join should require HashClusteredDistribution from children") {
+    val input1 = MemoryStream[Int]
+    val input2 = MemoryStream[Int]
+
+    val df1 = input1.toDF.select('value as 'a, 'value * 2 as 'b)
+    val df2 = input2.toDF.select('value as 'a, 'value * 2 as 'b).repartition('b)
+    val joined = df1.join(df2, Seq("a", "b")).select('a)
+
+    testStream(joined)(
+      AddData(input1, 1.to(1000): _*),
+      AddData(input2, 1.to(1000): _*),
+      CheckAnswer(1.to(1000): _*))
+  }
 }