From 02cc5509455d3f9d6d683a46fe4a50fcde8da348 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Wed, 28 Mar 2018 19:38:59 -0700
Subject: [PATCH 1/5] Fix stream-stream join to require stateInfo.numPartitions in child distribution

---
 .../execution/streaming/StreamingSymmetricHashJoinExec.scala   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala
index c351f658cb95..fa7c8ee906ec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala
@@ -167,7 +167,8 @@ case class StreamingSymmetricHashJoinExec(
   val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length)
 
   override def requiredChildDistribution: Seq[Distribution] =
-    ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil
+    ClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) ::
+      ClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil
 
   override def output: Seq[Attribute] = joinType match {
     case _: InnerLike => left.output ++ right.output

From 7046fbd5244e5d3adb75b7d090d57f1adc8b9859 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 29 Mar 2018 15:32:55 -0700
Subject: [PATCH 2/5] Verify stateful operator stateInfo and distribution in StreamTest

---
 .../org/apache/spark/sql/streaming/StreamTest.scala    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
index e44aef09f1f3..8adeeae41f2f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
@@ -444,6 +444,16 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be
         }
       }
 
+      // Verify if stateful operators have correct metadata and distribution
+      // This can often catch hard to debug errors when developing stateful operators
+      val executedPlan = currentStream.lastExecution.executedPlan
+      executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
+        assert(s.stateInfo.isDefined)
+        s.requiredChildDistribution.foreach { d =>
+          assert(d.requiredNumPartitions.isDefined)
+        }
+      }
+
       val (latestBatchData, allData) = sink match {
         case s: MemorySink => (s.latestBatchData, s.allData)
         case s: MemorySinkV2 => (s.latestBatchData, s.allData)

From c162f8def7f7f57b9e8b954a5fe2f96368b5ed2f Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 29 Mar 2018 16:22:53 -0700
Subject: [PATCH 3/5] Removed unnecessary tests

---
 .../sql/streaming/DeduplicateSuite.scala      |  8 +--
 .../FlatMapGroupsWithStateSuite.scala         |  5 +-
 .../sql/streaming/StatefulOperatorTest.scala  | 49 -------------------
 .../spark/sql/streaming/StreamTest.scala      | 25 +++++++---
 .../streaming/StreamingAggregationSuite.scala |  4 +-
 5 files changed, 21 insertions(+), 70 deletions(-)
 delete mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
index caf2bab8a585..0088b64d6195 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/DeduplicateSuite.scala
@@ -25,9 +25,7 @@ import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamingDeduplic
 import org.apache.spark.sql.execution.streaming.state.StateStore
 import org.apache.spark.sql.functions._
 
-class DeduplicateSuite extends StateStoreMetricsTest
-    with BeforeAndAfterAll
-    with StatefulOperatorTest {
+class DeduplicateSuite extends StateStoreMetricsTest with BeforeAndAfterAll {
 
   import testImplicits._
 
@@ -44,8 +42,6 @@ class DeduplicateSuite extends StateStoreMetricsTest
       AddData(inputData, "a"),
       CheckLastBatch("a"),
       assertNumStateRows(total = 1, updated = 1),
-      AssertOnQuery(sq =>
-        checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("value"))),
       AddData(inputData, "a"),
       CheckLastBatch(),
       assertNumStateRows(total = 1, updated = 0),
@@ -63,8 +59,6 @@ class DeduplicateSuite extends StateStoreMetricsTest
       AddData(inputData, "a" -> 1),
       CheckLastBatch("a" -> 1),
       assertNumStateRows(total = 1, updated = 1),
-      AssertOnQuery(sq =>
-        checkChildOutputHashPartitioning[StreamingDeduplicateExec](sq, Seq("_1"))),
       AddData(inputData, "a" -> 2), // Dropped
       CheckLastBatch(),
       assertNumStateRows(total = 1, updated = 0),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala
index de2b51678cea..b1416bff87ee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala
@@ -42,8 +42,7 @@ case class RunningCount(count: Long)
 case class Result(key: Long, count: Int)
 
 class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest
-    with BeforeAndAfterAll
-    with StatefulOperatorTest {
+    with BeforeAndAfterAll {
 
   import testImplicits._
   import GroupStateImpl._
@@ -618,8 +617,6 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest
       AddData(inputData, "a"),
       CheckLastBatch(("a", "1")),
       assertNumStateRows(total = 1, updated = 1),
-      AssertOnQuery(sq => checkChildOutputHashPartitioning[FlatMapGroupsWithStateExec](
-        sq, Seq("value"))),
       AddData(inputData, "a", "b"),
       CheckLastBatch(("a", "2"), ("b", "1")),
       assertNumStateRows(total = 2, updated = 2),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala
deleted file mode 100644
index 45142278993b..000000000000
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StatefulOperatorTest.scala
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.streaming
-
-import org.apache.spark.sql.catalyst.plans.physical._
-import org.apache.spark.sql.execution.streaming._
-
-trait StatefulOperatorTest {
-  /**
-   * Check that the output partitioning of a child operator of a Stateful operator satisfies the
-   * distribution that we expect for our Stateful operator.
-   */
-  protected def checkChildOutputHashPartitioning[T <: StatefulOperator](
-      sq: StreamingQuery,
-      colNames: Seq[String]): Boolean = {
-    val attr = sq.asInstanceOf[StreamExecution].lastExecution.analyzed.output
-    val partitions = sq.sparkSession.sessionState.conf.numShufflePartitions
-    val groupingAttr = attr.filter(a => colNames.contains(a.name))
-    checkChildOutputPartitioning(sq, HashPartitioning(groupingAttr, partitions))
-  }
-
-  /**
-   * Check that the output partitioning of a child operator of a Stateful operator satisfies the
-   * distribution that we expect for our Stateful operator.
-   */
-  protected def checkChildOutputPartitioning[T <: StatefulOperator](
-      sq: StreamingQuery,
-      expectedPartitioning: Partitioning): Boolean = {
-    val operator = sq.asInstanceOf[StreamExecution].lastExecution
-      .executedPlan.collect { case p: T => p }
-    operator.head.children.forall(
-      _.outputPartitioning.numPartitions == expectedPartitioning.numPartitions)
-  }
-}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
index 8adeeae41f2f..55027c0133dc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
@@ -37,6 +37,7 @@ import org.apache.spark.SparkEnv
 import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row}
 import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.physical.AllTuples
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation
 import org.apache.spark.sql.execution.streaming._
@@ -444,13 +445,23 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be
         }
       }
 
-      // Verify if stateful operators have correct metadata and distribution
-      // This can often catch hard to debug errors when developing stateful operators
-      val executedPlan = currentStream.lastExecution.executedPlan
-      executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
-        assert(s.stateInfo.isDefined)
-        s.requiredChildDistribution.foreach { d =>
-          assert(d.requiredNumPartitions.isDefined)
+      if (currentStream.isInstanceOf[MicroBatchExecution]) {
+        // Verify if stateful operators have correct metadata and distribution
+        // This can often catch hard to debug errors when developing stateful operators
+        val executedPlan = currentStream.lastExecution.executedPlan
+        executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
+          assert(s.stateInfo.isDefined)
+          assert(s.stateInfo.get.numPartitions >= 1)
+
+          s.requiredChildDistribution.foreach { d =>
+            withClue(s"$s specifies incorrect # partitions in requiredChildDistribution $d") {
+              assert(d.requiredNumPartitions.isDefined)
+              assert(d.requiredNumPartitions.get >= 1)
+              if (d != AllTuples) {
+                assert(d.requiredNumPartitions.get == s.stateInfo.get.numPartitions)
+              }
+            }
+          }
         }
       }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala
index 97e065193fd0..1cae8cb8d47f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala
@@ -44,7 +44,7 @@ object FailureSingleton {
 }
 
 class StreamingAggregationSuite extends StateStoreMetricsTest
-    with BeforeAndAfterAll with Assertions with StatefulOperatorTest {
+    with BeforeAndAfterAll with Assertions {
 
   override def afterAll(): Unit = {
     super.afterAll()
@@ -281,8 +281,6 @@ class StreamingAggregationSuite extends StateStoreMetricsTest
       AddData(inputData, 0L, 5L, 5L, 10L),
       AdvanceManualClock(10 * 1000),
       CheckLastBatch((0L, 1), (5L, 2), (10L, 1)),
-      AssertOnQuery(sq =>
-        checkChildOutputHashPartitioning[StateStoreRestoreExec](sq, Seq("value"))),
 
       // advance clock to 20 seconds, should retain keys >= 10
       AddData(inputData, 15L, 15L, 20L),

From 555eeb376796e931ab8770301163d1016787ee64 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Thu, 29 Mar 2018 18:03:02 -0700
Subject: [PATCH 4/5] Addressed review comments: check stateInfo partitions against numStateStores

---
 .../spark/sql/execution/streaming/IncrementalExecution.scala  | 2 +-
 .../scala/org/apache/spark/sql/streaming/StreamTest.scala     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
index a10ed5f2df1b..1a83c884d55b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/IncrementalExecution.scala
@@ -62,7 +62,7 @@ class IncrementalExecution(
       StreamingDeduplicationStrategy :: Nil
   }
 
-  private val numStateStores = offsetSeqMetadata.conf.get(SQLConf.SHUFFLE_PARTITIONS.key)
+  private[sql] val numStateStores = offsetSeqMetadata.conf.get(SQLConf.SHUFFLE_PARTITIONS.key)
     .map(SQLConf.SHUFFLE_PARTITIONS.valueConverter)
     .getOrElse(sparkSession.sessionState.conf.numShufflePartitions)
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
index 55027c0133dc..11ad84a46b83 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
@@ -450,8 +450,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be
         // This can often catch hard to debug errors when developing stateful operators
         val executedPlan = currentStream.lastExecution.executedPlan
         executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
-          assert(s.stateInfo.isDefined)
-          assert(s.stateInfo.get.numPartitions >= 1)
+          assert(
+            s.stateInfo.map(_.numPartitions).contains(currentStream.lastExecution.numStateStores))
 
           s.requiredChildDistribution.foreach { d =>
             withClue(s"$s specifies incorrect # partitions in requiredChildDistribution $d") {

From aeec84375b494d52f69c40bcf6d7df30947c8908 Mon Sep 17 00:00:00 2001
From: Tathagata Das <tathagata.das1565@gmail.com>
Date: Fri, 30 Mar 2018 12:10:12 -0700
Subject: [PATCH 5/5] Added null checks to avoid flakiness

---
 .../org/apache/spark/sql/streaming/StreamTest.scala    | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
index 11ad84a46b83..00741d660dd2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
@@ -445,14 +445,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be
         }
       }
 
-      if (currentStream.isInstanceOf[MicroBatchExecution]) {
+      val lastExecution = currentStream.lastExecution
+      if (currentStream.isInstanceOf[MicroBatchExecution] && lastExecution != null) {
         // Verify if stateful operators have correct metadata and distribution
         // This can often catch hard to debug errors when developing stateful operators
-        val executedPlan = currentStream.lastExecution.executedPlan
-        executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
-          assert(
-            s.stateInfo.map(_.numPartitions).contains(currentStream.lastExecution.numStateStores))
-
+        lastExecution.executedPlan.collect { case s: StatefulOperator => s }.foreach { s =>
+          assert(s.stateInfo.map(_.numPartitions).contains(lastExecution.numStateStores))
           s.requiredChildDistribution.foreach { d =>
             withClue(s"$s specifies incorrect # partitions in requiredChildDistribution $d") {
               assert(d.requiredNumPartitions.isDefined)