35 commits
1d6b718  continuous shuffle read RDD (jose-torres, May 15, 2018)
b5d1008  docs (jose-torres, May 17, 2018)
af40769  Merge remote-tracking branch 'apache/master' into readerRddMaster (jose-torres, May 17, 2018)
46456dc  fix ctor (jose-torres, May 17, 2018)
2ea8a6f  multiple partition test (jose-torres, May 17, 2018)
955ac79  unset task context after test (jose-torres, May 17, 2018)
8cefb72  conf from RDD (jose-torres, May 18, 2018)
f91bfe7  endpoint name (jose-torres, May 18, 2018)
2590292  testing bool (jose-torres, May 18, 2018)
859e6e4  tests (jose-torres, May 18, 2018)
b23b7bb  take instead of poll (jose-torres, May 18, 2018)
97f7e8f  add interface (jose-torres, May 18, 2018)
de21b1c  clarify comment (jose-torres, May 18, 2018)
7dcf51a  multiple (jose-torres, May 18, 2018)
ad0b5aa  writer with 1 reader partition (jose-torres, May 25, 2018)
c9adee5  docs and iface (jose-torres, May 25, 2018)
63d38d8  Merge remote-tracking branch 'apache/master' into writerTask (jose-torres, May 25, 2018)
331f437  increment epoch (jose-torres, May 25, 2018)
f3ce675  undo oop (jose-torres, May 25, 2018)
e0108d7  make rdd loop (jose-torres, May 25, 2018)
f400651  remote write RDD (jose-torres, May 25, 2018)
1aaad8d  rename classes (jose-torres, May 25, 2018)
59890d4  combine suites (jose-torres, May 25, 2018)
af1508c  fully rm old suite (jose-torres, May 25, 2018)
65837ac  reorder tests (jose-torres, May 29, 2018)
a68fae2  return future (jose-torres, May 31, 2018)
98d55e4  finish getting rid of old name (jose-torres, May 31, 2018)
e6b9118  synchronous (jose-torres, May 31, 2018)
629455b  finish rename (jose-torres, May 31, 2018)
cb6d42b  add timeouts (jose-torres, Jun 13, 2018)
59d6ff7  unalign (jose-torres, Jun 13, 2018)
f90388c  add note (jose-torres, Jun 13, 2018)
4bbdeae  parallel (jose-torres, Jun 13, 2018)
e57531d  fix compile (jose-torres, Jun 13, 2018)
cff37c4  fix compile (jose-torres, Jun 13, 2018)
Merge remote-tracking branch 'apache/master' into writerTask
jose-torres committed May 25, 2018
commit 63d38d849107eed226449cec8d24c2241cd583c9
@@ -25,13 +25,18 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.NextIterator

case class ContinuousShuffleReadPartition(index: Int, queueSize: Int, numShuffleWriters: Int)
case class ContinuousShuffleReadPartition(
index: Int,
queueSize: Int,
numShuffleWriters: Int,
epochIntervalMs: Long)
extends Partition {
// Initialized only on the executor, and only once even as we call compute() multiple times.
lazy val (reader: ContinuousShuffleReader, endpoint) = {
val env = SparkEnv.get.rpcEnv
val receiver = new UnsafeRowReceiver(queueSize, numShuffleWriters, env)
val endpoint = env.setupEndpoint(s"UnsafeRowReceiver-${UUID.randomUUID().toString}", receiver)
val receiver = new UnsafeRowReceiver(queueSize, numShuffleWriters, epochIntervalMs, env)
val endpoint = env.setupEndpoint(s"UnsafeRowReceiver-${UUID.randomUUID()}", receiver)

TaskContext.get().addTaskCompletionListener { ctx =>
env.stop(endpoint)
}
@@ -43,15 +48,24 @@ case class ContinuousShuffleReadPartition(index: Int, queueSize: Int, numShuffle
 * RDD at the reduce side of each continuous processing shuffle task. Upstream tasks send their
* shuffle output to the wrapped receivers in partitions of this RDD; each of the RDD's tasks
 * polls from its receiver until an epoch marker is sent.
*
* @param sc the RDD context
* @param numPartitions the number of read partitions for this RDD
* @param queueSize the size of the row buffers to use
* @param numShuffleWriters the number of continuous shuffle writers feeding into this RDD
* @param epochIntervalMs the checkpoint interval of the streaming query
*/
class ContinuousShuffleReadRDD(sc: SparkContext, numPartitions: Int, numShuffleWriters: Int = 1)
extends RDD[UnsafeRow](sc, Nil) {

private val queueSize = sc.conf.get(SQLConf.CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE)
class ContinuousShuffleReadRDD(
sc: SparkContext,
numPartitions: Int,
queueSize: Int = 1024,
numShuffleWriters: Int = 1,
epochIntervalMs: Long = 1000)
extends RDD[UnsafeRow](sc, Nil) {

override protected def getPartitions: Array[Partition] = {
(0 until numPartitions).map { partIndex =>
ContinuousShuffleReadPartition(partIndex, queueSize, numShuffleWriters)
ContinuousShuffleReadPartition(partIndex, queueSize, numShuffleWriters, epochIntervalMs)
}.toArray
}

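To make the new parameters concrete, here is a rough usage sketch in the style of this PR's test suite, not code from the diff. `sc` is assumed to be an existing SparkContext, the `unsafeRow` helper is hypothetical, and delivering messages with `askSync` assumes the receiver endpoint replies to `ReceiverRow` / `ReceiverEpochMarker`, which the condensed diff does not show.

```scala
// Assumes this runs in (or alongside) the shuffle package, since the classes are private[shuffle].
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{DataType, IntegerType}

// Hypothetical helper: build a single-int-column UnsafeRow.
def unsafeRow(value: Int): UnsafeRow =
  UnsafeProjection.create(Array(IntegerType: DataType))(InternalRow(value))

// Two reader partitions fed by two writers, exercising the new queueSize / epochIntervalMs knobs.
val rdd = new ContinuousShuffleReadRDD(
  sc, numPartitions = 2, queueSize = 256, numShuffleWriters = 2, epochIntervalMs = 500L)

// The receiver endpoint hangs off the partition. The lazy val registers a task completion
// listener, so the PR's tests install a dummy TaskContext before touching it.
val part = rdd.partitions(0).asInstanceOf[ContinuousShuffleReadPartition]

// Each writer sends rows tagged with its writerId, then an epoch marker (assumed ask-style delivery).
part.endpoint.askSync[Unit](ReceiverRow(0, unsafeRow(111)))
part.endpoint.askSync[Unit](ReceiverRow(1, unsafeRow(222)))
part.endpoint.askSync[Unit](ReceiverEpochMarker(0))
part.endpoint.askSync[Unit](ReceiverEpochMarker(1))

// The reader drains both writers' queues and ends the iterator at the epoch boundary.
val valuesThisEpoch = part.reader.read().map(_.getInt(0)).toSeq // Seq(111, 222) in some order
```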
@@ -18,9 +18,7 @@
package org.apache.spark.sql.execution.streaming.continuous.shuffle

import java.util.concurrent._
import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}

import scala.concurrent.Future
import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.internal.Logging
import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
@@ -29,6 +27,10 @@ import org.apache.spark.util.NextIterator

/**
* Messages for the UnsafeRowReceiver endpoint. Either an incoming row or an epoch marker.
*
* Each message comes tagged with writerId, identifying which writer the message is coming
* from. The receiver will only begin the next epoch once all writers have sent an epoch
* marker ending the current epoch.
*/
private[shuffle] sealed trait UnsafeRowReceiverMessage extends Serializable {
def writerId: Int
@@ -47,6 +49,7 @@ private[shuffle] case class ReceiverEpochMarker(writerId: Int) extends UnsafeRow
private[shuffle] class UnsafeRowReceiver(
queueSize: Int,
numShuffleWriters: Int,
epochIntervalMs: Long,
override val rpcEnv: RpcEnv)
extends ThreadSafeRpcEndpoint with ContinuousShuffleReader with Logging {
// Note that this queue will be drained from the main task thread and populated in the RPC
@@ -70,7 +73,8 @@ private[shuffle] class UnsafeRowReceiver(

override def read(): Iterator[UnsafeRow] = {
new NextIterator[UnsafeRow] {
private val numWriterEpochMarkers = new AtomicInteger(0)
// An array of flags for whether each writer ID has gotten an epoch marker.
private val writerEpochMarkersReceived = Array.fill(numShuffleWriters)(false)

private val executor = Executors.newFixedThreadPool(numShuffleWriters)
private val completion = new ExecutorCompletionService[UnsafeRowReceiverMessage](executor)
@@ -79,26 +83,49 @@
override def call(): UnsafeRowReceiverMessage = queues(writerId).take()
}

// Initialize by submitting tasks to read the first row from each writer.
(0 until numShuffleWriters).foreach(writerId => completion.submit(completionTask(writerId)))

/**
* In each call to getNext(), we pull the next row available in the completion queue, and then
* submit another task to read the next row from the writer which returned it.
*
* When a writer sends an epoch marker, we note that it's finished and don't submit another
* task for it in this epoch. The iterator is over once all writers have sent an epoch marker.
*/
override def getNext(): UnsafeRow = {
completion.take().get() match {
case ReceiverRow(writerId, r) =>
// Start reading the next element in the queue we just took from.
completion.submit(completionTask(writerId))
r
// TODO use writerId
case ReceiverEpochMarker(writerId) =>
// Don't read any more from this queue. If all the writers have sent epoch markers,
// the epoch is over; otherwise we need rows from one of the remaining writers.
val writersCompleted = numWriterEpochMarkers.incrementAndGet()
if (writersCompleted == numShuffleWriters) {
finished = true
null
} else {
getNext()
var nextRow: UnsafeRow = null
while (!finished && nextRow == null) {
completion.poll(epochIntervalMs, TimeUnit.MILLISECONDS) match {
case null =>
// Try again if the poll didn't wait long enough to get a real result.
// But we should be getting at least an epoch marker every checkpoint interval.
val writerIdsUncommitted = writerEpochMarkersReceived.zipWithIndex.collect {
case (flag, idx) if !flag => idx
}
logWarning(
s"Completion service failed to make progress after $epochIntervalMs ms. Waiting " +
s"for writers $writerIdsUncommitted to send epoch markers.")

// The completion service guarantees this future will be available immediately.
case future => future.get() match {
case ReceiverRow(writerId, r) =>
// Start reading the next element in the queue we just took from.
completion.submit(completionTask(writerId))
nextRow = r
case ReceiverEpochMarker(writerId) =>
// Don't read any more from this queue. If all the writers have sent epoch markers,
// the epoch is over; otherwise we need to loop again to poll from the remaining
// writers.
writerEpochMarkersReceived(writerId) = true
if (writerEpochMarkersReceived.forall(_ == true)) {
finished = true
}
}
}
}

nextRow
}

override def close(): Unit = {
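The epoch-merging loop in read() above generalizes beyond Spark. Below is a minimal, self-contained sketch of the same pattern, not the Spark implementation and with all names invented for illustration: one bounded queue per writer, an ExecutorCompletionService that keeps one blocking take outstanding per queue, and epoch markers that retire writers until every writer has reported.

```scala
import java.util.concurrent.{ArrayBlockingQueue, Callable, ExecutorCompletionService, Executors, TimeUnit}

sealed trait WriterMessage { def writerId: Int }
case class WriterRow(writerId: Int, value: String) extends WriterMessage
case class WriterEpochMarker(writerId: Int) extends WriterMessage

class EpochMergingReader(numWriters: Int, queueSize: Int, pollTimeoutMs: Long) {
  // One bounded queue per upstream writer; in the real reader these are fed by an RPC endpoint.
  val queues: Array[ArrayBlockingQueue[WriterMessage]] =
    Array.fill(numWriters)(new ArrayBlockingQueue[WriterMessage](queueSize))

  /** Drain all queues until every writer has sent an epoch marker; return the rows seen. */
  def readEpoch(): Seq[String] = {
    val executor = Executors.newFixedThreadPool(numWriters)
    val completion = new ExecutorCompletionService[WriterMessage](executor)
    def takeTask(writerId: Int): Callable[WriterMessage] = new Callable[WriterMessage] {
      override def call(): WriterMessage = queues(writerId).take()
    }
    // Prime the completion service with one blocking take per writer.
    (0 until numWriters).foreach(id => completion.submit(takeTask(id)))

    val markerSeen = Array.fill(numWriters)(false)
    val rows = Seq.newBuilder[String]
    try {
      while (!markerSeen.forall(identity)) {
        completion.poll(pollTimeoutMs, TimeUnit.MILLISECONDS) match {
          case null =>
            // Timed out; the real reader just logs which writers have not committed yet.
            val waiting = markerSeen.zipWithIndex.collect { case (false, id) => id }
            println(s"still waiting for writers ${waiting.mkString(",")}")
          case future => future.get() match {
            case WriterRow(writerId, value) =>
              rows += value
              completion.submit(takeTask(writerId)) // keep reading from this writer
            case WriterEpochMarker(writerId) =>
              markerSeen(writerId) = true // retire this writer until the next epoch
          }
        }
      }
    } finally {
      executor.shutdownNow()
    }
    rows.result()
  }
}

// Usage: two writers each send one row and then an epoch marker.
val reader = new EpochMergingReader(numWriters = 2, queueSize = 16, pollTimeoutMs = 1000)
reader.queues(0).put(WriterRow(0, "a"))
reader.queues(1).put(WriterRow(1, "b"))
reader.queues(0).put(WriterEpochMarker(0))
reader.queues(1).put(WriterEpochMarker(1))
println(reader.readEpoch()) // List(a, b), in arrival order
```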