Pull request status: Closed

Changes from 1 commit (29 commits in this pull request)

Commits:
5ad6efd  [SPARK-25250] : On successful completion of a task attempt on a parti… (Oct 23, 2018)
8667c28  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Oct 23, 2018)
a73f619  [SPARK-25250] : Calling maybeFinishTaskSet() from method and adding c… (Dec 28, 2018)
5509165  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Dec 28, 2018)
ee5bc68  [SPARK-25250] : Fixing scalastyle tests (Dec 28, 2018)
7677aec  [SPARK-25250] : Addressing Reviews January 2, 2019 (Jan 2, 2019)
67e1644  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 8, 2019)
f395b65  [SPARK-25250] : Addressing Reviews January 8, 2019 (Jan 8, 2019)
f7102ca  [SPARK-25250] : Addressing Reviews January 9, 2019 (Jan 9, 2019)
6709fe1  [SPARK-25250] : Addressing Reviews January 10, 2019 (Jan 10, 2019)
5234e87  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 10, 2019)
9efbc58  [SPARK-25250] : Addressing Reviews January 15, 2019 (Jan 15, 2019)
6abd52c  [SPARK-25250] : Addressing Reviews January 15, 2019 - 2 (Jan 15, 2019)
89373af  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 16, 2019)
231c51b  [SPARK-25250] : Addressing Reviews January 16, 2019 (Jan 16, 2019)
fcfe9f5  [SPARK-25250] : Addressing Reviews January 18, 2019 (Jan 18, 2019)
7ce6f10  [SPARK-25250] : Adding unit test (Jan 22, 2019)
0610939  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 22, 2019)
929fbf9  [SPARK-25250] : Addressing Reviews January 24, 2019 (Jan 24, 2019)
393f901  [SPARK-25250] : Fixing Unit Tests (Jan 25, 2019)
52e832a  [SPARK-25250] : Addressing Reviews January 30, 2019 (Jan 30, 2019)
afbac96  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 30, 2019)
024ec53  [SPARK-25250] : Fixing Scalastyle Checks (Jan 30, 2019)
d2b7044  [SPARK-25250] : Addressing Minor Reviews January 30, 2019 (Jan 30, 2019)
d6ac4a9  [SPARK-25250] : Addressing Reviews January 31, 2019 (Jan 31, 2019)
e9b363b  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Feb 15, 2019)
b55dbb0  [SPARK-25250] : Restructuring PR and trying out a different solution (Feb 18, 2019)
551f412  [SPARK-25250] : Fixing indentation (Feb 18, 2019)
28017ed  [SPARK-25250] : Addressing Reviews February 19, 2019 (Feb 19, 2019)
[SPARK-25250] : Addressing Reviews January 8, 2019

Refactors the method name to completeTasks; the same method is now also called on task completion in a ShuffleMapStage, but without killing the remaining task attempts there.

pgandhi committed Jan 8, 2019
commit f395b6551732d67656676be9289f4436713c7ca6
@@ -1384,8 +1384,7 @@ private[spark] class DAGScheduler(
           if (!job.finished(rt.outputId)) {
             job.finished(rt.outputId) = true
             job.numFinished += 1
-            taskScheduler.markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-              task.partitionId, task.stageId)
+            taskScheduler.completeTasks(task.partitionId, task.stageId, true)
             // If the whole job has finished, remove it
             if (job.numFinished == job.numPartitions) {
               markStageAsFinished(resultStage)

@@ -1429,6 +1428,7 @@ private[spark] class DAGScheduler(
             val status = event.result.asInstanceOf[MapStatus]
             val execId = status.location.executorId
             logDebug("ShuffleMapTask finished on " + execId)
+            taskScheduler.completeTasks(task.partitionId, task.stageId, false)
Contributor: why not kill shuffle map task?

Member: +1

Author: As per @squito's earlier comment, it was agreed that in this PR we kill tasks only for the result stage (as an extension of SPARK-25773). I am not sure whether we can kill ShuffleMapTasks as well here, but I do not see any harm in doing so. @squito, WDYT?

Contributor: Sorry to be late to respond here, I have been traveling. This question has come up a lot, and while there are reasons to do it, there are some complications as well, and I don't think we should roll that change into this PR, which is trying to solve a different bug. In short, it has been argued in the past that a shuffle map task may still make useful progress on other tasks. There are also complications with handling tasks that don't respond well to killing (I think Hadoop input readers?). To be honest, I feel there is a stronger argument in favor of doing the killing now, though we'd probably want it behind a conf. So I'd be +1 for the change, just that it should be separate. (And I'm probably not recalling all of the gotchas with killing tasks at the moment, so maybe with a dedicated discussion on this we can dredge up all the cases we need to think through.)

Contributor:
> There are also complications with handling tasks that don't respond well to killing (I think Hadoop input readers?)

If that's the case, maybe we should not kill result tasks either, to be super safe.

             if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
               logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
             } else {
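Following the "behind a conf" suggestion in the thread above, here is a purely illustrative sketch of how killing redundant attempts for completed shuffle-map partitions could be gated behind a flag. The configuration key and object name below are made up for this sketch and are not part of Spark or of this PR.

```scala
import org.apache.spark.SparkConf

object KillGateSketch {
  // Hypothetical flag; NOT an existing Spark configuration.
  private val KillShuffleMapAttempts =
    "spark.scheduler.killTaskAttemptsForCompletedShuffleMapPartitions"

  // Result-stage attempts are killed as this PR already does; shuffle-map attempts
  // would only be killed when the (made-up) flag is turned on.
  def shouldKillAttempts(conf: SparkConf, isResultStage: Boolean): Boolean =
    isResultStage || conf.getBoolean(KillShuffleMapAttempts, defaultValue = false)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(loadDefaults = false).set(KillShuffleMapAttempts, "true")
    println(shouldKillAttempts(conf, isResultStage = false)) // true only when flag enabled
  }
}
```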
@@ -109,7 +109,6 @@ private[spark] trait TaskScheduler {
    */
   def applicationAttemptId(): Option[String]
 
-  def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-    partitionId: Int, stageId: Int): Unit
+  def completeTasks(partitionId: Int, stageId: Int, killTasks: Boolean): Unit
 
 }
@@ -288,20 +288,22 @@ private[spark] class TaskSchedulerImpl(
 
   /**
    * SPARK-25250: Whenever any Result Task gets successfully completed, we simply mark the
-   * corresponding partition id as completed in all attempts for that particular stage. As a
-   * result, we do not see any Killed tasks due to TaskCommitDenied Exceptions showing up
-   * in the UI.
+   * corresponding partition id as completed in all attempts for that particular stage and
+   * additionally, for a Result Stage, we also kill the remaining task attempts running on the
+   * same partition. As a result, we do not see any Killed tasks due to
+   * TaskCommitDenied Exceptions showing up in the UI.
Member: I think you should revise the comment, since we also call this method after a ShuffleMapTask succeeds now.

Author: Done

    */
-  override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-      partitionId: Int, stageId: Int): Unit = {
+  override def completeTasks(partitionId: Int, stageId: Int, killTasks: Boolean): Unit = {
     taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm =>
       tsm.partitionToIndex.get(partitionId) match {
         case Some(index) =>
           tsm.markPartitionIdAsCompletedForTaskAttempt(index)
Contributor: shall we call markPartitionCompleted instead of creating this new method?

Author: I had this in mind when I was writing the code, but a question popped into my head: should we really update the tasksSuccessful counter if we are killing those tasks? Wouldn't that be incorrect? Let me know your thoughts @cloud-fan @Ngone51.

Member: I think if tsm is the newest TaskSet, then updating tasksSuccessful is necessary, so that it has a chance to finish itself at this point. If not, I think it is not necessary, and even successful(index) = true is not, because we have already sent killing signals to those tasks, so zombie TaskSets will handle the failed task attempts in the end and finish normally. (But you told me above that tsm may not happen to be the newest TaskSet, so I'm not sure about it.)

Contributor: Sounds reasonable to me, but we should pick a better name to highlight this difference.

Author: I have renamed the method; please let me know if it still sounds wrong. Thank you.
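For illustration only, a small self-contained toy (not Spark's TaskSetManager) of the trade-off debated above: counting a completed partition toward task-set success versus merely flagging its index so it is not rescheduled. All names below are made up for this sketch.

```scala
// Toy model; it only demonstrates the bookkeeping difference discussed in the thread.
class ToyTaskSet(numTasks: Int) {
  private val successful = Array.fill(numTasks)(false)
  private var tasksSuccessful = 0

  // markPartitionCompleted-style: the completed partition counts toward finishing
  // this task set, so the success counter advances.
  def markCompletedAndCount(index: Int): Unit = {
    if (!successful(index)) {
      successful(index) = true
      tasksSuccessful += 1
    }
  }

  // The alternative raised above: only flag the index as completed (so no further
  // attempts are scheduled) without claiming a successful task for killed attempts.
  def markCompletedOnly(index: Int): Unit = {
    successful(index) = true
  }

  def isFullySuccessful: Boolean = tasksSuccessful == numTasks
}
```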

-          val taskInfoList = tsm.taskAttempts(index)
-          taskInfoList.filter(_.running).foreach { taskInfo =>
-            killTaskAttempt(taskInfo.taskId, false,
-              s"Corresponding Partition ID $partitionId has been marked as Completed")
+          if (killTasks) {
+            val taskInfoList = tsm.taskAttempts(index)
+            taskInfoList.filter(_.running).foreach { taskInfo =>
+              killTaskAttempt(taskInfo.taskId, false,
Contributor: We need a try-catch here. Not all backends support killing tasks, and we should not fail if killTaskAttempt throws an exception. Killing the task is just an optimization here, not a must-have.

Author: Makes sense, have updated the code.

s"Corresponding Partition ID $partitionId has been marked as Completed")
Contributor: We can simplify it to "Partition $partitionId is already completed".

Author: Done

+            }
           }
 
         case None =>
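The try-catch suggestion in the thread above treats the kill as best-effort: the completion path must not fail just because a backend cannot kill tasks. Below is a minimal, self-contained sketch of that pattern; the killTask stub and object name are stand-ins invented here, not the real TaskSchedulerImpl.killTaskAttempt.

```scala
import scala.util.control.NonFatal

object BestEffortKillSketch {
  // Stand-in for a scheduler backend's kill call; some backends do not support
  // killing and may throw.
  private def killTask(taskId: Long, reason: String): Unit =
    throw new UnsupportedOperationException(s"cannot kill task $taskId: unsupported backend")

  // Killing redundant attempts is an optimization, so failures are swallowed and
  // logged rather than propagated to the caller marking the partition completed.
  def tryKill(taskId: Long, partitionId: Int): Unit = {
    try {
      killTask(taskId, s"Partition $partitionId is already completed")
    } catch {
      case NonFatal(e) =>
        println(s"Best-effort kill of task $taskId failed: ${e.getMessage}")
    }
  }

  def main(args: Array[String]): Unit = tryKill(42L, 2)
}
```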
@@ -160,8 +160,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
     override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
     override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
     override def applicationAttemptId(): Option[String] = None
-    override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-      partitionId: Int, stageId: Int): Unit = {}
+    override def completeTasks(partitionId: Int, stageId: Int, killTasks: Boolean): Unit = {}
   }
 
   /** Length of time to wait while draining listener events. */

@@ -669,8 +668,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
       override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
       override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
       override def applicationAttemptId(): Option[String] = None
-      override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-        partitionId: Int, stageId: Int): Unit = {}
+      override def completeTasks(partitionId: Int, stageId: Int, killTasks: Boolean): Unit = {}
     }
     val noKillScheduler = new DAGScheduler(
       sc,
@@ -94,6 +94,5 @@ private class DummyTaskScheduler extends TaskScheduler {
       accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
       blockManagerId: BlockManagerId,
       executorMetrics: ExecutorMetrics): Boolean = true
-  override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
-    partitionId: Int, stageId: Int): Unit = {}
+  override def completeTasks(partitionId: Int, stageId: Int, killTasks: Boolean): Unit = {}
 }
@@ -1350,7 +1350,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
     val offersSecondAttempt = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) }
     taskScheduler.resourceOffers(offersSecondAttempt)
 
-    taskScheduler.markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(2, 0)
+    taskScheduler.completeTasks(2, 0, true)
 
     val tsm1 = taskScheduler.taskSetManagerForAttempt(0, 1).get
     val indexInTsm = tsm1.partitionToIndex(2)