29 commits
5ad6efd  [SPARK-25250] : On successful completion of a task attempt on a parti… (Oct 23, 2018)
8667c28  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Oct 23, 2018)
a73f619  [SPARK-25250] : Calling maybeFinishTaskSet() from method and adding c… (Dec 28, 2018)
5509165  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Dec 28, 2018)
ee5bc68  [SPARK-25250] : Fixing scalastyle tests (Dec 28, 2018)
7677aec  [SPARK-25250] : Addressing Reviews January 2, 2019 (Jan 2, 2019)
67e1644  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 8, 2019)
f395b65  [SPARK-25250] : Addressing Reviews January 8, 2019 (Jan 8, 2019)
f7102ca  [SPARK-25250] : Addressing Reviews January 9, 2019 (Jan 9, 2019)
6709fe1  [SPARK-25250] : Addressing Reviews January 10, 2019 (Jan 10, 2019)
5234e87  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 10, 2019)
9efbc58  [SPARK-25250] : Addressing Reviews January 15, 2019 (Jan 15, 2019)
6abd52c  [SPARK-25250] : Addressing Reviews January 15, 2019 - 2 (Jan 15, 2019)
89373af  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 16, 2019)
231c51b  [SPARK-25250] : Addressing Reviews January 16, 2019 (Jan 16, 2019)
fcfe9f5  [SPARK-25250] : Addressing Reviews January 18, 2019 (Jan 18, 2019)
7ce6f10  [SPARK-25250] : Adding unit test (Jan 22, 2019)
0610939  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 22, 2019)
929fbf9  [SPARK-25250] : Addressing Reviews January 24, 2019 (Jan 24, 2019)
393f901  [SPARK-25250] : Fixing Unit Tests (Jan 25, 2019)
52e832a  [SPARK-25250] : Addressing Reviews January 30, 2019 (Jan 30, 2019)
afbac96  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 30, 2019)
024ec53  [SPARK-25250] : Fixing Scalastyle Checks (Jan 30, 2019)
d2b7044  [SPARK-25250] : Addressing Minor Reviews January 30, 2019 (Jan 30, 2019)
d6ac4a9  [SPARK-25250] : Addressing Reviews January 31, 2019 (Jan 31, 2019)
e9b363b  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Feb 15, 2019)
b55dbb0  [SPARK-25250] : Restructuring PR and trying out a different solution (Feb 18, 2019)
551f412  [SPARK-25250] : Fixing indentation (Feb 18, 2019)
28017ed  [SPARK-25250] : Addressing Reviews February 19, 2019 (Feb 19, 2019)
[SPARK-25250] : Adding unit test
pgandhi committed Jan 22, 2019
commit 7ce6f104980a0cbb0a3d017d40003983f09b0c58
DAGSchedulerSuite.scala
@@ -133,6 +133,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
/** Stages for which the DAGScheduler has called TaskScheduler.cancelTasks(). */
val cancelledStages = new HashSet[Int]()

val completedPartitions = new HashMap[Int, HashSet[Int]]()

val taskScheduler = new TaskScheduler() {
override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
@@ -160,7 +162,15 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
override def applicationAttemptId(): Option[String] = None
override def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {}
// Since the method completeTasks in TaskSchedulerImpl.scala marks the partition complete
// for all stage attempts of the given stage id, it does not need any info about
// stageAttemptId. Hence, completed partition ids are stored only per stage id to mock
// the method implementation here (a hedged sketch of the real behavior follows this mock).
override def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {
val partitionIds = completedPartitions.getOrElseUpdate(stageId, new HashSet[Int])
partitionIds.add(partitionId)
completedPartitions.put(stageId, partitionIds)
Contributor:
You don't need the last put; you're updating a mutable HashSet.
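
To illustrate the point, a minimal, self-contained Scala sketch (the object name and values are invented here) of why the last put is redundant with a mutable map of mutable sets:

import scala.collection.mutable.{HashMap, HashSet}

object GetOrElseUpdateDemo extends App {
  val completedPartitions = new HashMap[Int, HashSet[Int]]()
  // getOrElseUpdate stores the new empty set under the key and returns that
  // same mutable instance, so adds to it are already visible through the map.
  val partitionIds = completedPartitions.getOrElseUpdate(0, new HashSet[Int])
  partitionIds.add(7)
  assert(completedPartitions(0).contains(7)) // holds without a second put
}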

Author:
Done

}
}
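
For context, a purely hypothetical sketch of the TaskSchedulerImpl behavior this mock stands in for; taskSetsByStageIdAndAttempt and markPartitionCompleted are assumed names, not confirmed from this PR:

// Hypothetical sketch only, not the PR's actual implementation.
def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {
  // Mark the partition complete in every attempt (zombie or not) of this stage,
  // which is why the mock above only needs to key by stage id.
  taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts =>
    attempts.values.foreach(_.markPartitionCompleted(partitionId, taskInfo))
  }
}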

/** Length of time to wait while draining listener events. */
@@ -249,6 +259,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
cancelledStages.clear()
cacheLocations.clear()
results.clear()
completedPartitions.clear()
securityMgr = new SecurityManager(conf)
broadcastManager = new BroadcastManager(true, conf, securityMgr)
mapOutputTracker = new MapOutputTrackerMaster(conf, broadcastManager, true) {
@@ -2851,6 +2862,40 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
}
}

test("SPARK-25250: Late zombie task completions handled correctly even before" +
" new taskset launched") {
val shuffleMapRdd = new MyRDD(sc, 4, Nil)
val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(4))
val reduceRdd = new MyRDD(sc, 4, List(shuffleDep), tracker = mapOutputTracker)
submit(reduceRdd, Array(0, 1, 2, 3))

completeShuffleMapStageSuccessfully(0, 0, numShufflePartitions = 4)

// Fail Stage 1 Attempt 0 with Fetch Failure
runEvent(makeCompletionEvent(
taskSets(1).tasks(0),
FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0, 0, "ignored"),
null))

// this will trigger a resubmission of stage 0, since we've lost some of its
// map output, for the next iteration through the loop
scheduler.resubmitFailedStages()
completeShuffleMapStageSuccessfully(0, 1, numShufflePartitions = 4)

runEvent(makeCompletionEvent(
taskSets(1).tasks(3), Success, Nil, Nil))
assert(completedPartitions.get(taskSets(3).stageId).get.contains(
Contributor:
This assert doesn't add anything over the one above -- you've already asserted the stage is the same for both taskSets, and checked the entire set.

And just FYI, map.get(key).get can just be map(key).
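
Likewise, a tiny sketch of the map.get(key).get versus map(key) point (throwaway names and values):

import scala.collection.mutable.HashMap

object MapApplyDemo extends App {
  val m = HashMap("stage0" -> 42)
  // m("stage0") and m.get("stage0").get are equivalent for a present key;
  // both throw NoSuchElementException when the key is absent.
  assert(m.get("stage0").get == m("stage0"))
}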

Author:
Yeah, it does not add anything new, but it ensures that the partition id for stage attempt 1 is not complete beforehand, so that after the corresponding task in stage attempt 0 is marked complete, the result is reflected for stage attempt 1 as well.

Contributor:
But you're not actually checking that. The communication between stage attempts has been mocked out here; you're giving the illusion of checking it.

Author:
Done

taskSets(3).tasks(1).partitionId) == false, "Corresponding partition id for" +
" stage 1 attempt 1 is not complete yet")
Contributor:
A better check would be to make sure you have exactly the right set of completed partitions. I think it would also be good to add some more comments and a check on the test setup, something like:

// tasksets 1 & 3 should be two different attempts for our reduce stage -- lets double-check test setup
val reduceStage = taskSets(1).stageId
assert(taskSets(3).stageId === reduceStage)

// complete one task from the original taskset, make sure we update the taskSchedulerImpl so it can notify
// all taskSetManagers.  Some of that is mocked here, just check there is the right event.
val taskToComplete = taskSets(1).tasks(3)
runEvent(makeCompletionEvent(taskToComplete, Success, Nil, Nil))
assert(completedPartitions.getOrElse(reduceStage, Set()) === Set(taskToComplete.partitionId))

Author:
Done


// this will mark partition id 1 of stage 1 attempt 0 as complete. So we expect the status
// of that partition id to be reflected for stage 1 attempt 1 as well.
runEvent(makeCompletionEvent(
taskSets(1).tasks(1), Success, Nil, Nil))
assert(completedPartitions.get(taskSets(3).stageId).get.contains(
taskSets(3).tasks(1).partitionId) == true)
Contributor:
I was originally thinking that we'd get rid of the test from #21131 in TaskSchedulerImplSuite, but now I see it tests a bunch of stuff in TaskSchedulerImpl and TSM which gets mocked here, so I guess we need to keep it. It might be worthwhile at least adding a comment here mentioning that this goes along with "Completions in zombie tasksets update status of non-zombie taskset" in TaskSchedulerImplSuite.

This isn't really checking that the update happens inside the event loop... but I guess that's OK; I dunno if it's really worth trying to test that exactly.

Author:
Have added the comment. I have simply tried to mock the method completeTasks to test that the event loop is marking completed partitions correctly. As for the update happening inside the event loop: since we are not killing tasks anymore, writing a test for that might be complicated. Dunno. Your suggestions are welcome.

Contributor:
Again, I'd just check the full Set here rather than the existence of one particular partition. You've already checked that the stage ids are the same for both taskSets. So this can be:

assert(completedPartitions(reduceStage) === Set(taskSets(1).tasks(1).partitionId))

I know these changes might make it seem like we're not testing interaction between the taskSets, but really that was the case before as well; this just makes it easier to follow. That's kinda what bugs me about how the whole TaskSchedulerImpl part gets mocked out here, but anyway we can set that aside for now.

Author:
Done, have added the required check.

Contributor:
You added the check against the set below, but you can also remove the check above for containing a specific element, as that's implied by the full-Set equality. Same goes for all the checks for individual elements of the set in this test.

Also FYI, instead of assert(foo.contains(...) == true), just do assert(foo.contains(...)) (and assert(!foo.contains(...)) instead of == false).
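
For illustration, the two style points side by side in a small ScalaTest-style snippet (the set is made up; === comes from ScalaTest's Assertions):

import org.scalatest.Assertions._

object AssertStyleDemo extends App {
  val completed = Set(1, 3)
  assert(completed.contains(1))    // rather than assert(... == true)
  assert(!completed.contains(2))   // rather than assert(... == false)
  assert(completed === Set(1, 3))  // full-set equality subsumes the element checks
}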

Author:
Done, removed redundant assert statements.

}

/**
* Assert that the supplied TaskSet has exactly the given hosts as its preferred locations.
* Note that this checks only the host and not the executor ID.
TaskSchedulerImplSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.scheduler
import java.nio.ByteBuffer

import scala.collection.mutable.HashMap
import scala.collection.mutable.Set
import scala.concurrent.duration._

import org.mockito.ArgumentMatchers.{any, anyInt, anyString, eq => meq}
@@ -40,14 +39,6 @@ class FakeSchedulerBackend extends SchedulerBackend {
def reviveOffers() {}
def defaultParallelism(): Int = 1
def maxNumConcurrentTasks(): Int = 0
val killedTaskIds: Set[Long] = Set[Long]()
override def killTask(
taskId: Long,
executorId: String,
interruptThread: Boolean,
reason: String): Unit = {
killedTaskIds.add(taskId)
}
}

class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfterEach