Skip to content

Commit b78c294

Browse files
wangyum authored and committed (via GitHub Enterprise)
[CARMEL-7492][CARMEL-6086] DAGScheduler exit because of jobId not exist (apache#162)
1 parent 6072e7c commit b78c294

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,7 @@ private[spark] class DAGScheduler(
10051005
* @param job The job whose state to cleanup.
10061006
*/
10071007
private def cleanupStateForJobAndIndependentStages(job: ActiveJob): Unit = {
1008+
logInfo(s"Clean up job:${job.jobId} state")
10081009
val registeredStages = jobIdToStageIds.get(job.jobId)
10091010
if (registeredStages.isEmpty || registeredStages.get.isEmpty) {
10101011
logError("No stages registered for job " + job.jobId)
@@ -1883,6 +1884,8 @@ private[spark] class DAGScheduler(
18831884
eventProcessLoop.post(SubmitMissingTask(
18841885
stage,
18851886
jobId,
1887+
properties,
1888+
partitionsToCompute,
18861889
taskIdToLocations,
18871890
taskBinary,
18881891
partitions))
@@ -1891,6 +1894,8 @@ private[spark] class DAGScheduler(
18911894
SubmitMissingTask(
18921895
stage,
18931896
jobId,
1897+
properties,
1898+
partitionsToCompute,
18941899
taskIdToLocations,
18951900
taskBinary,
18961901
partitions))
@@ -1930,18 +1935,18 @@ private[spark] class DAGScheduler(
19301935
// If we reach here, it is very possible the job was already cancelled.
19311936
return
19321937
}
1933-
val partitionsToCompute: Seq[Int] = stage.findMissingPartitions()
1938+
19341939
// Use the scheduling pool, job group, description, etc. from an ActiveJob associated
19351940
// with this Stage
1936-
val properties = jobIdToActiveJob(missingTask.jobId).properties
1941+
val properties = missingTask.properties
19371942
val artifacts = jobIdToActiveJob(missingTask.jobId).artifacts
19381943

19391944
val tasks: Seq[Task[_]] = try {
19401945
val serializedTaskMetrics = closureSerializer.serialize(stage.latestInfo.taskMetrics).array()
19411946
stage match {
19421947
case stage: ShuffleMapStage =>
19431948
stage.pendingPartitions.clear()
1944-
partitionsToCompute.map { id =>
1949+
missingTask.partitionsToCompute.map { id =>
19451950
val locs = missingTask.taskIdToLocations(id)
19461951
val part = missingTask.partitions(id)
19471952
stage.pendingPartitions += id
@@ -1952,7 +1957,7 @@ private[spark] class DAGScheduler(
19521957
}
19531958

19541959
case stage: ResultStage =>
1955-
partitionsToCompute.map { id =>
1960+
missingTask.partitionsToCompute.map { id =>
19561961
val p: Int = stage.partitions(id)
19571962
val part = missingTask.partitions(p)
19581963
val locs = missingTask.taskIdToLocations(id)

core/src/main/scala/org/apache/spark/scheduler/DAGSchedulerEvent.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ case class SpeculativeTaskSubmitted(task: Task[_], taskIndex: Int = -1) extends
113113
private[scheduler] case class SubmitMissingTask(
114114
stage: Stage,
115115
jobId: Int,
116+
properties: Properties,
117+
partitionsToCompute: Seq[Int],
116118
taskIdToLocations: scala.collection.Map[Int, Seq[TaskLocation]],
117119
taskBinary: Broadcast[Array[Byte]],
118120
partitions: Array[Partition]) extends DAGSchedulerEvent

0 commit comments

Comments (0)