[SPARK-25250][CORE] : Late zombie task completions handled correctly even before new taskset launched #22806
DAGScheduler.scala

```diff
@@ -1383,6 +1383,8 @@ private[spark] class DAGScheduler(
           if (!job.finished(rt.outputId)) {
             job.finished(rt.outputId) = true
             job.numFinished += 1
+            taskScheduler.markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
+              task.partitionId, task.stageId)
             // If the whole job has finished, remove it
             if (job.numFinished == job.numPartitions) {
               markStageAsFinished(resultStage)
```
TaskScheduler.scala

```diff
@@ -109,4 +109,7 @@ private[spark] trait TaskScheduler {
    */
   def applicationAttemptId(): Option[String]
 
+  def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
+      partitionId: Int, stageId: Int): Unit
+
 }
```
TaskSchedulerImpl.scala

```diff
@@ -286,6 +286,29 @@ private[spark] class TaskSchedulerImpl(
     }
   }
 
+  /**
+   * SPARK-25250: Whenever any Result Task gets successfully completed, we simply mark the
+   * corresponding partition id as completed in all attempts for that particular stage. As a
+   * result, we do not see any Killed tasks due to TaskCommitDenied Exceptions showing up
+   * in the UI.
+   */
+  override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(
+      partitionId: Int, stageId: Int): Unit = {
+    taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm =>
+      val index: Option[Int] = tsm.partitionToIndex.get(partitionId)
+      if (!index.isEmpty) {
+        tsm.markPartitionIdAsCompletedForTaskAttempt(index.get)
+        val taskInfoList = tsm.taskAttempts(index.get)
+        taskInfoList.foreach { taskInfo =>
+          if (taskInfo.running) {
+            killTaskAttempt(taskInfo.taskId, false, "Corresponding Partition Id " + partitionId +
+              " has been marked as Completed")
+          }
+        }
+      }
+    }
+  }
+
   /**
    * Called to indicate that all task attempts (including speculated tasks) associated with the
    * given TaskSetManager have completed, so state associated with the TaskSetManager should be
```
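The doc comment in the hunk above describes the core mechanism: when a partition finishes in any attempt of a stage, every other attempt of that stage marks it successful and kills its own still-running copies of that partition. Below is a simplified, self-contained sketch of the same idea; the object, class, and field names are invented for illustration and are not Spark's actual scheduler types.

```scala
import scala.collection.mutable

object ZombieAttemptDemo {

  final case class RunningTask(taskId: Long, partitionId: Int)

  // One stage attempt: which partition indices have succeeded, and which
  // task attempts are still running.
  final class StageAttempt(val attemptId: Int) {
    val successful = mutable.Set[Int]()
    val running = mutable.Buffer[RunningTask]()
  }

  // All attempts (the original plus any retries) that currently exist for one stage.
  val attemptsForStage = mutable.Buffer[StageAttempt]()

  // Mirrors the intent of markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts:
  // mark the partition done in every attempt and kill duplicate running tasks.
  def markPartitionCompleted(partitionId: Int, kill: Long => Unit): Unit = {
    attemptsForStage.foreach { attempt =>
      attempt.successful += partitionId
      val duplicates = attempt.running.filter(_.partitionId == partitionId)
      duplicates.foreach(t => kill(t.taskId))
      attempt.running --= duplicates
    }
  }

  def main(args: Array[String]): Unit = {
    val zombie = new StageAttempt(0) // superseded attempt, still has tasks running
    val retry = new StageAttempt(1)  // attempt re-submitted after a fetch failure
    zombie.running += RunningTask(taskId = 1L, partitionId = 2)
    retry.running += RunningTask(taskId = 11L, partitionId = 2)
    attemptsForStage ++= Seq(zombie, retry)

    // A late completion from the zombie attempt finishes partition 2; the
    // retry's duplicate task 11 is killed instead of failing later with a
    // TaskCommitDenied exception.
    markPartitionCompleted(2, id => println(s"killing task $id"))
    assert(retry.successful.contains(2) && retry.running.isEmpty)
  }
}
```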
TaskSetManager.scala

```diff
@@ -1096,6 +1096,11 @@ private[spark] class TaskSetManager(
   def executorAdded() {
     recomputeLocality()
   }
+
+  def markPartitionIdAsCompletedForTaskAttempt(index: Int): Unit = {
+    successful(index) = true
+    maybeFinishTaskSet()
+  }
 }
 
 private[spark] object TaskSetManager {
```
TaskSchedulerImplSuite.scala

```diff
@@ -18,6 +18,7 @@
 package org.apache.spark.scheduler
 
 import java.nio.ByteBuffer
+import java.util.HashSet
 
 import scala.collection.mutable.HashMap
 import scala.concurrent.duration._
@@ -39,6 +40,14 @@ class FakeSchedulerBackend extends SchedulerBackend {
   def reviveOffers() {}
   def defaultParallelism(): Int = 1
   def maxNumConcurrentTasks(): Int = 0
+  val killedTaskIds: HashSet[Long] = new HashSet[Long]()
+  override def killTask(
+      taskId: Long,
+      executorId: String,
+      interruptThread: Boolean,
+      reason: String): Unit = {
+    killedTaskIds.add(taskId)
+  }
 }
 
 class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfterEach
@@ -1319,4 +1328,26 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
     tsm.handleFailedTask(tsm.taskAttempts.head.head.taskId, TaskState.FAILED, TaskKilled("test"))
     assert(tsm.isZombie)
   }
+  test("SPARK-25250 On successful completion of a task attempt on a partition id, kill other" +
+    " running task attempts on that same partition") {
+    val taskScheduler = setupSchedulerWithMockTaskSetBlacklist()
+    val firstAttempt = FakeTask.createTaskSet(10, stageAttemptId = 0)
+    taskScheduler.submitTasks(firstAttempt)
+    val offersFirstAttempt = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) }
+    taskScheduler.resourceOffers(offersFirstAttempt)
+    val tsm0 = taskScheduler.taskSetManagerForAttempt(0, 0).get
+    val matchingTaskInfoFirstAttempt = tsm0.taskAttempts(0).head
+    tsm0.handleFailedTask(matchingTaskInfoFirstAttempt.taskId, TaskState.FAILED,
+      FetchFailed(null, 0, 0, 0, "fetch failed"))
+    val secondAttempt = FakeTask.createTaskSet(10, stageAttemptId = 1)
+    taskScheduler.submitTasks(secondAttempt)
+    val offersSecondAttempt = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) }
+    taskScheduler.resourceOffers(offersSecondAttempt)
+    taskScheduler.markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(2, 0)
+    val tsm1 = taskScheduler.taskSetManagerForAttempt(0, 1).get
+    val indexInTsm = tsm1.partitionToIndex(2)
+    val matchingTaskInfoSecondAttempt = tsm1.taskAttempts.flatten.filter(_.index == indexInTsm).head
+    assert(taskScheduler.backend.asInstanceOf[FakeSchedulerBackend].killedTaskIds.contains(
+      matchingTaskInfoSecondAttempt.taskId))
+  }
 }
```
Does this require that the newest `TaskSet` has been created in `TaskSetScheduler` at this point? If so, can we make sure of that?
No, this method does not require that the newest TaskSet be created.
If not, how does it know about the completed partition? Or say, is it possible this happens during the time when we have put the stage into the resubmit queue but haven't submitted it to `TaskScheduler` yet (so there's no newest TaskSet)?
Ahh, I see what you are saying. Yes, this method assumes that the newest TaskSet has been created. Indeed, a tiny possibility exists that when this method gets called, the new TaskSet might not yet have been added to `taskSetsByStageIdAndAttempt`. However, when I wrote the code, my assumption was that there is always a small delay before the task completion event propagates to the DAGScheduler. I have tested this code by reproducing FetchFailures multiple times, and whenever this method was called the new TaskSet was always present, so I have not yet seen the race condition described above occur, whereas before this fix I was able to reproduce the bug roughly 4 out of 5 times. Still, a valid point.
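For reference, the window being discussed corresponds to the `getOrElse(stageId, Map())` in the new `TaskSchedulerImpl` method: if the re-submitted TaskSet has not yet been registered in `taskSetsByStageIdAndAttempt`, the notification is silently dropped rather than remembered. A tiny standalone sketch of that behaviour, with a plain `HashMap[Int, HashMap[Int, String]]` standing in for the real map of TaskSetManagers:

```scala
import scala.collection.mutable.HashMap

object RaceWindowSketch extends App {
  // Stand-in for TaskSchedulerImpl.taskSetsByStageIdAndAttempt:
  // stageId -> (stageAttemptId -> TaskSetManager), with Strings as managers here.
  val taskSetsByStageIdAndAttempt = new HashMap[Int, HashMap[Int, String]]()

  def markPartitionCompleted(partitionId: Int, stageId: Int): Unit = {
    // Same shape as the new method: if the retried stage's TaskSet has not been
    // registered yet, getOrElse yields an empty map, the loop body never runs,
    // and the completion is lost, so the new attempt will still re-run that
    // partition and may later see a TaskCommitDenied failure.
    taskSetsByStageIdAndAttempt.getOrElse(stageId, HashMap.empty[Int, String]).values.foreach {
      tsm => println(s"marking partition $partitionId as completed in $tsm")
    }
  }

  // Called during the window where no TaskSet is registered: prints nothing.
  markPartitionCompleted(partitionId = 2, stageId = 0)
}
```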
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I agree this could be an extremely rare case, and your work has proved that this PR is really an effective fix.
And as I mentioned above, if we could record this completed partition in `TaskScheduler` at this point and tell other TaskSets about the completed partitions once they're created in `TaskScheduler`, we could avoid the potential issue entirely. But I think we don't need to do this right now, since your fix has already proved to be effective. Instead, how about leaving some comments to explain the potential issue? That way we can easily fix it once we really hit it.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Makes sense, have updated the comment. Thank you.
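For illustration, here is a rough sketch of the alternative suggested above: a scheduler-side record of finished partitions per stage that is replayed whenever a retried TaskSet is submitted. The class and method names below are invented for this example and do not come from this PR.

```scala
import scala.collection.mutable

// Illustrative only: remember finished partitions per stage even when no
// TaskSet for that stage is currently registered, and replay them to any
// TaskSet submitted later, so a retried attempt never re-runs those partitions.
class CompletedPartitionTracker {
  private val finishedPartitionsByStage = mutable.HashMap[Int, mutable.BitSet]()

  // Called whenever a result task for (stageId, partitionId) succeeds.
  def partitionCompleted(stageId: Int, partitionId: Int): Unit = {
    finishedPartitionsByStage.getOrElseUpdate(stageId, mutable.BitSet()) += partitionId
  }

  // Called when a new TaskSet for stageId is submitted; markSuccessful would be
  // backed by something like TaskSetManager.markPartitionIdAsCompletedForTaskAttempt.
  def replayTo(stageId: Int, markSuccessful: Int => Unit): Unit = {
    finishedPartitionsByStage.getOrElse(stageId, mutable.BitSet()).foreach(markSuccessful)
  }

  // Called when the stage finishes, to avoid leaking state.
  def stageFinished(stageId: Int): Unit = {
    finishedPartitionsByStage -= stageId
  }
}

object CompletedPartitionTrackerDemo extends App {
  val tracker = new CompletedPartitionTracker
  // A late completion arrives before the retried TaskSet exists: it is recorded.
  tracker.partitionCompleted(stageId = 0, partitionId = 2)
  // Later, when the retried TaskSet for stage 0 is created, the record is replayed.
  tracker.replayTo(stageId = 0, markSuccessful = p => println(s"partition $p already completed"))
}
```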