-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-25250][CORE] : Late zombie task completions handled correctly even before new taskset launched #22806
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
5ad6efd
8667c28
a73f619
5509165
ee5bc68
7677aec
67e1644
f395b65
f7102ca
6709fe1
5234e87
9efbc58
6abd52c
89373af
231c51b
fcfe9f5
7ce6f10
0610939
929fbf9
393f901
52e832a
afbac96
024ec53
d2b7044
d6ac4a9
e9b363b
b55dbb0
551f412
28017ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -295,16 +295,17 @@ private[spark] class TaskSchedulerImpl( | |
| override def markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts( | ||
| partitionId: Int, stageId: Int): Unit = { | ||
| taskSetsByStageIdAndAttempt.getOrElse(stageId, Map()).values.foreach { tsm => | ||
| val index: Option[Int] = tsm.partitionToIndex.get(partitionId) | ||
| if (!index.isEmpty) { | ||
| tsm.markPartitionIdAsCompletedForTaskAttempt(index.get) | ||
| val taskInfoList = tsm.taskAttempts(index.get) | ||
| taskInfoList.foreach { taskInfo => | ||
| if (taskInfo.running) { | ||
| killTaskAttempt(taskInfo.taskId, false, "Corresponding Partition Id " + partitionId + | ||
| " has been marked as Completed") | ||
| tsm.partitionToIndex.get(partitionId) match { | ||
| case Some(index) => | ||
| tsm.markPartitionIdAsCompletedForTaskAttempt(index) | ||
|
||
| val taskInfoList = tsm.taskAttempts(index) | ||
| taskInfoList.filter(_.running).foreach { taskInfo => | ||
| killTaskAttempt(taskInfo.taskId, false, | ||
| s"Corresponding Partition ID $partitionId has been marked as Completed") | ||
| } | ||
| } | ||
|
|
||
| case None => | ||
| logError(s"No corresponding index found for partition ID $partitionId") | ||
|
||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,9 +18,9 @@ | |
| package org.apache.spark.scheduler | ||
|
|
||
| import java.nio.ByteBuffer | ||
| import java.util.HashSet | ||
|
|
||
| import scala.collection.mutable.HashMap | ||
| import scala.collection.mutable.Set | ||
| import scala.concurrent.duration._ | ||
|
|
||
| import org.mockito.Matchers.{anyInt, anyObject, anyString, eq => meq} | ||
|
|
@@ -40,7 +40,7 @@ class FakeSchedulerBackend extends SchedulerBackend { | |
| def reviveOffers() {} | ||
| def defaultParallelism(): Int = 1 | ||
| def maxNumConcurrentTasks(): Int = 0 | ||
| val killedTaskIds: HashSet[Long] = new HashSet[Long]() | ||
| val killedTaskIds: Set[Long] = Set[Long]() | ||
| override def killTask( | ||
| taskId: Long, | ||
|
||
| executorId: String, | ||
|
|
@@ -1328,22 +1328,30 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B | |
| tsm.handleFailedTask(tsm.taskAttempts.head.head.taskId, TaskState.FAILED, TaskKilled("test")) | ||
| assert(tsm.isZombie) | ||
| } | ||
|
|
||
| test("SPARK-25250 On successful completion of a task attempt on a partition id, kill other" + | ||
|
||
| " running task attempts on that same partition") { | ||
| val taskScheduler = setupSchedulerWithMockTaskSetBlacklist() | ||
|
|
||
| val firstAttempt = FakeTask.createTaskSet(10, stageAttemptId = 0) | ||
| taskScheduler.submitTasks(firstAttempt) | ||
|
|
||
| val offersFirstAttempt = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) } | ||
| taskScheduler.resourceOffers(offersFirstAttempt) | ||
|
|
||
| val tsm0 = taskScheduler.taskSetManagerForAttempt(0, 0).get | ||
| val matchingTaskInfoFirstAttempt = tsm0.taskAttempts(0).head | ||
| tsm0.handleFailedTask(matchingTaskInfoFirstAttempt.taskId, TaskState.FAILED, | ||
| FetchFailed(null, 0, 0, 0, "fetch failed")) | ||
|
|
||
| val secondAttempt = FakeTask.createTaskSet(10, stageAttemptId = 1) | ||
| taskScheduler.submitTasks(secondAttempt) | ||
|
|
||
| val offersSecondAttempt = (0 until 10).map{ idx => WorkerOffer(s"exec-$idx", s"host-$idx", 1) } | ||
| taskScheduler.resourceOffers(offersSecondAttempt) | ||
|
|
||
| taskScheduler.markPartitionIdAsCompletedAndKillCorrespondingTaskAttempts(2, 0) | ||
|
|
||
| val tsm1 = taskScheduler.taskSetManagerForAttempt(0, 1).get | ||
| val indexInTsm = tsm1.partitionToIndex(2) | ||
| val matchingTaskInfoSecondAttempt = tsm1.taskAttempts.flatten.filter(_.index == indexInTsm).head | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't this logic overlap with `killAllTaskAttempts`? Should it reuse that logic? I understand it does something a little different, and I don't know this code well, but it seems like there are related but separate implementations of something similar here.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As far as I understand the code,
killAllTaskAttemptskills all the running tasks for a particular stage whereasmarkPartitionIdAsCompletedAndKillCorrespondingTaskAttemptskills all running tasks for all stages and attempts working on a particular partition that has already been marked as completed by one of the previously running tasks for that corresponding partition. So the logic is different for both the cases, but we can modify the code to have one fixed method for performing both these tasks. Let me know what you think!