29 commits
5ad6efd  [SPARK-25250] : On successful completion of a task attempt on a parti… (Oct 23, 2018)
8667c28  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Oct 23, 2018)
a73f619  [SPARK-25250] : Calling maybeFinishTaskSet() from method and adding c… (Dec 28, 2018)
5509165  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Dec 28, 2018)
ee5bc68  [SPARK-25250] : Fixing scalastyle tests (Dec 28, 2018)
7677aec  [SPARK-25250] : Addressing Reviews January 2, 2019 (Jan 2, 2019)
67e1644  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 8, 2019)
f395b65  [SPARK-25250] : Addressing Reviews January 8, 2019 (Jan 8, 2019)
f7102ca  [SPARK-25250] : Addressing Reviews January 9, 2019 (Jan 9, 2019)
6709fe1  [SPARK-25250] : Addressing Reviews January 10, 2019 (Jan 10, 2019)
5234e87  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 10, 2019)
9efbc58  [SPARK-25250] : Addressing Reviews January 15, 2019 (Jan 15, 2019)
6abd52c  [SPARK-25250] : Addressing Reviews January 15, 2019 - 2 (Jan 15, 2019)
89373af  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 16, 2019)
231c51b  [SPARK-25250] : Addressing Reviews January 16, 2019 (Jan 16, 2019)
fcfe9f5  [SPARK-25250] : Addressing Reviews January 18, 2019 (Jan 18, 2019)
7ce6f10  [SPARK-25250] : Adding unit test (Jan 22, 2019)
0610939  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 22, 2019)
929fbf9  [SPARK-25250] : Addressing Reviews January 24, 2019 (Jan 24, 2019)
393f901  [SPARK-25250] : Fixing Unit Tests (Jan 25, 2019)
52e832a  [SPARK-25250] : Addressing Reviews January 30, 2019 (Jan 30, 2019)
afbac96  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Jan 30, 2019)
024ec53  [SPARK-25250] : Fixing Scalastyle Checks (Jan 30, 2019)
d2b7044  [SPARK-25250] : Addressing Minor Reviews January 30, 2019 (Jan 30, 2019)
d6ac4a9  [SPARK-25250] : Addressing Reviews January 31, 2019 (Jan 31, 2019)
e9b363b  Merge branch 'master' of https://github.com/pgandhi999/spark into SPA… (Feb 15, 2019)
b55dbb0  [SPARK-25250] : Restructuring PR and trying out a different solution (Feb 18, 2019)
551f412  [SPARK-25250] : Fixing indentation (Feb 18, 2019)
28017ed  [SPARK-25250] : Addressing Reviews February 19, 2019 (Feb 19, 2019)
[SPARK-25250] : Adding unit test
pgandhi committed Jan 22, 2019
commit 7ce6f104980a0cbb0a3d017d40003983f09b0c58
DAGSchedulerSuite.scala
@@ -133,6 +133,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
/** Stages for which the DAGScheduler has called TaskScheduler.cancelTasks(). */
val cancelledStages = new HashSet[Int]()

val completedPartitions = new HashMap[Int, HashSet[Int]]()

val taskScheduler = new TaskScheduler() {
override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
@@ -160,7 +162,15 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
override def applicationAttemptId(): Option[String] = None
override def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {}
// Since the method completeTasks in TaskSchedulerImpl.scala marks the partition complete
// for all stage attempts of the given stage id, it does not need any info about
// stageAttemptId. Hence, completed partition ids are stored only per stage id to mock
// the method implementation here (a hedged sketch of the real behavior follows this mock).
override def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {
val partitionIds = completedPartitions.getOrElseUpdate(stageId, new HashSet[Int])
partitionIds.add(partitionId)
completedPartitions.put(stageId, partitionIds)
Contributor:
You don't need the last put; you're updating a mutable HashSet.
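
To illustrate the point, a minimal, self-contained Scala sketch (the object name and values are invented here) of why the last put is redundant with a mutable map of mutable sets:

import scala.collection.mutable.{HashMap, HashSet}

object GetOrElseUpdateDemo extends App {
  val completedPartitions = new HashMap[Int, HashSet[Int]]()
  // getOrElseUpdate stores the new empty set under the key and returns that
  // same mutable instance, so adds to it are already visible through the map.
  val partitionIds = completedPartitions.getOrElseUpdate(0, new HashSet[Int])
  partitionIds.add(7)
  assert(completedPartitions(0).contains(7)) // holds without a second put
}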

Author:
Done

}
}
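
For context, a purely hypothetical sketch of the TaskSchedulerImpl behavior this mock stands in for; taskSetsByStageIdAndAttempt and markPartitionCompleted are assumed names, not confirmed from this PR:

// Hypothetical sketch only, not the PR's actual implementation.
def completeTasks(partitionId: Int, stageId: Int, taskInfo: TaskInfo): Unit = {
  // Mark the partition complete in every attempt (zombie or not) of this stage,
  // which is why the mock above only needs to key by stage id.
  taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts =>
    attempts.values.foreach(_.markPartitionCompleted(partitionId, taskInfo))
  }
}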

/** Length of time to wait while draining listener events. */
@@ -249,6 +259,7 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
cancelledStages.clear()
cacheLocations.clear()
results.clear()
completedPartitions.clear()
securityMgr = new SecurityManager(conf)
broadcastManager = new BroadcastManager(true, conf, securityMgr)
mapOutputTracker = new MapOutputTrackerMaster(conf, broadcastManager, true) {
@@ -2851,6 +2862,40 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi
}
}

test("SPARK-25250: Late zombie task completions handled correctly even before" +
" new taskset launched") {
val shuffleMapRdd = new MyRDD(sc, 4, Nil)
val shuffleDep = new ShuffleDependency(shuffleMapRdd, new HashPartitioner(4))
val reduceRdd = new MyRDD(sc, 4, List(shuffleDep), tracker = mapOutputTracker)
submit(reduceRdd, Array(0, 1, 2, 3))

completeShuffleMapStageSuccessfully(0, 0, numShufflePartitions = 4)

// Fail Stage 1 Attempt 0 with Fetch Failure
runEvent(makeCompletionEvent(
taskSets(1).tasks(0),
FetchFailed(makeBlockManagerId("hostA"), shuffleDep.shuffleId, 0, 0, "ignored"),
null))

// this will trigger a resubmission of stage 0, since we've lost some of its
// map output, for the next iteration through the loop
scheduler.resubmitFailedStages()
completeShuffleMapStageSuccessfully(0, 1, numShufflePartitions = 4)

runEvent(makeCompletionEvent(
taskSets(1).tasks(3), Success, Nil, Nil))
assert(completedPartitions.get(taskSets(3).stageId).get.contains(
Contributor:
This assert doesn't add anything over the one above -- you've already asserted the stage is the same for both taskSets, and checked the entire set.

And just FYI, map.get(key).get can just be map(key).
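
Likewise, a tiny sketch of the map.get(key).get versus map(key) point (throwaway names and values):

import scala.collection.mutable.HashMap

object MapApplyDemo extends App {
  val m = HashMap("stage0" -> 42)
  // m("stage0") and m.get("stage0").get are equivalent for a present key;
  // both throw NoSuchElementException when the key is absent.
  assert(m.get("stage0").get == m("stage0"))
}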

Author:
Yeah, it does not add anything new, but it ensures that the partition id for stage attempt 1 is not complete beforehand, so that after the corresponding task in stage attempt 0 is marked complete, the result is reflected for stage attempt 1 as well.

Contributor:
But you're not actually checking that. The communication between stage attempts has been mocked out here; you're giving the illusion of checking it.

Author:
Done

taskSets(3).tasks(1).partitionId) == false, "Corresponding partition id for" +
" stage 1 attempt 1 is not complete yet")
Contributor:
A better check would be to make sure you have exactly the right set of completed partitions. I think it would also be good to add some more comments and a check on the test setup, something like:

// tasksets 1 & 3 should be two different attempts for our reduce stage -- lets double-check test setup
val reduceStage = taskSets(1).stageId
assert(taskSets(3).stageId === reduceStage)

// complete one task from the original taskset, make sure we update the taskSchedulerImpl so it can notify
// all taskSetManagers.  Some of that is mocked here, just check there is the right event.
val taskToComplete = taskSets(1).tasks(3)
runEvent(makeCompletionEvent(taskToComplete, Success, Nil, Nil))
assert(completedPartitions.getOrElse(reduceStage, Set()) === Set(taskToComplete.partitionId))

Author:
Done


// this will mark partition id 1 of stage 1 attempt 0 as complete. So we expect the status
// of that partition id to be reflected for stage 1 attempt 1 as well.
runEvent(makeCompletionEvent(
taskSets(1).tasks(1), Success, Nil, Nil))
assert(completedPartitions.get(taskSets(3).stageId).get.contains(
taskSets(3).tasks(1).partitionId) == true)
Contributor:
I was originally thinking that we'd get rid of the test from #21131 in TaskSchedulerImplSuite, but now I see it tests a bunch of stuff in TaskSchedulerImpl and TSM which gets mocked here, so I guess we need to keep it. It might be worthwhile at least adding a comment here mentioning that this goes along with "Completions in zombie tasksets update status of non-zombie taskset" in TaskSchedulerImplSuite.

This isn't really checking that the update happens inside the event loop... but I guess that's OK; I dunno if it's really worth trying to test that exactly.

Author:
Have added the comment. I have simply tried to mock the method completeTasks to test that the event loop is marking completed partitions correctly. As for the update happening inside the event loop: since we are not killing tasks anymore, writing a test for that might be complicated. Dunno. Your suggestions are welcome.

Contributor:
Again, I'd just check the full Set here rather than the existence of one particular partition. You've already checked that the stage ids are the same for both taskSets. So this can be:

assert(completedPartitions(reduceStage) === Set(taskSets(1).tasks(1).partitionId))

I know these changes might make it seem like we're not testing interaction between the taskSets, but really that was the case before as well; this just makes it easier to follow. That's kinda what bugs me about how the whole TaskSchedulerImpl part gets mocked out here, but anyway we can set that aside for now.

Author:
Done, have added the required check.

Contributor:
You added the check against the set below, but you can also remove the check above for containing a specific element, as that's implied by the full-Set equality. Same goes for all the checks for individual elements of the set in this test.

Also FYI, instead of assert(foo.contains(...) == true), just do assert(foo.contains(...)) (and assert(!foo.contains(...)) instead of == false).
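
For illustration, the two style points side by side in a small ScalaTest-style snippet (the set is made up; === comes from ScalaTest's Assertions):

import org.scalatest.Assertions._

object AssertStyleDemo extends App {
  val completed = Set(1, 3)
  assert(completed.contains(1))    // rather than assert(... == true)
  assert(!completed.contains(2))   // rather than assert(... == false)
  assert(completed === Set(1, 3))  // full-set equality subsumes the element checks
}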

Author:
Done, removed redundant assert statements.

}

/**
* Assert that the supplied TaskSet has exactly the given hosts as its preferred locations.
* Note that this checks only the host and not the executor ID.
TaskSchedulerImplSuite.scala
@@ -20,7 +20,6 @@ package org.apache.spark.scheduler
import java.nio.ByteBuffer

import scala.collection.mutable.HashMap
import scala.collection.mutable.Set
import scala.concurrent.duration._

import org.mockito.ArgumentMatchers.{any, anyInt, anyString, eq => meq}
@@ -40,14 +39,6 @@ class FakeSchedulerBackend extends SchedulerBackend {
def reviveOffers() {}
def defaultParallelism(): Int = 1
def maxNumConcurrentTasks(): Int = 0
val killedTaskIds: Set[Long] = Set[Long]()
override def killTask(
taskId: Long,
executorId: String,
interruptThread: Boolean,
reason: String): Unit = {
killedTaskIds.add(taskId)
}
}

class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with BeforeAndAfterEach