
Commit 1127ca1

address comments of zsxwing
1 parent d92adfc commit 1127ca1

2 files changed: +12 -26 lines changed


core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 2 additions & 1 deletion
@@ -1235,13 +1235,13 @@ class DAGScheduler(
       case FetchFailed(bmAddress, shuffleId, mapId, reduceId, failureMessage) =>
         val failedStage = stageIdToStage(task.stageId)
         val mapStage = shuffleIdToMapStage(shuffleId)
-        var abortedStage = false

         if (failedStage.latestInfo.attemptId != task.stageAttemptId) {
           logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" +
             s" ${task.stageAttemptId} and there is a more recent attempt for that stage " +
             s"(attempt ID ${failedStage.latestInfo.attemptId}) running")
         } else {
+          var abortedStage = false
           // It is likely that we receive multiple FetchFailed for a single stage (because we have
           // multiple tasks running concurrently on different executors). In that case, it is
           // possible the fetch failure has already been handled by the scheduler.
@@ -1257,6 +1257,7 @@ class DAGScheduler(
           if (disallowStageRetryForTest) {
             abortStage(failedStage, "Fetch failure will not retry stage due to testing config",
               None)
+            abortedStage = true
           } else if (failedStage.failedOnFetchAndShouldAbort(task.stageAttemptId)) {
             abortStage(failedStage, s"$failedStage (${failedStage.name}) " +
               s"has failed the maximum allowable number of " +

core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala

Lines changed: 10 additions & 25 deletions
@@ -18,12 +18,9 @@
 package org.apache.spark.scheduler

 import java.util.Properties
-import java.util.concurrent.Executors

 import scala.annotation.meta.param
 import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet, Map}
-import scala.concurrent.{ExecutionContext, Future}
-import scala.concurrent.duration.DurationConversions
 import scala.language.reflectiveCalls
 import scala.util.control.NonFatal

@@ -37,7 +34,7 @@ import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
 import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.shuffle.MetadataFetchFailedException
 import org.apache.spark.storage.{BlockId, BlockManagerId, BlockManagerMaster}
-import org.apache.spark.util._
+import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, CallSite, LongAccumulator, Utils}

 class DAGSchedulerEventProcessLoopTester(dagScheduler: DAGScheduler)
   extends DAGSchedulerEventProcessLoop(dagScheduler) {
@@ -2110,12 +2107,8 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with Timeou
   }

   test("The failed stage never resubmitted due to abort stage in another thread") {
-    implicit val executorContext = ExecutionContext
-      .fromExecutorService(Executors.newFixedThreadPool(5))
-    val duration = 60.seconds
-
-    val f1 = Future {
-      try {
+    failAfter(60.seconds) {
+      val e = intercept[SparkException] {
         val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2).map(x => (x, 1)).groupByKey()
         val shuffleHandle =
           rdd1.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]].shuffleHandle
@@ -2125,14 +2118,14 @@
           BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0, 0, "test")
         case (x, _) => x
       }.count()
-      } catch {
-        case e: Throwable =>
-          logInfo("expected abort stage1: " + e.getMessage)
       }
+      assert(e.getMessage.contains("org.apache.spark.shuffle.FetchFailedException"))
     }
-    ThreadUtils.awaitResult(f1, duration)
-    val f2 = Future {
-      try {
+
+    // The following job that fails due to fetching failure will hang without
+    // the fix for SPARK-17644
+    failAfter(60.seconds) {
+      val e = intercept[SparkException] {
         val rdd2 = sc.makeRDD(Array(1, 2, 3, 4), 2).map(x => (x, 1)).groupByKey()
         val shuffleHandle =
           rdd2.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]].shuffleHandle
@@ -2142,17 +2135,9 @@
           BlockManagerId("1", "1", 1), shuffleHandle.shuffleId, 0, 0, "test")
         case (x, _) => x
       }.count()
-      } catch {
-        case e: Throwable =>
-          logInfo("expected abort stage2: " + e.getMessage)
       }
+      assert(e.getMessage.contains("org.apache.spark.shuffle.FetchFailedException"))
     }
-    try {
-      ThreadUtils.awaitResult(f2, duration)
-    } catch {
-      case e: Throwable => fail("The failed stage never resubmitted")
-    }
-    executorContext.shutdown()
   }

   /**
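The test rewrite swaps a hand-rolled thread pool, Future, and try/catch for ScalaTest's failAfter and intercept, which both bound the runtime and assert on the thrown exception. A self-contained sketch of the pattern, assuming the ScalaTest 2.x-era Timeouts trait (renamed TimeLimits in later ScalaTest releases); the suite and exception here are illustrative, not the real ones:

import org.scalatest.FunSuite
import org.scalatest.concurrent.Timeouts
import org.scalatest.time.SpanSugar._

class TimeoutPatternSuite extends FunSuite with Timeouts {
  test("a failing job should fail fast instead of hanging") {
    // failAfter fails the test if the body does not finish in time,
    // which is how the suite would detect the pre-fix hang.
    failAfter(60.seconds) {
      // intercept returns the thrown exception so its message can be checked.
      val e = intercept[RuntimeException] {
        throw new RuntimeException("wrapped: FetchFailedException")
      }
      assert(e.getMessage.contains("FetchFailedException"))
    }
  }
}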
