Commit 55aa8bc

Merge branch 'master' into SPARK-23288
2 parents 22e6ca1 + 25c2776

98 files changed, +3141 -1829 lines changed

core/src/main/resources/org/apache/spark/ui/static/webui.js

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ $(function() {
   collapseTablePageLoad('collapse-aggregated-allActiveStages','aggregated-allActiveStages');
   collapseTablePageLoad('collapse-aggregated-allPendingStages','aggregated-allPendingStages');
   collapseTablePageLoad('collapse-aggregated-allCompletedStages','aggregated-allCompletedStages');
+  collapseTablePageLoad('collapse-aggregated-allSkippedStages','aggregated-allSkippedStages');
   collapseTablePageLoad('collapse-aggregated-allFailedStages','aggregated-allFailedStages');
   collapseTablePageLoad('collapse-aggregated-activeStages','aggregated-activeStages');
   collapseTablePageLoad('collapse-aggregated-pendingOrSkippedStages','aggregated-pendingOrSkippedStages');

core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala

Lines changed: 8 additions & 7 deletions
@@ -55,18 +55,18 @@ private[spark] trait ExecutorAllocationClient {
   /**
    * Request that the cluster manager kill the specified executors.
    *
-   * When asking the executor to be replaced, the executor loss is considered a failure, and
-   * killed tasks that are running on the executor will count towards the failure limits. If no
-   * replacement is being requested, then the tasks will not count towards the limit.
-   *
    * @param executorIds identifiers of executors to kill
-   * @param replace whether to replace the killed executors with new ones, default false
+   * @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
+   *                                 after these executors have been killed
+   * @param countFailures if there are tasks running on the executors when they are killed, whether
+   *                      to count those failures toward task failure limits
    * @param force whether to force kill busy executors, default false
    * @return the ids of the executors acknowledged by the cluster manager to be removed.
    */
   def killExecutors(
     executorIds: Seq[String],
-    replace: Boolean = false,
+    adjustTargetNumExecutors: Boolean,
+    countFailures: Boolean,
     force: Boolean = false): Seq[String]
 
   /**
@@ -81,7 +81,8 @@ private[spark] trait ExecutorAllocationClient {
    * @return whether the request is acknowledged by the cluster manager.
    */
   def killExecutor(executorId: String): Boolean = {
-    val killedExecutors = killExecutors(Seq(executorId))
+    val killedExecutors = killExecutors(Seq(executorId), adjustTargetNumExecutors = true,
+      countFailures = false)
     killedExecutors.nonEmpty && killedExecutors(0).equals(executorId)
   }
 }

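The call sites updated in the rest of this commit boil down to a few flag combinations. The sketch below is illustrative only (ExecutorAllocationClient is private[spark], so it compiles only inside Spark's own package, and the `client` value and executor ids are placeholders); the values mirror the call sites changed elsewhere in this commit.

    import org.apache.spark.ExecutorAllocationClient

    val client: ExecutorAllocationClient = ???  // placeholder for a real scheduler backend

    // Dynamic allocation scaling down: the target was already lowered when the task
    // backlog shrank, and killing an idle executor is not a task failure.
    client.killExecutors(Seq("7"), adjustTargetNumExecutors = false, countFailures = false,
      force = false)

    // User-requested kill (SparkContext.killExecutors): shrink the target too, so the
    // cluster manager does not immediately allocate a replacement.
    client.killExecutors(Seq("7"), adjustTargetNumExecutors = true, countFailures = false,
      force = true)

    // Kill-and-replace (SparkContext.killAndReplaceExecutor): keep the target so a
    // replacement comes up, and count the killed tasks toward failure limits.
    client.killExecutors(Seq("7"), adjustTargetNumExecutors = false, countFailures = true,
      force = true)
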
core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 15 additions & 5 deletions
@@ -29,6 +29,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.{DYN_ALLOCATION_MAX_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS}
 import org.apache.spark.metrics.source.Source
 import org.apache.spark.scheduler._
+import org.apache.spark.storage.BlockManagerMaster
 import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}
 
 /**
@@ -81,7 +82,8 @@ import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}
 private[spark] class ExecutorAllocationManager(
     client: ExecutorAllocationClient,
     listenerBus: LiveListenerBus,
-    conf: SparkConf)
+    conf: SparkConf,
+    blockManagerMaster: BlockManagerMaster)
   extends Logging {
 
   allocationManager =>
@@ -151,7 +153,7 @@ private[spark] class ExecutorAllocationManager(
   private var clock: Clock = new SystemClock()
 
   // Listener for Spark events that impact the allocation policy
-  private val listener = new ExecutorAllocationListener
+  val listener = new ExecutorAllocationListener
 
   // Executor that handles the scheduling task.
   private val executor =
@@ -334,6 +336,11 @@ private[spark] class ExecutorAllocationManager(
 
     // If the new target has not changed, avoid sending a message to the cluster manager
     if (numExecutorsTarget < oldNumExecutorsTarget) {
+      // We lower the target number of executors but don't actively kill any yet. Killing is
+      // controlled separately by an idle timeout. It's still helpful to reduce the target number
+      // in case an executor just happens to get lost (eg., bad hardware, or the cluster manager
+      // preempts it) -- in that case, there is no point in trying to immediately get a new
+      // executor, since we wouldn't even use it yet.
       client.requestTotalExecutors(numExecutorsTarget, localityAwareTasks, hostToLocalTaskCount)
       logDebug(s"Lowering target number of executors to $numExecutorsTarget (previously " +
         s"$oldNumExecutorsTarget) because not all requested executors are actually needed")
@@ -455,7 +462,10 @@ private[spark] class ExecutorAllocationManager(
     val executorsRemoved = if (testing) {
       executorIdsToBeRemoved
     } else {
-      client.killExecutors(executorIdsToBeRemoved)
+      // We don't want to change our target number of executors, because we already did that
+      // when the task backlog decreased.
+      client.killExecutors(executorIdsToBeRemoved, adjustTargetNumExecutors = false,
+        countFailures = false, force = false)
     }
     // [SPARK-21834] killExecutors api reduces the target number of executors.
     // So we need to update the target with desired value.
@@ -575,7 +585,7 @@ private[spark] class ExecutorAllocationManager(
       // Note that it is not necessary to query the executors since all the cached
       // blocks we are concerned with are reported to the driver. Note that this
       // does not include broadcast blocks.
-      val hasCachedBlocks = SparkEnv.get.blockManager.master.hasCachedBlocks(executorId)
+      val hasCachedBlocks = blockManagerMaster.hasCachedBlocks(executorId)
       val now = clock.getTimeMillis()
       val timeout = {
         if (hasCachedBlocks) {
@@ -610,7 +620,7 @@ private[spark] class ExecutorAllocationManager(
    * This class is intentionally conservative in its assumptions about the relative ordering
    * and consistency of events returned by the listener.
    */
-  private class ExecutorAllocationListener extends SparkListener {
+  private[spark] class ExecutorAllocationListener extends SparkListener {
 
     private val stageIdToNumTasks = new mutable.HashMap[Int, Int]
     // Number of running tasks per stage including speculative tasks.

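Injecting BlockManagerMaster into the manager (instead of reaching through the SparkEnv singleton) is what the hasCachedBlocks hunk above relies on. The helper below is a sketch only, with made-up parameter names; it shows the pattern of choosing an idle timeout based on whether the driver still knows of cached blocks on the executor.

    import org.apache.spark.storage.BlockManagerMaster

    // Illustrative helper, not Spark code: pick the idle timeout for an executor.
    def pickIdleTimeoutMs(
        blockManagerMaster: BlockManagerMaster,
        executorId: String,
        idleTimeoutMs: Long,        // e.g. value of spark.dynamicAllocation.executorIdleTimeout
        cachedIdleTimeoutMs: Long   // e.g. value of spark.dynamicAllocation.cachedExecutorIdleTimeout
      ): Long = {
      // Cached blocks are reported to the driver, so no executor round-trip is needed
      // (broadcast blocks are not included in this check).
      if (blockManagerMaster.hasCachedBlocks(executorId)) cachedIdleTimeoutMs else idleTimeoutMs
    }
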
core/src/main/scala/org/apache/spark/SecurityManager.scala

Lines changed: 13 additions & 6 deletions
@@ -520,19 +520,25 @@ private[spark] class SecurityManager(
    *
    * If authentication is disabled, do nothing.
    *
-   * In YARN mode, generate a new secret and store it in the current user's credentials.
+   * In YARN and local mode, generate a new secret and store it in the current user's credentials.
    *
    * In other modes, assert that the auth secret is set in the configuration.
    */
   def initializeAuth(): Unit = {
+    import SparkMasterRegex._
+
     if (!sparkConf.get(NETWORK_AUTH_ENABLED)) {
       return
     }
 
-    if (sparkConf.get(SparkLauncher.SPARK_MASTER, null) != "yarn") {
-      require(sparkConf.contains(SPARK_AUTH_SECRET_CONF),
-        s"A secret key must be specified via the $SPARK_AUTH_SECRET_CONF config.")
-      return
+    val master = sparkConf.get(SparkLauncher.SPARK_MASTER, "")
+    master match {
+      case "yarn" | "local" | LOCAL_N_REGEX(_) | LOCAL_N_FAILURES_REGEX(_, _) =>
+        // Secret generation allowed here
+      case _ =>
+        require(sparkConf.contains(SPARK_AUTH_SECRET_CONF),
+          s"A secret key must be specified via the $SPARK_AUTH_SECRET_CONF config.")
+        return
     }
 
     val rnd = new SecureRandom()
@@ -541,7 +547,8 @@ private[spark] class SecurityManager(
     rnd.nextBytes(secretBytes)
 
     val creds = new Credentials()
-    creds.addSecretKey(SECRET_LOOKUP_KEY, secretBytes)
+    val secretStr = HashCodes.fromBytes(secretBytes).toString()
+    creds.addSecretKey(SECRET_LOOKUP_KEY, secretStr.getBytes(UTF_8))
     UserGroupInformation.getCurrentUser().addCredentials(creds)
   }

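Secret auto-generation is now keyed off the master URL. The standalone sketch below uses inline copies of the local-mode patterns (an assumption for illustration; the real ones live in SparkMasterRegex) to show which master strings fall through to the "generate a secret" branch.

    // Illustrative only: recognizing master URLs that allow auto-generating an auth secret.
    object MasterMatchSketch {
      private val LocalN = """local\[([0-9]+|\*)\]""".r                          // e.g. "local[4]", "local[*]"
      private val LocalNFailures = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r   // e.g. "local[4,2]"

      def canGenerateSecret(master: String): Boolean = master match {
        case "yarn" | "local" | LocalN(_) | LocalNFailures(_, _) => true
        case _ => false  // caller must instead require an explicitly configured secret
      }
    }

    // MasterMatchSketch.canGenerateSecret("local[2]")      => true
    // MasterMatchSketch.canGenerateSecret("spark://host")  => false
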
core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 10 additions & 3 deletions
@@ -534,7 +534,8 @@ class SparkContext(config: SparkConf) extends Logging {
       schedulerBackend match {
         case b: ExecutorAllocationClient =>
           Some(new ExecutorAllocationManager(
-            schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf))
+            schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf,
+            _env.blockManager.master))
         case _ =>
           None
       }
@@ -1633,6 +1634,8 @@ class SparkContext(config: SparkConf) extends Logging {
    * :: DeveloperApi ::
    * Request that the cluster manager kill the specified executors.
    *
+   * This is not supported when dynamic allocation is turned on.
+   *
    * @note This is an indication to the cluster manager that the application wishes to adjust
    * its resource usage downwards. If the application wishes to replace the executors it kills
    * through this method with new ones, it should follow up explicitly with a call to
@@ -1644,7 +1647,10 @@ class SparkContext(config: SparkConf) extends Logging {
   def killExecutors(executorIds: Seq[String]): Boolean = {
     schedulerBackend match {
       case b: ExecutorAllocationClient =>
-        b.killExecutors(executorIds, replace = false, force = true).nonEmpty
+        require(executorAllocationManager.isEmpty,
+          "killExecutors() unsupported with Dynamic Allocation turned on")
+        b.killExecutors(executorIds, adjustTargetNumExecutors = true, countFailures = false,
+          force = true).nonEmpty
       case _ =>
         logWarning("Killing executors is not supported by current scheduler.")
         false
@@ -1682,7 +1688,8 @@ class SparkContext(config: SparkConf) extends Logging {
   private[spark] def killAndReplaceExecutor(executorId: String): Boolean = {
     schedulerBackend match {
       case b: ExecutorAllocationClient =>
-        b.killExecutors(Seq(executorId), replace = true, force = true).nonEmpty
+        b.killExecutors(Seq(executorId), adjustTargetNumExecutors = false, countFailures = true,
+          force = true).nonEmpty
       case _ =>
         logWarning("Killing executors is not supported by current scheduler.")
         false

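For callers of the developer API, the visible change is the new require(). A brief, hypothetical usage sketch (the executor ids are made up):

    // Only valid when dynamic allocation is off; with it on, this now throws
    // IllegalArgumentException instead of silently fighting the allocation manager
    // over the executor target.
    val sc: org.apache.spark.SparkContext = ???  // an existing context
    sc.killExecutors(Seq("1", "2"))
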
core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 26 additions & 3 deletions
@@ -17,7 +17,7 @@
 
 package org.apache.spark.api.python
 
-import java.io.{DataInputStream, DataOutputStream, InputStream, OutputStreamWriter}
+import java.io.{DataInputStream, DataOutputStream, EOFException, InputStream, OutputStreamWriter}
 import java.net.{InetAddress, ServerSocket, Socket, SocketException}
 import java.nio.charset.StandardCharsets
 import java.util.Arrays
@@ -182,7 +182,8 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
 
       try {
         // Create and start the daemon
-        val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", daemonModule))
+        val command = Arrays.asList(pythonExec, "-m", daemonModule)
+        val pb = new ProcessBuilder(command)
         val workerEnv = pb.environment()
         workerEnv.putAll(envVars.asJava)
         workerEnv.put("PYTHONPATH", pythonPath)
@@ -191,7 +192,29 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String
         daemon = pb.start()
 
         val in = new DataInputStream(daemon.getInputStream)
-        daemonPort = in.readInt()
+        try {
+          daemonPort = in.readInt()
+        } catch {
+          case _: EOFException =>
+            throw new SparkException(s"No port number in $daemonModule's stdout")
+        }
+
+        // test that the returned port number is within a valid range.
+        // note: this does not cover the case where the port number
+        // is arbitrary data but is also coincidentally within range
+        if (daemonPort < 1 || daemonPort > 0xffff) {
+          val exceptionMessage = f"""
+            |Bad data in $daemonModule's standard output. Invalid port number:
+            |  $daemonPort (0x$daemonPort%08x)
+            |Python command to execute the daemon was:
+            |  ${command.asScala.mkString(" ")}
+            |Check that you don't have any unexpected modules or libraries in
+            |your PYTHONPATH:
+            |  $pythonPath
+            |Also, check if you have a sitecustomize.py module in your python path,
+            |or in your python installation, that is printing to standard output"""
+          throw new SparkException(exceptionMessage.stripMargin)
+        }
 
         // Redirect daemon stdout and stderr
         redirectStreamsToStderr(in, daemon.getErrorStream)

core/src/main/scala/org/apache/spark/scheduler/BlacklistTracker.scala

Lines changed: 2 additions & 1 deletion
@@ -152,7 +152,8 @@ private[scheduler] class BlacklistTracker (
       case Some(a) =>
         logInfo(s"Killing blacklisted executor id $exec " +
           s"since ${config.BLACKLIST_KILL_ENABLED.key} is set.")
-        a.killExecutors(Seq(exec), true, true)
+        a.killExecutors(Seq(exec), adjustTargetNumExecutors = false, countFailures = false,
+          force = true)
       case None =>
         logWarning(s"Not attempting to kill blacklisted executor id $exec " +
           s"since allocation client is not defined.")

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 12 additions & 10 deletions
@@ -147,7 +147,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
 
     case KillExecutorsOnHost(host) =>
       scheduler.getExecutorsAliveOnHost(host).foreach { exec =>
-        killExecutors(exec.toSeq, replace = true, force = true)
+        killExecutors(exec.toSeq, adjustTargetNumExecutors = false, countFailures = false,
+          force = true)
       }
 
     case UpdateDelegationTokens(newDelegationTokens) =>
@@ -584,18 +585,18 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
   /**
    * Request that the cluster manager kill the specified executors.
    *
-   * When asking the executor to be replaced, the executor loss is considered a failure, and
-   * killed tasks that are running on the executor will count towards the failure limits. If no
-   * replacement is being requested, then the tasks will not count towards the limit.
-   *
    * @param executorIds identifiers of executors to kill
-   * @param replace whether to replace the killed executors with new ones, default false
+   * @param adjustTargetNumExecutors whether the target number of executors be adjusted down
+   *                                 after these executors have been killed
+   * @param countFailures if there are tasks running on the executors when they are killed, whether
+   *                      those failures be counted to task failure limits?
    * @param force whether to force kill busy executors, default false
    * @return the ids of the executors acknowledged by the cluster manager to be removed.
    */
   final override def killExecutors(
       executorIds: Seq[String],
-      replace: Boolean,
+      adjustTargetNumExecutors: Boolean,
+      countFailures: Boolean,
       force: Boolean): Seq[String] = {
     logInfo(s"Requesting to kill executor(s) ${executorIds.mkString(", ")}")
 
@@ -610,20 +611,21 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
       val executorsToKill = knownExecutors
         .filter { id => !executorsPendingToRemove.contains(id) }
         .filter { id => force || !scheduler.isExecutorBusy(id) }
-      executorsToKill.foreach { id => executorsPendingToRemove(id) = !replace }
+      executorsToKill.foreach { id => executorsPendingToRemove(id) = !countFailures }
 
       logInfo(s"Actual list of executor(s) to be killed is ${executorsToKill.mkString(", ")}")
 
       // If we do not wish to replace the executors we kill, sync the target number of executors
       // with the cluster manager to avoid allocating new ones. When computing the new target,
       // take into account executors that are pending to be added or removed.
       val adjustTotalExecutors =
-        if (!replace) {
+        if (adjustTargetNumExecutors) {
           requestedTotalExecutors = math.max(requestedTotalExecutors - executorsToKill.size, 0)
           if (requestedTotalExecutors !=
             (numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size)) {
             logDebug(
-              s"""killExecutors($executorIds, $replace, $force): Executor counts do not match:
+              s"""killExecutors($executorIds, $adjustTargetNumExecutors, $countFailures, $force):
+                 |Executor counts do not match:
                  |requestedTotalExecutors = $requestedTotalExecutors
                  |numExistingExecutors = $numExistingExecutors
                  |numPendingExecutors = $numPendingExecutors

core/src/main/scala/org/apache/spark/status/AppStatusListener.scala

Lines changed: 4 additions & 1 deletion
@@ -915,7 +915,10 @@ private[spark] class AppStatusListener(
       return
     }
 
-    val view = kvstore.view(classOf[StageDataWrapper]).index("completionTime").first(0L)
+    // As the completion time of a skipped stage is always -1, we will remove skipped stages first.
+    // This is safe since the job itself contains enough information to render skipped stages in the
+    // UI.
+    val view = kvstore.view(classOf[StageDataWrapper]).index("completionTime")
     val stages = KVUtils.viewToSeq(view, countToDelete.toInt) { s =>
       s.info.status != v1.StageStatus.ACTIVE && s.info.status != v1.StageStatus.PENDING
     }

core/src/main/scala/org/apache/spark/status/AppStatusStore.scala

Lines changed: 5 additions & 1 deletion
@@ -95,7 +95,11 @@ private[spark] class AppStatusStore(
   }
 
   def lastStageAttempt(stageId: Int): v1.StageData = {
-    val it = store.view(classOf[StageDataWrapper]).index("stageId").reverse().first(stageId)
+    val it = store.view(classOf[StageDataWrapper])
+      .index("stageId")
+      .reverse()
+      .first(stageId)
+      .last(stageId)
       .closeableIterator()
     try {
       if (it.hasNext()) {
