[SPARK-31486] [CORE] spark.submit.waitAppCompletion flag to control spark-submit exit in Standalone Cluster Mode #28258
@@ -17,6 +17,8 @@
 package org.apache.spark.deploy
 
+import java.util.concurrent.TimeUnit
+
 import scala.collection.mutable.HashSet
 import scala.concurrent.ExecutionContext
 import scala.reflect.ClassTag
@@ -61,6 +63,11 @@ private class ClientEndpoint(
   private val lostMasters = new HashSet[RpcAddress]
   private var activeMasterEndpoint: RpcEndpointRef = null
+  private val waitAppCompletion = conf.getBoolean("spark.standalone.submit.waitAppCompletion",
+    false)
+  private val REPORT_DRIVER_STATUS_INTERVAL = 10000
+  private var submittedDriverID = ""
+
   private def getProperty(key: String, conf: SparkConf): Option[String] = {
     sys.props.get(key).orElse(conf.getOption(key))
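The flag defaults to false, so existing deployments keep the current fire-and-forget behavior unless they opt in. Below is a minimal, self-contained sketch of how the value is resolved; `conf` here is a fresh SparkConf rather than the endpoint's field:

    import org.apache.spark.SparkConf

    // Resolve the new flag the same way the endpoint does: absent any user
    // setting, it falls back to false, i.e. exit right after submission.
    val conf = new SparkConf()
    val waitAppCompletion =
      conf.getBoolean("spark.standalone.submit.waitAppCompletion", false)

A user would typically enable it at submit time, e.g. by passing --conf spark.standalone.submit.waitAppCompletion=true on the spark-submit command line.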
@@ -105,6 +112,10 @@ private class ClientEndpoint(
         asyncSendToMasterAndForwardReply[SubmitDriverResponse](
           RequestSubmitDriver(driverDescription))
 
+        forwardMessageThread.scheduleAtFixedRate(() => Utils.tryLogNonFatalError {
+          MonitorDriverStatus()
+        }, 0, REPORT_DRIVER_STATUS_INTERVAL, TimeUnit.MILLISECONDS)
+
       case "kill" =>
         val driverId = driverArgs.driverId
         asyncSendToMasterAndForwardReply[KillDriverResponse](RequestKillDriver(driverId))
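The monitor reuses forwardMessageThread, the client's existing scheduled executor, so no new thread is introduced. The following stand-alone sketch shows the same fixed-rate pattern, assuming a plain single-threaded ScheduledExecutorService in place of forwardMessageThread and a println in place of the RPC call:

    import java.util.concurrent.{Executors, TimeUnit}

    // Fixed-rate polling: the first run fires immediately (initial delay 0),
    // then every REPORT_DRIVER_STATUS_INTERVAL milliseconds (10 s) thereafter.
    val scheduler = Executors.newSingleThreadScheduledExecutor()
    scheduler.scheduleAtFixedRate(
      () => println("polling driver status ..."), // stand-in for MonitorDriverStatus()
      0, 10000, TimeUnit.MILLISECONDS)

In this sketch the scheduler would need an explicit shutdown; in the PR the task simply runs until the client JVM exits.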
@@ -123,16 +134,24 @@ private class ClientEndpoint(
       }(forwardMessageExecutionContext)
     }
   }
+  private def MonitorDriverStatus(): Unit = {
+    if (submittedDriverID != "") {
+      asyncSendToMasterAndForwardReply[DriverStatusResponse](RequestDriverStatus(submittedDriverID))
+    }
+  }
 
-  /* Find out driver status then exit the JVM */
+  /**
+   * Find out driver status then exit the JVM. If the waitAppCompletion is set to true, monitors
+   * the application until it finishes, fails or is killed.
+   */
   def pollAndReportStatus(driverId: String): Unit = {
     // Since ClientEndpoint is the only RpcEndpoint in the process, blocking the event loop thread
     // is fine.
     logInfo("... waiting before polling master for driver state")
     Thread.sleep(5000)
     logInfo("... polling master for driver state")
     val statusResponse =
       activeMasterEndpoint.askSync[DriverStatusResponse](RequestDriverStatus(driverId))
     if (statusResponse.found) {
       logInfo(s"State of $driverId is ${statusResponse.state.get}")
       // Worker node, if present
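Since the timer starts with zero initial delay, polls can fire before the master has acknowledged the submission; the empty-ID guard turns those early runs into no-ops until SubmitDriverResponse delivers a driver ID. A stripped-down sketch of the guard, with stand-ins for the endpoint's members:

    // The periodic task does nothing until a driver ID has been recorded.
    var submittedDriverID: String = ""

    def monitorDriverStatus(): Unit = {
      if (submittedDriverID != "") {
        println(s"requesting status of driver $submittedDriverID") // stand-in for the RPC
      }
    }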
@@ -148,20 +167,27 @@ private class ClientEndpoint(
           e.printStackTrace()
           System.exit(-1)
         case _ =>
-          System.exit(0)
+          if (!waitAppCompletion) {
+            logInfo(s"spark-submit not configured to wait for completion, " +
+              s"exiting spark-submit JVM.")
+            System.exit(0)
+          } else {
+            logInfo(s"spark-submit is configured to wait for completion, " +
+              s"continue monitoring driver status.")
+          }
       }
     } else {
       logError(s"ERROR: Cluster master did not recognize $driverId")
       System.exit(-1)
     }
   }
 
   override def receive: PartialFunction[Any, Unit] = {
 
     case SubmitDriverResponse(master, success, driverId, message) =>
       logInfo(message)
       if (success) {
         activeMasterEndpoint = master
+        submittedDriverID = driverId.get
         pollAndReportStatus(driverId.get)
       } else if (!Utils.responseFromBackup(message)) {
         System.exit(-1)
@@ -176,6 +202,22 @@ private class ClientEndpoint(
       } else if (!Utils.responseFromBackup(message)) {
         System.exit(-1)
       }
 
+    case DriverStatusResponse(found, state, _, _, _) =>
+      if (found) {
+        state.get match {
+          case DriverState.FINISHED | DriverState.FAILED |
+            DriverState.ERROR | DriverState.KILLED =>
+            logInfo(s"State of driver $submittedDriverID is ${state.get}, " +
+              s"exiting spark-submit JVM.")
+            System.exit(0)
+          case _ =>
+            logDebug(s"State of driver $submittedDriverID is ${state.get}, " +
+              s"continue monitoring driver status.")
+        }
+      } else {
+        System.exit(-1)
+      }
   }
 
   override def onDisconnected(remoteAddress: RpcAddress): Unit = {
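The new DriverStatusResponse handler is what lets spark-submit outlive submission: it keeps the JVM alive while the driver is running and exits once a terminal state is reached (or with a nonzero code if the driver is no longer known to the master). A self-contained sketch of that terminal-state test, using a local enumeration that mirrors a subset of org.apache.spark.deploy.master.DriverState, since the real one is package-private in Spark:

    // Illustration-only mirror of the relevant DriverState values.
    object DriverState extends Enumeration {
      val SUBMITTED, RUNNING, FINISHED, FAILED, ERROR, KILLED = Value
    }

    // Keep polling while the driver is SUBMITTED or RUNNING; stop once it has
    // finished, failed, errored, or been killed.
    def isTerminal(state: DriverState.Value): Boolean = state match {
      case DriverState.FINISHED | DriverState.FAILED |
           DriverState.ERROR | DriverState.KILLED => true
      case _ => false
    }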