[SPARK-31486] [CORE] spark.submit.waitAppCompletion flag to control spark-submit exit in Standalone Cluster Mode #28258
@@ -61,6 +61,9 @@ private class ClientEndpoint(

   private val lostMasters = new HashSet[RpcAddress]
   private var activeMasterEndpoint: RpcEndpointRef = null
+  private val waitAppCompletion = conf.getBoolean("spark.submit.waitAppCompletion", false)
+  private val REPORT_DRIVER_STATUS_INTERVAL = 1000

   private def getProperty(key: String, conf: SparkConf): Option[String] = {
     sys.props.get(key).orElse(conf.getOption(key))
@@ -124,38 +127,57 @@ private class ClientEndpoint(
     }
   }

-  /* Find out driver status then exit the JVM */
+  /**
+   * Find out driver status then exit the JVM. If waitAppCompletion is set to true, monitors
+   * the application until it finishes, fails or is killed.
+   */
   def pollAndReportStatus(driverId: String): Unit = {
     // Since ClientEndpoint is the only RpcEndpoint in the process, blocking the event loop thread
     // is fine.
     logInfo("... waiting before polling master for driver state")
     Thread.sleep(5000)
     logInfo("... polling master for driver state")
-    val statusResponse =
-      activeMasterEndpoint.askSync[DriverStatusResponse](RequestDriverStatus(driverId))
-    if (statusResponse.found) {
-      logInfo(s"State of $driverId is ${statusResponse.state.get}")
-      // Worker node, if present
-      (statusResponse.workerId, statusResponse.workerHostPort, statusResponse.state) match {
-        case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) =>
-          logInfo(s"Driver running on $hostPort ($id)")
-        case _ =>
-      }
-      // Exception, if present
-      statusResponse.exception match {
-        case Some(e) =>
-          logError(s"Exception from cluster was: $e")
-          e.printStackTrace()
-          System.exit(-1)
-        case _ =>
-          System.exit(0)
-      }
-    } else {
-      logError(s"ERROR: Cluster master did not recognize $driverId")
-      System.exit(-1)
-    }
+    while (true) {
+      val statusResponse =
+        activeMasterEndpoint.askSync[DriverStatusResponse](RequestDriverStatus(driverId))
+      if (statusResponse.found) {
+        logInfo(s"State of $driverId is ${statusResponse.state.get}")
+        // Worker node, if present
+        (statusResponse.workerId, statusResponse.workerHostPort, statusResponse.state) match {
+          case (Some(id), Some(hostPort), Some(DriverState.RUNNING)) =>
+            logInfo(s"Driver running on $hostPort ($id)")
+          case _ =>
+        }
+        // Exception, if present
+        statusResponse.exception match {
+          case Some(e) =>
+            logError(s"Exception from cluster was: $e")
+            e.printStackTrace()
+            System.exit(-1)
+          case _ =>
+            if (!waitAppCompletion) {
+              logInfo(s"No exception found and waitAppCompletion is false, " +
+                s"exiting spark-submit JVM.")
+              System.exit(0)
+            } else if (statusResponse.state.get == DriverState.FINISHED ||
+              statusResponse.state.get == DriverState.FAILED ||
+              statusResponse.state.get == DriverState.ERROR ||
+              statusResponse.state.get == DriverState.KILLED) {
+              logInfo(s"waitAppCompletion is true, state is ${statusResponse.state.get}, " +
+                s"exiting spark-submit JVM.")
+              System.exit(0)
+            } else {
+              logTrace(s"waitAppCompletion is true, state is ${statusResponse.state.get}, " +
+                s"continue monitoring driver status.")
+            }
+        }
+      } else {
+        logError(s"ERROR: Cluster master did not recognize $driverId")
+        System.exit(-1)
+      }
+      Thread.sleep(REPORT_DRIVER_STATUS_INTERVAL)
+    }
   }

   override def receive: PartialFunction[Any, Unit] = {

     case SubmitDriverResponse(master, success, driverId, message) =>
@@ -240,6 +240,16 @@ SPARK_MASTER_OPTS supports the following system properties:
   </td>
   <td>1.6.3</td>
 </tr>
+<tr>
+  <td><code>spark.submit.waitAppCompletion</code></td>
+  <td><code>true</code></td>
+  <td>
+    In Standalone cluster mode, controls whether the client waits to exit until the application completes.
+    If set to <code>true</code>, the client process will stay alive reporting the application's status.
+    Otherwise, the client process will exit after submission.
+  </td>
+  <td>3.1.0</td>
+</tr>
 <tr>
   <td><code>spark.worker.timeout</code></td>
   <td>60</td>
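As a usage note, a cluster-mode submission enabling this behaviour would look like the command used later in this review, shown here with the property name as it appears in this diff (the review thread itself uses spark.standalone.submit.waitAppCompletion): ./bin/spark-submit --master spark://127.0.0.1:7077 --conf spark.submit.waitAppCompletion=true --deploy-mode cluster --class org.apache.spark.examples.SparkPi examples/target/original-spark-examples_2.12-3.1.0-SNAPSHOT.jar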
Ngone51: This could block ClientEndpoint when waitAppCompletion=true?

Ngone51: Hey, please pay attention to my comment here. I believe the current implementation could block ClientEndpoint because it's a ThreadSafeRpcEndpoint. When enabling waitAppCompletion, ClientEndpoint would actually keep handling the message SubmitDriverResponse until the application finished. So ClientEndpoint is unable to handle other messages, e.g. RemoteProcessDisconnected or RemoteProcessConnectionError, at the same time, which breaks the current behaviour. Furthermore, it could also block messages from backup masters, though that is not fatal in this case.
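To make the concern concrete, here is a minimal plain-Scala sketch (not Spark's actual RPC code; the message names are reused for illustration only) of why a handler that loops forever starves a single-threaded endpoint's inbox:

    import java.util.concurrent.{Executors, LinkedBlockingQueue, TimeUnit}

    // A ThreadSafeRpcEndpoint processes messages one at a time, so a handler
    // that never returns blocks every message queued behind it.
    object BlockingEndpointDemo {
      def main(args: Array[String]): Unit = {
        val inbox = new LinkedBlockingQueue[String]()
        val loop = Executors.newSingleThreadExecutor()

        loop.submit(new Runnable {
          def run(): Unit = while (true) {
            inbox.take() match {
              case "SubmitDriverResponse" =>
                // Simulates pollAndReportStatus with waitAppCompletion=true:
                // the handler never returns, so later messages are never dequeued.
                while (true) { Thread.sleep(1000) }
              case other =>
                println(s"handled $other") // never reached once the loop above starts
            }
          }
        })

        inbox.put("SubmitDriverResponse")
        inbox.put("RemoteProcessDisconnected") // sits in the queue forever
        TimeUnit.SECONDS.sleep(3)
        println(s"undelivered messages: ${inbox.size()}") // prints 1
        loop.shutdownNow()
      }
    }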
akshatb1: @Ngone51 Apologies, somehow missed this comment. How can I quickly verify this? I am looking into this. Could you kindly suggest if you have any pointers on how this can be fixed?

Ngone51: We can periodically send a message (e.g. we can send it after Thread.sleep(REPORT_DRIVER_STATUS_INTERVAL)) to ClientEndpoint itself to check the driver's status.
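A minimal sketch of that suggestion in the same plain-Scala style (again, not Spark's RpcEndpoint API; CheckDriverStatus is a hypothetical message name): each handler returns immediately and re-enqueues a status check after the report interval, so other messages can interleave:

    import java.util.concurrent.{Executors, LinkedBlockingQueue, TimeUnit}

    object SelfPollingEndpointDemo {
      case object CheckDriverStatus

      def main(args: Array[String]): Unit = {
        val inbox = new LinkedBlockingQueue[Any]()
        val loop = Executors.newSingleThreadExecutor()
        val scheduler = Executors.newSingleThreadScheduledExecutor()

        loop.submit(new Runnable {
          def run(): Unit = while (true) {
            inbox.take() match {
              case CheckDriverStatus =>
                // Poll once, then return: re-send to self after the interval
                // instead of sleeping inside the handler.
                println("polled driver status")
                scheduler.schedule(new Runnable {
                  def run(): Unit = inbox.put(CheckDriverStatus)
                }, 1000, TimeUnit.MILLISECONDS)
              case other =>
                println(s"handled $other") // now interleaves with status polls
            }
          }
        })

        inbox.put(CheckDriverStatus)
        inbox.put("RemoteProcessDisconnected") // handled promptly this time
        TimeUnit.SECONDS.sleep(3)
        scheduler.shutdownNow()
        loop.shutdownNow()
      }
    }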
Ngone51: A possible way to verify this is to launch a long-running application, then shut down the Master in the middle and see whether onDisconnected is called immediately.
akshatb1: @Ngone51 I launched a long-running application with the flag enabled and disabled, and stopped the Spark Master in the middle. In both cases, I see the following in the driver logs, and I couldn't find any difference between them. The onDisconnected method from StandaloneAppClient.scala is getting called: (log excerpt did not load)
akshatb1: @Ngone51 Thanks for this suggestion. Just to confirm, are you suggesting to do this at line 180 in the pollAndReportStatus method, or should we handle this outside? (screenshot did not load)
Ngone51: Hi @akshatb1, the logs are from StandaloneAppClient$ClientEndpoint and StandaloneSchedulerBackend rather than org.apache.spark.deploy.ClientEndpoint. Can you check again? I think just after line 180 should be ok.
akshatb1: @Ngone51 Yes, I am not sure about the logs from StandaloneAppClient$ClientEndpoint. I will check again. This is the command I am using to submit jobs:

./bin/spark-submit --master spark://127.0.0.1:7077 --conf spark.standalone.submit.waitAppCompletion=true --deploy-mode cluster --class org.apache.spark.examples.SparkPi examples/target/original-spark-examples_2.12-3.1.0-SNAPSHOT.jar
akshatb1: Hi @Ngone51, I tried putting periodic messages in the loop in pollAndReportStatus, but it doesn't seem to receive messages until the loop sending them has completed (checked with a for loop; it would be stuck in an infinite loop in the case of the current while(true) loop). Hence, I have implemented it based on sending an async message from the pollAndReportStatus method and, if need be, sending the message again or exiting while receiving the message. Please let me know what you think of this approach. I have tested the common scenarios, and I could see the onNetworkError method getting called on shutting down the Spark Master while an application is running.
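As a rough illustration of that final approach (a plain-Scala sketch with hypothetical names like MonitorDriverStatus and askDriverState, not the PR's actual code), the receive side can decide on each self-message whether to re-send it or exit, using the terminal states from the diff above; in the real endpoint the re-send would be scheduled asynchronously rather than slept, as in the earlier sketch:

    import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}

    object AsyncMonitorDemo {
      case object MonitorDriverStatus
      val terminalStates = Set("FINISHED", "FAILED", "ERROR", "KILLED")

      def main(args: Array[String]): Unit = {
        val inbox = new LinkedBlockingQueue[Any]()
        val states = Iterator("SUBMITTED", "RUNNING", "RUNNING", "FINISHED")

        // Stand-in for asking the Master for the driver's current state.
        def askDriverState(): String = states.next()

        inbox.put(MonitorDriverStatus) // what pollAndReportStatus would send once

        while (true) {
          inbox.take() match {
            case MonitorDriverStatus =>
              val state = askDriverState()
              println(s"driver state: $state")
              if (terminalStates(state)) {
                println("terminal state reached, exiting spark-submit JVM")
                return
              }
              // Not terminal: wait one interval, then ask again via a self-message.
              TimeUnit.MILLISECONDS.sleep(1000)
              inbox.put(MonitorDriverStatus)
            case other => println(s"handled $other")
          }
        }
      }
    }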