-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-34764][CORE][K8S][UI] Propagate reason for exec loss to Web UI #32436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
b8eb7be
d46d3cd
b5db580
ae77e33
dee0963
4e478a0
6ebcc55
433ee83
19355d4
e62e3b1
a4263cd
3ab5a09
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -217,14 +217,42 @@ private[spark] class ExecutorPodsLifecycleManager( | |
| ExecutorExited(exitCode, exitCausedByApp, exitMessage) | ||
| } | ||
|
|
||
| // A utility function to try and help people figure out what's gone wrong faster. | ||
| private def describeExitCode(code: Int): String = { | ||
attilapiros marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| val humanStr = code match { | ||
| case 0 => "(success)" | ||
| case 1 => "(generic, look at logs to clarify)" | ||
| case 42 => "(douglas adams)" | ||
attilapiros marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // Spark specific | ||
| case 10 => "(Uncaught exception)" | ||
holdenk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| case 50 => "(Uncaught exception)" | ||
| case 52 => "(JVM OOM)" | ||
| case 53 => "(DiskStore failed to create temp dir)" | ||
| // K8s & JVM specific exit codes | ||
| case 126 => "(not executable - possibly perm or arch)" | ||
| case 137 => "(SIGKILL, possible container OOM)" | ||
| case 139 => "(SIGSEGV: that's unexpected)" | ||
| case 255 => "(exit-1, your guess is as good as mine)" | ||
|
||
| case _ => "" | ||
holdenk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
| s"${code}${humanStr}" | ||
| } | ||
|
|
||
| private def exitReasonMessage(podState: FinalPodState, execId: Long, exitCode: Int) = { | ||
| val pod = podState.pod | ||
| val reason = Option(pod.getStatus.getReason) | ||
| val message = Option(pod.getStatus.getMessage) | ||
| val explained = describeExitCode(exitCode) | ||
| val exitMsg = s"The executor with id $execId exited with exit code $explained." | ||
| val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}") | ||
| val msgStr = message.map(m => s"The API gave the following message: ${m}") | ||
|
|
||
|
|
||
| s""" | ||
| |The executor with id $execId exited with exit code $exitCode. | ||
| |The API gave the following brief reason: ${reason.getOrElse("N/A")} | ||
| |The API gave the following message: ${message.getOrElse("N/A")} | ||
| |${exitMsg} | ||
| |${reasonStr.getOrElse("")} | ||
| |${msgStr.getOrElse("")} | ||
| | | ||
| |The API gave the following container statuses: | ||
| | | ||
| |${containersDescription(pod)} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this always empty in non-K8s resource managers?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, the exec loss reason is populated for YARN as well :)