Skip to content
Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ <h4 class="title-table">Executors</h4>
Shuffle Write</span></th>
<th>Logs</th>
<th>Thread Dump</th>
<th>Exec Loss Reason</th>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this empty always in non-K8s resource managers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No exec loss reason is populated for YARN as well :)

</tr>
</thead>
<tbody>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@ function getThreadDumpEnabled() {
return threadDumpEnabled;
}

function formatLossReason(status, type, row) {
if (row.removeReason) {
return row.removeReason
} else {
return ""
}
}

function formatStatus(status, type, row) {
if (row.isExcluded) {
return "Excluded";
Expand Down Expand Up @@ -126,7 +134,7 @@ function totalDurationColor(totalGCTime, totalDuration) {
}

var sumOptionalColumns = [3, 4];
var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14];
var execOptionalColumns = [5, 6, 7, 8, 9, 10, 13, 14, 15];
var execDataTable;
var sumDataTable;

Expand Down Expand Up @@ -535,7 +543,12 @@ $(document).ready(function () {
data: 'id', render: function (data, type) {
return type === 'display' ? ("<a href='threadDump/?executorId=" + data + "'>Thread Dump</a>" ) : data;
}
}
},
{
data: 'executorLossReason',
render: formatLossReason
},

],
"order": [[0, "asc"]],
"columnDefs": [
Expand Down Expand Up @@ -701,6 +714,7 @@ $(document).ready(function () {
"<div id='direct_mapped_pool_memory' class='direct_mapped_pool_memory-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='10'> Peak Pool Memory Direct / Mapped</div>" +
"<div id='extra_resources' class='resources-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='13'> Resources</div>" +
"<div id='resource_prof_id' class='resource-prof-id-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='14'> Resource Profile Id</div>" +
"<div id='exec_loss_reason' class='exec-loss-reason-checkbox-div'><input type='checkbox' class='toggle-vis' data-sum-col-idx='' data-exec-col-idx='15'> Exec Loss Reason</div>" +
"</div>");

reselectCheckboxesBasedOnTaskTableState();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,42 @@ private[spark] class ExecutorPodsLifecycleManager(
ExecutorExited(exitCode, exitCausedByApp, exitMessage)
}

// A utility function to try and help people figure out whats gone wrong faster.
private def describeExitCode(code: Int): String = {
val humanStr = code match {
case 0 => "(success)"
case 1 => "(generic, look at logs to clarify)"
case 42 => "(douglas adams)"
// Spark specific
case 10 => "(Uncaught exception)"
case 50 => "(Uncaught exception)"
case 52 => "(JVM OOM)"
case 53 => "(DiskStore failed to create temp dir)"
// K8s & JVM specific exit codes
case 126 => "(not executable - possibly perm or arch)"
case 137 => "(SIGKILL, possible container OOM)"
case 139 => "(SIGSEGV: that's unexpected)"
case 255 => "(exit-1, your guess is as good as mine)"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I expected we have an error code for Evicted due to out of disk during worker decommission. What is the error code for that, @holdenk ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think it's going to be inconsistent depending on how exactly it shows up (e.g. does the JVM have an uncaught exception trying to write a file or do we exceed the resource quota). So for now I don't have a clear exit code to map to it unfortunately. I could try and add a base handler for IO errors that are uncaught to exit with a specific code, but I'd rather do that in a separate PR.

case _ => ""
}
s"${code}${humanStr}"
}

private def exitReasonMessage(podState: FinalPodState, execId: Long, exitCode: Int) = {
val pod = podState.pod
val reason = Option(pod.getStatus.getReason)
val message = Option(pod.getStatus.getMessage)
val explained = describeExitCode(exitCode)
val exitMsg = s"The executor with id $execId exited with exit code $explained."
val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
val msgStr = message.map(m => s"The API gave the following message: ${m}")


s"""
|The executor with id $execId exited with exit code $exitCode.
|The API gave the following brief reason: ${reason.getOrElse("N/A")}
|The API gave the following message: ${message.getOrElse("N/A")}
|${exitMsg}
|${reasonStr.getOrElse("")}
|${msgStr.getOrElse("")}
|
|The API gave the following container statuses:
|
|${containersDescription(pod)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,7 @@ private[spark] class KubernetesClusterSchedulerBackend(

// Allow removeExecutor to be accessible by ExecutorPodsLifecycleEventHandler
private[k8s] def doRemoveExecutor(executorId: String, reason: ExecutorLossReason): Unit = {
if (isExecutorActive(executorId)) {
removeExecutor(executorId, reason)
}
removeExecutor(executorId, reason)
}

private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
val failedPod = failedExecutorWithoutDeletion(1)
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
val msg = exitReasonMessage(1, failedPod)
val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName)).delete()
Expand All @@ -81,7 +81,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
snapshotsStore.notifySubscribers()
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
val msg = exitReasonMessage(1, failedPod)
val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend, times(1)).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName), times(2)).delete()
Expand Down Expand Up @@ -114,7 +114,7 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
eventHandlerUnderTest.conf.set(Config.KUBERNETES_DELETE_EXECUTORS, false)
snapshotsStore.updatePod(failedPod)
snapshotsStore.notifySubscribers()
val msg = exitReasonMessage(1, failedPod)
val msg = exitReasonMessage(1, failedPod, 1)
val expectedLossReason = ExecutorExited(1, exitCausedByApp = true, msg)
verify(schedulerBackend).doRemoveExecutor("1", expectedLossReason)
verify(namedExecutorPods(failedPod.getMetadata.getName), never()).delete()
Expand All @@ -126,13 +126,41 @@ class ExecutorPodsLifecycleManagerSuite extends SparkFunSuite with BeforeAndAfte
assert(pod.getMetadata().getLabels().get(SPARK_EXECUTOR_INACTIVE_LABEL) === "true")
}

private def exitReasonMessage(failedExecutorId: Int, failedPod: Pod): String = {
// A utility function to try and help people figure out whats gone wrong faster.
private def describeExitCode(code: Int): String = {
val humanStr = code match {
case 0 => "(success)"
case 1 => "(generic, look at logs to clarify)"
case 42 => "(douglas adams)"
// Spark specific
case 10 => "(Uncaught exception)"
case 50 => "(Uncaught exception)"
case 52 => "(JVM OOM)"
case 53 => "(DiskStore failed to create temp dir)"
// K8s & JVM specific exit codes
case 126 => "(not executable - possibly perm or arch)"
case 137 => "(SIGKILL, possible container OOM)"
case 139 => "(SIGSEGV: that's unexpected)"
case 255 => "(exit-1, your guess is as good as mine)"
case _ => ""
}
s"${code}${humanStr}"
}

private def exitReasonMessage(execId: Int, failedPod: Pod, exitCode: Int): String = {
val reason = Option(failedPod.getStatus.getReason)
val message = Option(failedPod.getStatus.getMessage)
val explained = describeExitCode(exitCode)
val exitMsg = s"The executor with id $execId exited with exit code $explained."
val reasonStr = reason.map(r => s"The API gave the following brief reason: ${r}")
val msgStr = message.map(m => s"The API gave the following message: ${m}")


s"""
|The executor with id $failedExecutorId exited with exit code 1.
|The API gave the following brief reason: ${reason.getOrElse("N/A")}
|The API gave the following message: ${message.getOrElse("N/A")}
|${exitMsg}
|${reasonStr.getOrElse("")}
|${msgStr.getOrElse("")}
|
|The API gave the following container statuses:
|
|${containersDescription(failedPod)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn

backend.start()
backend.doRemoveExecutor("1", ExecutorKilled)
verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
verify(driverEndpointRef).send(RemoveExecutor("1", ExecutorKilled))

backend.doRemoveExecutor("2", ExecutorKilled)
verify(driverEndpointRef, never()).send(RemoveExecutor("1", ExecutorKilled))
verify(driverEndpointRef).send(RemoveExecutor("2", ExecutorKilled))
}

test("Kill executors") {
Expand Down