Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,12 @@ func (woc *wfOperationCtx) inferFailedReason(ctx context.Context, pod *apiv1.Pod
return wfv1.NodeFailed, fmt.Sprintf("can't find failed message for pod %s namespace %s", pod.Name, pod.Namespace)
}

// Track whether critical containers completed successfully (terminated with exit code 0).
// We must confirm both to return success—otherwise a pod-level failure (eviction, node death)
// could be incorrectly reported as success.
mainContainerSucceeded := false
waitContainerSucceeded := false

for _, ctr := range ctrs {

// Virtual Kubelet environment will not set the terminate on waiting container
Expand All @@ -1692,11 +1698,15 @@ func (woc *wfOperationCtx) inferFailedReason(ctx context.Context, pod *apiv1.Pod
}
t := ctr.State.Terminated
if t == nil {
// We should never get here
woc.log.WithFields(logging.Fields{"podName": pod.Name, "containerName": ctr.Name}).Warn(ctx, "Pod phase was Failed but container did not have terminated state")
continue
}
if t.ExitCode == 0 {
if tmpl.IsMainContainerName(ctr.Name) {
mainContainerSucceeded = true
} else if ctr.Name == common.WaitContainerName {
waitContainerSucceeded = true
}
continue
}

Expand Down Expand Up @@ -1725,11 +1735,22 @@ func (woc *wfOperationCtx) inferFailedReason(ctx context.Context, pod *apiv1.Pod
}
}

// If we get here, we have detected that the main/wait containers succeed but the sidecar(s)
// were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9),
// resulting in a 137 exit code (which we had ignored earlier). If failMessages is empty, it
// indicates that this is the case and we return Success instead of Failure.
return wfv1.NodeSucceeded, ""
// Determine final status based on whether we confirmed main and wait succeeded
// Slightly convulted approach to avoid the exhaustive linter getting upset
if mainContainerSucceeded {
if waitContainerSucceeded {
// Both succeeded - sidecars may have been force-killed (137/143), which is fine
return wfv1.NodeSucceeded, ""
} else {
return wfv1.NodeFailed, "pod failed: wait container did not complete successfully"
}
} else {
if waitContainerSucceeded {
return wfv1.NodeFailed, "pod failed: main container did not complete successfully"
} else {
return wfv1.NodeFailed, "pod failed: neither main nor wait container completed successfully"
}
}
}

func (woc *wfOperationCtx) createPVCs(ctx context.Context) error {
Expand Down
130 changes: 130 additions & 0 deletions workflow/controller/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7186,6 +7186,136 @@ func TestPodFailureWithContainerWaitingState(t *testing.T) {
assert.Equal(t, "Pod failed before main container starts due to ContainerCreating: Container is creating", msg)
}

// Test that when containers are in Running state (not Terminated) on a Failed pod,
// we correctly return Failed instead of incorrectly returning Succeeded.
// This can happen during node eviction or other pod-level failures.
var podFailedWithRunningContainers = `
apiVersion: v1
kind: Pod
metadata:
name: test-pod
spec:
containers:
- name: main
env:
- name: ARGO_CONTAINER_NAME
value: main
status:
phase: Failed
reason: Evicted
message: "The node was low on resource: memory."
containerStatuses:
- name: main
ready: false
restartCount: 0
state:
running:
startedAt: "2021-01-22T09:50:16Z"
- name: wait
ready: false
restartCount: 0
state:
running:
startedAt: "2021-01-22T09:50:16Z"
`

func TestPodFailureWithRunningContainers(t *testing.T) {
var pod apiv1.Pod
wfv1.MustUnmarshal(podFailedWithRunningContainers, &pod)
assert.NotNil(t, pod)
ctx := logging.TestContext(t.Context())
// Pod has a message, so it should return that
nodeStatus, msg := newWoc(ctx).inferFailedReason(ctx, &pod, nil)
assert.Equal(t, wfv1.NodeFailed, nodeStatus)
assert.Equal(t, "The node was low on resource: memory.", msg)
}

// Test that when containers don't have terminated state and there's no pod message,
// we correctly return Failed with information about which containers couldn't be confirmed.
var podFailedWithRunningContainersNoMessage = `
apiVersion: v1
kind: Pod
metadata:
name: test-pod
spec:
containers:
- name: main
env:
- name: ARGO_CONTAINER_NAME
value: main
status:
phase: Failed
containerStatuses:
- name: main
ready: false
restartCount: 0
state:
running:
startedAt: "2021-01-22T09:50:16Z"
- name: wait
ready: false
restartCount: 0
state:
running:
startedAt: "2021-01-22T09:50:16Z"
`

func TestPodFailureWithRunningContainersNoMessage(t *testing.T) {
var pod apiv1.Pod
wfv1.MustUnmarshal(podFailedWithRunningContainersNoMessage, &pod)
assert.NotNil(t, pod)
ctx := logging.TestContext(t.Context())
nodeStatus, msg := newWoc(ctx).inferFailedReason(ctx, &pod, nil)
assert.Equal(t, wfv1.NodeFailed, nodeStatus)
assert.Equal(t, "pod failed: neither main nor wait container completed successfully", msg)
}

// Test that sidecar SIGKILL still results in success when main and wait succeeded
var podFailedWithSidecarSigkill = `
apiVersion: v1
kind: Pod
metadata:
name: test-pod
spec:
containers:
- name: main
env:
- name: ARGO_CONTAINER_NAME
value: main
status:
phase: Failed
containerStatuses:
- name: main
ready: false
state:
terminated:
exitCode: 0
reason: Completed
- name: wait
ready: false
state:
terminated:
exitCode: 0
reason: Completed
- name: sidecar
ready: false
state:
terminated:
exitCode: 137
reason: Error
`

func TestPodFailureWithSidecarSigkill(t *testing.T) {
var pod apiv1.Pod
wfv1.MustUnmarshal(podFailedWithSidecarSigkill, &pod)
assert.NotNil(t, pod)
ctx := logging.TestContext(t.Context())
nodeStatus, msg := newWoc(ctx).inferFailedReason(ctx, &pod, nil)
// Main and wait succeeded, sidecar was SIGKILL'd - this should be success
assert.Equal(t, wfv1.NodeSucceeded, nodeStatus)
assert.Empty(t, msg)
}

var podWithWaitContainerOOM = `
apiVersion: v1
kind: Pod
Expand Down
Loading