Skip to content

Commit d24f6b5

Browse files
authored
fix: if pod fails without container termination, don't mark node succeeded always (cherry-pick #15150 for 3.7) (#15155)
Signed-off-by: Alan Clucas <[email protected]>
1 parent faf5933 commit d24f6b5

File tree

2 files changed

+155
-6
lines changed

2 files changed

+155
-6
lines changed

workflow/controller/operator.go

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1683,6 +1683,12 @@ func (woc *wfOperationCtx) inferFailedReason(pod *apiv1.Pod, tmpl *wfv1.Template
16831683
return wfv1.NodeFailed, fmt.Sprintf("can't find failed message for pod %s namespace %s", pod.Name, pod.Namespace)
16841684
}
16851685

1686+
// Track whether critical containers completed successfully (terminated with exit code 0).
1687+
// We must confirm both to return success—otherwise a pod-level failure (eviction, node death)
1688+
// could be incorrectly reported as success.
1689+
mainContainerSucceeded := false
1690+
waitContainerSucceeded := false
1691+
16861692
for _, ctr := range ctrs {
16871693

16881694
// Virtual Kubelet environment will not set the terminate on waiting container
@@ -1694,11 +1700,15 @@ func (woc *wfOperationCtx) inferFailedReason(pod *apiv1.Pod, tmpl *wfv1.Template
16941700
}
16951701
t := ctr.State.Terminated
16961702
if t == nil {
1697-
// We should never get here
16981703
log.Warnf("Pod %s phase was Failed but %s did not have terminated state", pod.Name, ctr.Name)
16991704
continue
17001705
}
17011706
if t.ExitCode == 0 {
1707+
if tmpl.IsMainContainerName(ctr.Name) {
1708+
mainContainerSucceeded = true
1709+
} else if ctr.Name == common.WaitContainerName {
1710+
waitContainerSucceeded = true
1711+
}
17021712
continue
17031713
}
17041714

@@ -1727,11 +1737,22 @@ func (woc *wfOperationCtx) inferFailedReason(pod *apiv1.Pod, tmpl *wfv1.Template
17271737
}
17281738
}
17291739

1730-
// If we get here, we have detected that the main/wait containers succeed but the sidecar(s)
1731-
// were SIGKILL'd. The executor may have had to forcefully terminate the sidecar (kill -9),
1732-
// resulting in a 137 exit code (which we had ignored earlier). If failMessages is empty, it
1733-
// indicates that this is the case and we return Success instead of Failure.
1734-
return wfv1.NodeSucceeded, ""
1740+
// Determine final status based on whether we confirmed main and wait succeeded
1741+
// Slightly convoluted approach to avoid the exhaustive linter getting upset
1742+
if mainContainerSucceeded {
1743+
if waitContainerSucceeded {
1744+
// Both succeeded - sidecars may have been force-killed (137/143), which is fine
1745+
return wfv1.NodeSucceeded, ""
1746+
} else {
1747+
return wfv1.NodeFailed, "pod failed: wait container did not complete successfully"
1748+
}
1749+
} else {
1750+
if waitContainerSucceeded {
1751+
return wfv1.NodeFailed, "pod failed: main container did not complete successfully"
1752+
} else {
1753+
return wfv1.NodeFailed, "pod failed: neither main nor wait container completed successfully"
1754+
}
1755+
}
17351756
}
17361757

17371758
func (woc *wfOperationCtx) createPVCs(ctx context.Context) error {

workflow/controller/operator_test.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7171,6 +7171,134 @@ func TestPodFailureWithContainerWaitingState(t *testing.T) {
71717171
assert.Equal(t, "Pod failed before main container starts due to ContainerCreating: Container is creating", msg)
71727172
}
71737173

7174+
// Test that when containers are in Running state (not Terminated) on a Failed pod,
7175+
// we correctly return Failed instead of incorrectly returning Succeeded.
7176+
// This can happen during node eviction or other pod-level failures.
7177+
var podFailedWithRunningContainers = `
7178+
apiVersion: v1
7179+
kind: Pod
7180+
metadata:
7181+
name: test-pod
7182+
spec:
7183+
containers:
7184+
- name: main
7185+
env:
7186+
- name: ARGO_CONTAINER_NAME
7187+
value: main
7188+
status:
7189+
phase: Failed
7190+
reason: Evicted
7191+
message: "The node was low on resource: memory."
7192+
containerStatuses:
7193+
- name: main
7194+
ready: false
7195+
restartCount: 0
7196+
state:
7197+
running:
7198+
startedAt: "2021-01-22T09:50:16Z"
7199+
- name: wait
7200+
ready: false
7201+
restartCount: 0
7202+
state:
7203+
running:
7204+
startedAt: "2021-01-22T09:50:16Z"
7205+
`
7206+
7207+
func TestPodFailureWithRunningContainers(t *testing.T) {
7208+
var pod apiv1.Pod
7209+
wfv1.MustUnmarshal(podFailedWithRunningContainers, &pod)
7210+
assert.NotNil(t, pod)
7211+
7212+
// Pod has a message, so it should return that
7213+
nodeStatus, msg := newWoc().inferFailedReason(&pod, nil)
7214+
assert.Equal(t, wfv1.NodeFailed, nodeStatus)
7215+
assert.Equal(t, "The node was low on resource: memory.", msg)
7216+
}
7217+
7218+
// Test that when containers don't have terminated state and there's no pod message,
7219+
// we correctly return Failed with information about which containers couldn't be confirmed.
7220+
var podFailedWithRunningContainersNoMessage = `
7221+
apiVersion: v1
7222+
kind: Pod
7223+
metadata:
7224+
name: test-pod
7225+
spec:
7226+
containers:
7227+
- name: main
7228+
env:
7229+
- name: ARGO_CONTAINER_NAME
7230+
value: main
7231+
status:
7232+
phase: Failed
7233+
containerStatuses:
7234+
- name: main
7235+
ready: false
7236+
restartCount: 0
7237+
state:
7238+
running:
7239+
startedAt: "2021-01-22T09:50:16Z"
7240+
- name: wait
7241+
ready: false
7242+
restartCount: 0
7243+
state:
7244+
running:
7245+
startedAt: "2021-01-22T09:50:16Z"
7246+
`
7247+
7248+
func TestPodFailureWithRunningContainersNoMessage(t *testing.T) {
7249+
var pod apiv1.Pod
7250+
wfv1.MustUnmarshal(podFailedWithRunningContainersNoMessage, &pod)
7251+
assert.NotNil(t, pod)
7252+
nodeStatus, msg := newWoc().inferFailedReason(&pod, nil)
7253+
assert.Equal(t, wfv1.NodeFailed, nodeStatus)
7254+
assert.Equal(t, "pod failed: neither main nor wait container completed successfully", msg)
7255+
}
7256+
7257+
// Test that sidecar SIGKILL still results in success when main and wait succeeded
7258+
var podFailedWithSidecarSigkill = `
7259+
apiVersion: v1
7260+
kind: Pod
7261+
metadata:
7262+
name: test-pod
7263+
spec:
7264+
containers:
7265+
- name: main
7266+
env:
7267+
- name: ARGO_CONTAINER_NAME
7268+
value: main
7269+
status:
7270+
phase: Failed
7271+
containerStatuses:
7272+
- name: main
7273+
ready: false
7274+
state:
7275+
terminated:
7276+
exitCode: 0
7277+
reason: Completed
7278+
- name: wait
7279+
ready: false
7280+
state:
7281+
terminated:
7282+
exitCode: 0
7283+
reason: Completed
7284+
- name: sidecar
7285+
ready: false
7286+
state:
7287+
terminated:
7288+
exitCode: 137
7289+
reason: Error
7290+
`
7291+
7292+
func TestPodFailureWithSidecarSigkill(t *testing.T) {
7293+
var pod apiv1.Pod
7294+
wfv1.MustUnmarshal(podFailedWithSidecarSigkill, &pod)
7295+
assert.NotNil(t, pod)
7296+
nodeStatus, msg := newWoc().inferFailedReason(&pod, nil)
7297+
// Main and wait succeeded, sidecar was SIGKILL'd - this should be success
7298+
assert.Equal(t, wfv1.NodeSucceeded, nodeStatus)
7299+
assert.Empty(t, msg)
7300+
}
7301+
71747302
var podWithWaitContainerOOM = `
71757303
apiVersion: v1
71767304
kind: Pod

0 commit comments

Comments
 (0)