@@ -7171,6 +7171,134 @@ func TestPodFailureWithContainerWaitingState(t *testing.T) {
71717171 assert .Equal (t , "Pod failed before main container starts due to ContainerCreating: Container is creating" , msg )
71727172}
71737173
7174+ // Test that when containers are in Running state (not Terminated) on a Failed pod,
7175+ // we correctly return Failed instead of incorrectly returning Succeeded.
7176+ // This can happen during node eviction or other pod-level failures.
7177+ var podFailedWithRunningContainers = `
7178+ apiVersion: v1
7179+ kind: Pod
7180+ metadata:
7181+ name: test-pod
7182+ spec:
7183+ containers:
7184+ - name: main
7185+ env:
7186+ - name: ARGO_CONTAINER_NAME
7187+ value: main
7188+ status:
7189+ phase: Failed
7190+ reason: Evicted
7191+ message: "The node was low on resource: memory."
7192+ containerStatuses:
7193+ - name: main
7194+ ready: false
7195+ restartCount: 0
7196+ state:
7197+ running:
7198+ startedAt: "2021-01-22T09:50:16Z"
7199+ - name: wait
7200+ ready: false
7201+ restartCount: 0
7202+ state:
7203+ running:
7204+ startedAt: "2021-01-22T09:50:16Z"
7205+ `
7206+
7207+ func TestPodFailureWithRunningContainers (t * testing.T ) {
7208+ var pod apiv1.Pod
7209+ wfv1 .MustUnmarshal (podFailedWithRunningContainers , & pod )
7210+ assert .NotNil (t , pod )
7211+
7212+ // Pod has a message, so it should return that
7213+ nodeStatus , msg := newWoc ().inferFailedReason (& pod , nil )
7214+ assert .Equal (t , wfv1 .NodeFailed , nodeStatus )
7215+ assert .Equal (t , "The node was low on resource: memory." , msg )
7216+ }
7217+
7218+ // Test that when containers don't have terminated state and there's no pod message,
7219+ // we correctly return Failed with information about which containers couldn't be confirmed.
7220+ var podFailedWithRunningContainersNoMessage = `
7221+ apiVersion: v1
7222+ kind: Pod
7223+ metadata:
7224+ name: test-pod
7225+ spec:
7226+ containers:
7227+ - name: main
7228+ env:
7229+ - name: ARGO_CONTAINER_NAME
7230+ value: main
7231+ status:
7232+ phase: Failed
7233+ containerStatuses:
7234+ - name: main
7235+ ready: false
7236+ restartCount: 0
7237+ state:
7238+ running:
7239+ startedAt: "2021-01-22T09:50:16Z"
7240+ - name: wait
7241+ ready: false
7242+ restartCount: 0
7243+ state:
7244+ running:
7245+ startedAt: "2021-01-22T09:50:16Z"
7246+ `
7247+
7248+ func TestPodFailureWithRunningContainersNoMessage (t * testing.T ) {
7249+ var pod apiv1.Pod
7250+ wfv1 .MustUnmarshal (podFailedWithRunningContainersNoMessage , & pod )
7251+ assert .NotNil (t , pod )
7252+ nodeStatus , msg := newWoc ().inferFailedReason (& pod , nil )
7253+ assert .Equal (t , wfv1 .NodeFailed , nodeStatus )
7254+ assert .Equal (t , "pod failed: neither main nor wait container completed successfully" , msg )
7255+ }
7256+
7257+ // Test that sidecar SIGKILL still results in success when main and wait succeeded
7258+ var podFailedWithSidecarSigkill = `
7259+ apiVersion: v1
7260+ kind: Pod
7261+ metadata:
7262+ name: test-pod
7263+ spec:
7264+ containers:
7265+ - name: main
7266+ env:
7267+ - name: ARGO_CONTAINER_NAME
7268+ value: main
7269+ status:
7270+ phase: Failed
7271+ containerStatuses:
7272+ - name: main
7273+ ready: false
7274+ state:
7275+ terminated:
7276+ exitCode: 0
7277+ reason: Completed
7278+ - name: wait
7279+ ready: false
7280+ state:
7281+ terminated:
7282+ exitCode: 0
7283+ reason: Completed
7284+ - name: sidecar
7285+ ready: false
7286+ state:
7287+ terminated:
7288+ exitCode: 137
7289+ reason: Error
7290+ `
7291+
7292+ func TestPodFailureWithSidecarSigkill (t * testing.T ) {
7293+ var pod apiv1.Pod
7294+ wfv1 .MustUnmarshal (podFailedWithSidecarSigkill , & pod )
7295+ assert .NotNil (t , pod )
7296+ nodeStatus , msg := newWoc ().inferFailedReason (& pod , nil )
7297+ // Main and wait succeeded, sidecar was SIGKILL'd - this should be success
7298+ assert .Equal (t , wfv1 .NodeSucceeded , nodeStatus )
7299+ assert .Empty (t , msg )
7300+ }
7301+
71747302var podWithWaitContainerOOM = `
71757303apiVersion: v1
71767304kind: Pod
0 commit comments