Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pkg/monitor/monitorapi/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@ const (
FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
FailedContactingAPIReason IntervalReason = "FailedContactingAPI"

UnhealthyReason IntervalReason = "Unhealthy"

UpgradeStartedReason IntervalReason = "UpgradeStarted"
UpgradeVersionReason IntervalReason = "UpgradeVersion"
UpgradeRollbackReason IntervalReason = "UpgradeRollback"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals
m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals)
registry.AddPathologicalEventMatcherOrDie(m)

// Prometheus pods may have readiness probe errors during upgrades.
// This matcher tolerates up to repeatThresholdOverride repetitions of the
// "Unhealthy" / "Readiness probe errored" event on prometheus-k8s-0 and
// prometheus-k8s-1 in the openshift-monitoring namespace.
registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{
	name: "PrometheusReadinessProbeErrors",
	locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
		// The pod character class is written as [01]: inside a character class a
		// comma is a literal, so "[0,1]" would also accept "prometheus-k8s-,".
		monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
		monitorapi.LocatorPodKey:       regexp.MustCompile(`^prometheus-k8s-[01]$`),
	},
	// Anchored so that only the exact reason "Unhealthy" matches.
	messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
	// Unanchored: matches both "Readiness probe errored: ..." and
	// "Readiness probe errored and resulted in unknown state: ...".
	messageHumanRegex: regexp.MustCompile("Readiness probe errored"),
	jira:              "https://issues.redhat.com/browse/OCPBUGS-62703",
	/*
		05:50:32 openshift-monitoring kubelet prometheus-k8s-1
		Unhealthy
		Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found

		05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
		Unhealthy
		Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1

		11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
		Unhealthy
		Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1

		Readiness probes run during the whole lifecycle of the container, including termination.
		Prometheus pods may take some time to stop, and thus produce more kubelet probe events than the default threshold (20) permits.
		With a termination grace period of 600s, these pods may produce probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.

		To address this, the threshold is set to 100, in the region of 600 (termination period) / 5 (probe interval) = 120: high enough to allow the readiness probe errors expected during an upgrade, but not so high that a real problem would be missed.
		The job below hit ~60 readiness errors during the upgrade, which makes sense to ignore:
		https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048
		However, the job below hit readiness errors 774 times during the upgrade, which should be caught:
		https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856
	*/
	repeatThresholdOverride: 100,
})

return registry
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package pathologicaleventlibrary

import (
_ "embed"
"fmt"
"testing"
"time"

Expand Down Expand Up @@ -666,3 +667,118 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) {
})
}
}

// TestPathologicalEventsPrometheusReadinessProbeErrors feeds single pathological
// "Unhealthy" readiness-probe intervals through the upgrade matcher registry and
// checks the resulting junit for the interval's namespace: cases with an empty
// expectation must carry no failure output, the others must fail with exactly
// the expected message.
func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) {
	const (
		monitoringNS = "openshift-monitoring"
		suiteName    = "events should not repeat"

		// Human messages as emitted by the kubelet for the three observed variants.
		stoppingMsg     = "Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1"
		notFoundMsg     = "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found"
		unknownStateMsg = "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1"
	)

	// rejection renders the failure output expected for a single rejected event.
	rejection := func(count int, ns, pod, msg string) string {
		return fmt.Sprintf(
			"1 events happened too frequently\n\nevent happened %d times, something is wrong: namespace/%s pod/%s - reason/Unhealthy %s (00:00:00Z) result=reject ",
			count, ns, pod, msg,
		)
	}

	cases := []struct {
		name  string
		want  string // expected failure output; empty means the junit must succeed
		pod   string
		ns    string
		msg   string
		count int
	}{
		{
			name:  "Readiness probe error (stopping container) on first Prometheus pod",
			pod:   "prometheus-k8s-0",
			ns:    monitoringNS,
			msg:   stoppingMsg,
			count: 100,
		},
		{
			name:  "Readiness probe error (terminated container) on second Prometheus pod",
			pod:   "prometheus-k8s-1",
			ns:    monitoringNS,
			msg:   notFoundMsg,
			count: 100,
		},
		{
			name:  "Readiness probe error (stopping container, different human message) on second Prometheus pod",
			pod:   "prometheus-k8s-1",
			ns:    monitoringNS,
			msg:   unknownStateMsg,
			count: 100,
		},
		{
			name:  "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored",
			want:  rejection(100, "foo", "prometheus-k8s-1", unknownStateMsg),
			pod:   "prometheus-k8s-1",
			ns:    "foo",
			msg:   unknownStateMsg,
			count: 100,
		},
		{
			name:  "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored",
			want:  rejection(100, monitoringNS, "prometheus-k8s-2", unknownStateMsg),
			pod:   "prometheus-k8s-2",
			ns:    monitoringNS,
			msg:   unknownStateMsg,
			count: 100,
		},
		{
			name:  "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit",
			want:  rejection(101, monitoringNS, "prometheus-k8s-1", unknownStateMsg),
			pod:   "prometheus-k8s-1",
			ns:    monitoringNS,
			msg:   unknownStateMsg,
			count: 101,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// One pathological interval, located by namespace and pod, carrying the
			// repetition count as an annotation.
			interval := monitorapi.Interval{
				Condition: monitorapi.Condition{
					Locator: monitorapi.Locator{
						Type: monitorapi.LocatorTypePod,
						Keys: map[monitorapi.LocatorKey]string{
							monitorapi.LocatorNamespaceKey: tc.ns,
							monitorapi.LocatorPodKey:       tc.pod,
						},
					},
					Message: monitorapi.Message{
						Reason:       monitorapi.UnhealthyReason,
						HumanMessage: tc.msg,
						Annotations: map[monitorapi.AnnotationKey]string{
							monitorapi.AnnotationCount:        fmt.Sprintf("%d", tc.count),
							monitorapi.AnnotationPathological: "true",
						},
					},
				},
			}
			intervals := monitorapi.Intervals{interval}

			evaluator := duplicateEventsEvaluator{
				registry: NewUpgradePathologicalEventMatchers(nil, intervals),
			}
			results := evaluator.testDuplicatedEvents(suiteName, false, intervals, nil, false)

			// Only the first junit named after this case's namespace is inspected.
			wantName := getJUnitName(suiteName, tc.ns)
			for _, res := range results {
				if res.Name != wantName {
					continue
				}
				if tc.want == "" {
					require.Nil(t, res.FailureOutput, "expected success but got failure output for junit: %s", res.Name)
					return
				}
				require.NotNil(t, res.FailureOutput, "expected junit to have failure output")
				require.Equal(t, tc.want, res.FailureOutput.Output)
				return
			}
			// NOTE(review): when no returned junit is named wantName, the subtest ends
			// here without asserting anything. Confirm that every namespace used above
			// (including "foo") actually yields a junit with that name.
		})
	}
}