pkg/monitor/monitorapi/types.go (2 additions, 0 deletions)
@@ -251,6 +251,8 @@ const (
FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser"
FailedContactingAPIReason IntervalReason = "FailedContactingAPI"

UnhealthyReason IntervalReason = "Unhealthy"

UpgradeStartedReason IntervalReason = "UpgradeStarted"
UpgradeVersionReason IntervalReason = "UpgradeVersion"
UpgradeRollbackReason IntervalReason = "UpgradeRollback"

@@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)

prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
Contributor:

The function name indicates this is intended to relax the check only during upgrades, but it's added to the universal set here, not the upgrade-specific set below. It should probably be moved down into that function.
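
If it does move, a minimal sketch of what that could look like, assuming the upgrade-specific registry is built by a function along the lines of NewUpgradePathologicalEventMatchers and that it layers on top of the universal set (both are assumptions inferred from the "upgrade specific set below" mentioned here, not confirmed names):

```go
// Sketch only: the function name, return type, and its relationship to the
// universal registry are assumptions based on the comment above.
func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals monitorapi.Intervals) *AllowedPathologicalEventRegistry {
	// Assumed: upgrade jobs start from the universal matchers and add
	// upgrade-only exceptions on top.
	registry := NewUniversalPathologicalEventMatchers(kubeConfig, finalIntervals)

	// Register the Prometheus readiness-probe exception only for upgrade jobs.
	prometheusMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
	registry.AddPathologicalEventMatcherOrDie(prometheusMatcher)

	return registry
}
```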

Member Author:

I saw this fail for what look to be non-upgrade jobs, such as periodic-ci-openshift-release-master-nightly-4.21-e2e-agent-ha-dualstack-conformance. I should probably rename this to drop the DuringUpgrades part so it isn't misleading.

PLMK if you still think this should be moved to upgrade jobs exclusively.

Contributor:

Is there an explanation for why Prometheus would be getting killed in a non-upgrade job? As far as I understand this, that would be quite unexpected and should still get flagged. Let me know if you have that job run handy, I'm curious.

Member Author:

From [openshift-origin-30372-nightly-4.21-e2e-agent-ha-dualstack-conformance], it looks like a liveness probe failure:

| Time | Namespace | Source | Pod | Reason | Message |
| --- | --- | --- | --- | --- | --- |
| 22:57:35 (x6) | openshift-monitoring | kubelet | prometheus-k8s-0 | Unhealthy | Liveness probe failed: command timed out |
| 22:57:35 | openshift-monitoring | kubelet | prometheus-k8s-0 | Killing | Container prometheus failed liveness probe, will be restarted |

The Prometheus container in the affected pod seems fine, but other containers in the pod seem to be facing connection issues (possibly due to https://issues.redhat.com/browse/OCPBUGS-32021?).

Contributor:

I doubt that https://issues.redhat.com/browse/OCPBUGS-32021 is involved here: the error logs come from the router which opens a TCP connection and then drops it after a successful connect.

Contributor:

The link to the job above seemingly didn't work, but it looks like it's probably a different symptom and would not hit your matcher as defined. I think it's best to make this an upgrade-specific exception, as the upgrade is the only place we expect this 10-minute delay.

Member Author (@rexagod, Nov 4, 2025):

I see. Unlike https://issues.redhat.com/browse/OCPBUGS-5916, we are indeed able to establish a successful connection after retries (though IIUC the connection heals in the linked ticket as well, since no functional disruption is reported?).

Ah, my bad. I've appended more context to the snippet (PTAL below), and here's the event-filter link. As you can see, the readiness error will register a PathologicalNew error if not explicitly ignored (>20 pings), and AFAICT this will be caught by the matcher.

| Time | Namespace | Source | Pod | Reason | Message |
| --- | --- | --- | --- | --- | --- |
| 22:57:35 (x6) | openshift-monitoring | kubelet | prometheus-k8s-0 | Unhealthy | Liveness probe failed: command timed out |
| 22:57:35 | openshift-monitoring | kubelet | prometheus-k8s-0 | Killing | Container prometheus failed liveness probe, will be restarted |
| 22:57:39 (x7) | openshift-monitoring | kubelet | prometheus-k8s-0 | Unhealthy | Readiness probe failed: command timed out |
| 23:02:09 (x55) | openshift-monitoring | kubelet | prometheus-k8s-0 | Unhealthy | Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 |

Contributor (@dgoodwin, Nov 4, 2025):

Let's keep this upgrade-specific unless there's a very clear explanation for why this is expected and OK in a non-upgrade job, which AFAICT there is not. Looking at the intervals from your job run, you can see there is mass disruption at the time we lose the readiness probe. Problems occurring during that window are not the kind of thing we want to hide.

This dualstack job has a known bug related to that network disruption.
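
If scoping by job type alone isn't enough, another way to keep the allowance narrow (shown only as an illustration) would be the allowIfWithinIntervals mechanism already used by newCrioReloadedTooOftenEventMatcher later in this file. A rough sketch, assuming the Upgrade* interval reasons adequately bound the upgrade window, which is an assumption about how those intervals are recorded; restrictToUpgradeWindows is a hypothetical helper:

```go
// Sketch: restrict the allowance to upgrade windows, mirroring the
// allowIfWithinIntervals pattern used by newCrioReloadedTooOftenEventMatcher.
// Whether the Upgrade* reasons produce intervals that actually span the
// upgrade window is an assumption here.
func restrictToUpgradeWindows(matcher *SimplePathologicalEventMatcher, finalIntervals monitorapi.Intervals) {
	upgradeWindows := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
		return eventInterval.Message.Reason == monitorapi.UpgradeStartedReason ||
			eventInterval.Message.Reason == monitorapi.UpgradeRollbackReason
	})
	// With no upgrade window present (i.e. a non-upgrade job), never allow the event.
	if len(upgradeWindows) == 0 {
		matcher.neverAllow = true
		return
	}
	matcher.allowIfWithinIntervals = upgradeWindows
}
```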

registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)

return registry
}

@@ -1171,3 +1174,62 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
allowIfWithinIntervals: crioReloadedIntervals,
}
}

func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
podNamePrefix := "prometheus-k8s"
podNamespace := "openshift-monitoring"
messageHumanizedSubstring := "Readiness probe errored"
messageReason := monitorapi.UnhealthyReason
matcher := &SimplePathologicalEventMatcher{
name: "PrometheusReadinessProbeErrorsDuringUpgrades",
locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`),
monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[01]$`),
},
messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`),
messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring),
jira: "https://issues.redhat.com/browse/OCPBUGS-62703",
}

// Sanity check in case no `finalIntervals` are provided.
if len(finalIntervals) == 0 {
matcher.neverAllow = true
return matcher
}

/*
05:50:32 openshift-monitoring kubelet prometheus-k8s-1
Unhealthy
Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found

05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
Unhealthy
Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1

11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
Unhealthy
Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
*/
testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
return eventInterval.Locator.Type == monitorapi.LocatorTypePod &&
eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace &&
strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) &&
eventInterval.Message.Reason == messageReason &&
strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
})
Contributor:

Unless I'm missing something, looping through all the intervals here is not required. It effectively duplicates the matcher logic above; I think all you really need is to return the matcher with matcher.repeatThresholdOverride = 100 set, and the framework should handle the rest as far as I can tell. Checking whether the matcher will match anything before setting its threshold looks unnecessary to me.
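
For reference, a minimal sketch of the simplification being suggested here, using the same field names as the code under review (whether the framework applies the threshold without the pre-scan is per the note above, not verified here):

```go
// Sketch of the suggested simplification: no interval pre-scan, just return
// the matcher with the higher repeat threshold.
func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(_ monitorapi.Intervals) EventMatcher {
	return &SimplePathologicalEventMatcher{
		name: "PrometheusReadinessProbeErrorsDuringUpgrades",
		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
			monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`),
			monitorapi.LocatorPodKey:       regexp.MustCompile(`^prometheus-k8s-[01]$`),
		},
		messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`),
		messageHumanRegex:  regexp.MustCompile(`Readiness probe errored`),
		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
		// Allow up to 100 repeats during the upgrade (see the threshold discussion below).
		repeatThresholdOverride: 100,
	}
}
```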


if len(testIntervals) > 0 {
// Readiness probe errors are expected during upgrades, allow a higher threshold.
Contributor:

Maybe be explicit that readiness probes run during the entire lifecycle of the container (including termination) and that Prometheus may take "some time" to stop (hence the default termination grace period of 600s), which explains the probe errors (e.g., the web service is stopped but the process is still running).

https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes
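
A possible expanded wording for that code comment, as a sketch (the 600s figure is taken from the remark above and assumed to be the pod's terminationGracePeriodSeconds):

```go
// Readiness probes keep running for the entire container lifecycle, including
// termination (see the Kubernetes probe docs). Prometheus can take a while to
// shut down (terminationGracePeriodSeconds is 600s here), so during that
// window the web service is already stopped while the process is still
// running, which is why the readiness probe errors out.
```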

// Set the threshold to 100 to allow for a high number of readiness probe errors
Contributor:

600s (termination grace period) / 5s (probe interval) = 120, but I agree that 100 is a good enough value.

// during the upgrade, but not so high that we would miss a real problem.
// For example, this job hit ~60 readiness errors during the upgrade, which is typical:
// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048
// However, this single-node RT job hit readiness errors 774 times during the upgrade:
// https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856
matcher.repeatThresholdOverride = 100
} else {
matcher.neverAllow = true
}

return matcher
}