openshift · openshift-merge-robot · Dec 17, 2022 · Nov 21, 2022 · petr-muller · Nov 23, 2022
diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml
@@ -87,9 +87,9 @@ spec:
     - alert: ClusterOperatorDown
       annotations:
         summary: Cluster operator has not been available for 10 minutes.
-        description: The {{ "{{ $labels.name }}" }} operator may be down or disabled, and the components it manages may be unavailable or degraded.  Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
+        description: The {{ "{{ $labels.name }}" }} operator may be down or disabled because {{ "{{ $labels.reason }}" }}, and the components it manages may be unavailable or degraded.  Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator {{ "{{ $labels.name }}" }}'{{ "{{ with $console_url := \"console_url\" | query }}{{ if ne (len (label \"url\" (first $console_url ) ) ) 0}} or {{ label \"url\" (first $console_url ) }}/settings/cluster/{{ end }}{{ end }}" }}.
       expr: |
-        max by (namespace, name) (cluster_operator_up{job="cluster-version-operator"} == 0)
+        max by (namespace, name, reason) (cluster_operator_up{job="cluster-version-operator"} == 0)
       for: 10m
       labels:
         severity: critical

diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go
@@ -94,8 +94,8 @@ version for 'cluster', or empty for 'initial'.
 		}, []string{"name"}),
 		clusterOperatorUp: prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "cluster_operator_up",
-			Help: "Reports key highlights of the active cluster operators.",
-		}, []string{"name", "version"}),
+			Help: "1 if a cluster operator is Available=True.  0 otherwise, including if a cluster operator sets no Available condition.  The 'version' label tracks the 'operator' version.  The 'reason' label is passed through from the Available condition, unless the cluster operator sets no Available condition, in which case NoAvailableCondition is used.",
+		}, []string{"name", "version", "reason"}),
 		clusterOperatorConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
 			Name: "cluster_operator_conditions",
 			Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
@@ -339,7 +339,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
 	ch <- m.version.WithLabelValues("", "", "", "").Desc()
 	ch <- m.availableUpdates.WithLabelValues("", "").Desc()
 	ch <- m.capability.WithLabelValues("").Desc()
-	ch <- m.clusterOperatorUp.WithLabelValues("", "").Desc()
+	ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
 	ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
 	ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
 	ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
@@ -489,12 +489,16 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
 		if version == "" {
 			klog.V(2).Infof("ClusterOperator %s is not setting the 'operator' version", op.Name)
 		}
-		g := m.clusterOperatorUp.WithLabelValues(op.Name, version)
-		if resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) {
-			g.Set(1)
-		} else {
-			g.Set(0)
+		var isUp float64
+		reason := "NoAvailableCondition"
+		if condition := resourcemerge.FindOperatorStatusCondition(op.Status.Conditions, configv1.OperatorAvailable); condition != nil {
+			reason = condition.Reason
+			if condition.Status == configv1.ConditionTrue {
+				isUp = 1
+			}
 		}
+		g := m.clusterOperatorUp.WithLabelValues(op.Name, version, reason)
+		g.Set(isUp)
 		ch <- g
 		for _, condition := range op.Status.Conditions {
 			if condition.Status != configv1.ConditionFalse && condition.Status != configv1.ConditionTrue {

diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go
@@ -193,7 +193,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
 					t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
 				}
 				expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
-				expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"})
+				expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1", "reason": "NoAvailableCondition"})
 				expectMetric(t, metrics[2], 1, map[string]string{"type": ""})
 			},
 		},