diff --git a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml index 6ad9e2624..edf1b0a1d 100644 --- a/install/0000_90_cluster-version-operator_02_servicemonitor.yaml +++ b/install/0000_90_cluster-version-operator_02_servicemonitor.yaml @@ -82,7 +82,11 @@ spec: annotations: message: Cluster operator {{ "{{ $labels.name }}" }} has been degraded for 10 minutes. Operator is degraded because {{ "{{ $labels.reason }}" }} and cluster upgrades will be unstable. expr: | - cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1 + ( + cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} + or on (name) + group by (name) (cluster_operator_up{job="cluster-version-operator"}) + ) == 1 for: 10m labels: severity: critical diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index fdeec7acb..b63bc5dd6 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -358,9 +358,7 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { klog.V(4).Infof("ClusterOperator %s is not setting the 'operator' version", op.Name) } g := m.clusterOperatorUp.WithLabelValues(op.Name, version) - failing := resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorDegraded) - available := resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) - if available && !failing { + if resourcemerge.IsOperatorStatusConditionTrue(op.Status.Conditions, configv1.OperatorAvailable) { g.Set(1) } else { g.Set(0) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index d74f07fe6..0891b50df 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -170,7 +170,67 @@ func Test_operatorMetrics_Collect(t *testing.T) { }, }, { - name: "collects cluster operator status failure", + name: "collects cluster operator without conditions", + optr: &Operator{ + coLister: &coLister{ + Items: []*configv1.ClusterOperator{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{ + {Name: "operator", Version: "10.1.5-1"}, + {Name: "operand", Version: "10.1.5-2"}, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 3 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[2], 1, map[string]string{"type": ""}) + }, + }, + { + name: "collects cluster operator unavailable", + optr: &Operator{ + coLister: &coLister{ + Items: []*configv1.ClusterOperator{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Status: configv1.ClusterOperatorStatus{ + Versions: []configv1.OperandVersion{ + {Name: "operator", Version: "10.1.5-1"}, + {Name: "operand", Version: "10.1.5-2"}, + }, + Conditions: []configv1.ClusterOperatorStatusCondition{ + {Type: configv1.OperatorAvailable, Status: configv1.ConditionFalse}, + }, + }, + }, + }, + }, + }, + wants: func(t *testing.T, metrics []prometheus.Metric) { + if len(metrics) != 4 { + t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) + } + expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) + expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[2], 0, map[string]string{"name": "test", "condition": "Available"}) + expectMetric(t, metrics[3], 1, map[string]string{"type": ""}) + }, + }, + { + name: "collects cluster operator degraded", optr: &Operator{ coLister: &coLister{ Items: []*configv1.ClusterOperator{ @@ -197,7 +257,7 @@ func Test_operatorMetrics_Collect(t *testing.T) { t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics)) } expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""}) - expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"}) + expectMetric(t, metrics[1], 1, map[string]string{"name": "test", "version": "10.1.5-1"}) expectMetric(t, metrics[2], 1, map[string]string{"name": "test", "condition": "Available"}) expectMetric(t, metrics[3], 1, map[string]string{"name": "test", "condition": "Degraded"}) expectMetric(t, metrics[4], 1, map[string]string{"type": ""})