diff --git a/pkg/cli/admin/upgrade/status/alerts.go b/pkg/cli/admin/upgrade/status/alerts.go
new file mode 100644
index 0000000000..503c4ebf9f
--- /dev/null
+++ b/pkg/cli/admin/upgrade/status/alerts.go
@@ -0,0 +1,99 @@
+package status
+
+import (
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+)
+
+// AllowedAlerts are alerts that will be included in the health upgrade evaluation, even if they were triggered before the upgrade began.
+type AllowedAlerts map[string]struct{}
+
+var allowedAlerts AllowedAlerts = map[string]struct{}{
+	"PodDisruptionBudgetLimit":   {},
+	"PodDisruptionBudgetAtLimit": {},
+}
+
+func (al AllowedAlerts) Contains(alert string) bool {
+	_, exists := al[alert]
+	return exists
+}
+
+type AlertLabels struct {
+	AlertName string `json:"alertname,omitempty"`
+	Name      string `json:"name,omitempty"`
+	Namespace string `json:"namespace,omitempty"`
+	Reason    string `json:"reason,omitempty"`
+	Severity  string `json:"severity,omitempty"`
+}
+
+type AlertAnnotations struct {
+	Description string `json:"description,omitempty"`
+	Summary     string `json:"summary,omitempty"`
+	Runbook     string `json:"runbook_url,omitempty"`
+}
+
+type Alert struct {
+	Labels                  AlertLabels      `json:"labels,omitempty"`
+	Annotations             AlertAnnotations `json:"annotations,omitempty"`
+	State                   string           `json:"state,omitempty"`
+	Value                   string           `json:"value,omitempty"`
+	ActiveAt                time.Time        `json:"activeAt,omitempty"`
+	PartialResponseStrategy string           `json:"partialResponseStrategy,omitempty"`
+}
+
+// AlertData stores alert data returned by Thanos.
+type AlertData struct {
+	Status string `json:"status"`
+	Data   Data   `json:"data"`
+}
+
+type Data struct {
+	Alerts []Alert `json:"alerts"`
+}
+
+func parseAlertDataToInsights(alertData AlertData, startedAt time.Time) []updateInsight {
+	alerts := alertData.Data.Alerts
+	updateInsights := []updateInsight{}
+
+	for _, alert := range alerts {
+		if startedAt.After(alert.ActiveAt) && !allowedAlerts.Contains(alert.Labels.AlertName) {
+			continue
+		}
+		if alert.State == "pending" {
+			continue
+		}
+		updateInsights = append(updateInsights, updateInsight{
+			startedAt: alert.ActiveAt,
+			impact: updateInsightImpact{
+				level:       alertImpactLevel(alert.Labels.Severity),
+				impactType:  unknownImpactType,
+				summary:     "Alert: " + alert.Annotations.Summary,
+				description: alert.Annotations.Description,
+			},
+			remediation: updateInsightRemediation{reference: alert.Annotations.Runbook},
+			scope: updateInsightScope{
+				scopeType: scopeTypeCluster,
+				resources: []scopeResource{{
+					kind:      scopeGroupKind{group: configv1.GroupName, kind: "Alert"},
+					namespace: alert.Labels.Namespace,
+					name:      alert.Labels.AlertName,
+				}},
+			},
+		})
+	}
+	return updateInsights
+}
+
+func alertImpactLevel(ail string) impactLevel {
+	switch ail {
+	case "warning":
+		return warningImpactLevel
+	case "critical":
+		return criticalInfoLevel
+	case "info":
+		return infoImpactLevel
+	default:
+		return infoImpactLevel
+	}
+}
diff --git a/pkg/cli/admin/upgrade/status/alerts_test.go b/pkg/cli/admin/upgrade/status/alerts_test.go
new file mode 100644
index 0000000000..bb79a3e8bc
--- /dev/null
+++ b/pkg/cli/admin/upgrade/status/alerts_test.go
@@ -0,0 +1,156 @@
+package status
+
+import (
+	"reflect"
+	"testing"
+	"time"
+
+	configv1 "github.com/openshift/api/config/v1"
+)
+
+func TestParseAlertDataToInsights(t *testing.T) {
+	now := time.Now()
+
+	// Define test cases
+	tests := []struct {
+		name          string
+		alertData     AlertData
+		startedAt     time.Time
+		expectedCount int
+	}{
+		{
+			name: "Empty Alerts",
+			alertData: AlertData{
+				Data: Data{Alerts: []Alert{}},
+			},
+			startedAt:     now,
+			expectedCount: 0,
+		},
+		{
+			name: "Alert Active After Start Time",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(10 * time.Minute), Labels: AlertLabels{Severity: "critical", Namespace: "default", AlertName: "NodeDown"}, Annotations: AlertAnnotations{Summary: "Node is down"}},
+						{ActiveAt: now.Add(-10 * time.Minute), Labels: AlertLabels{Severity: "warning", Namespace: "default", AlertName: "DiskSpaceLow"}, Annotations: AlertAnnotations{Summary: "Disk space low"}},
+					},
+				},
+			},
+			startedAt:     now,
+			expectedCount: 1,
+		},
+		{
+			name: "Alert Active Before Start Time",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(-10 * time.Minute), Labels: AlertLabels{Severity: "warning", Namespace: "default", AlertName: "DiskSpaceLow"}, Annotations: AlertAnnotations{Summary: "Disk space low"}},
+					},
+				},
+			},
+			startedAt:     now,
+			expectedCount: 0,
+		},
+		{
+			name: "Alert Active Before Start Time, Allowed",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(-20 * time.Minute), Labels: AlertLabels{Severity: "info", Namespace: "default", AlertName: "PodDisruptionBudgetAtLimit"}, Annotations: AlertAnnotations{Summary: "PodDisruptionBudgetAtLimit is at limit"}},
+						{ActiveAt: now.Add(-20 * time.Minute), Labels: AlertLabels{Severity: "info", Namespace: "default", AlertName: "AlertmanagerReceiversNotConfigured"}, Annotations: AlertAnnotations{Summary: "Receivers (notification integrations) are not configured on Alertmanager"}},
+					},
+				},
+			},
+			startedAt:     now,
+			expectedCount: 1,
+		},
+		{
+			name: "Alert Active Before Start Time, Not Allowed",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(-20 * time.Minute), Labels: AlertLabels{Severity: "info", Namespace: "default", AlertName: "AlertmanagerReceiversNotConfigured"}, Annotations: AlertAnnotations{Summary: "Receivers (notification integrations) are not configured on Alertmanager"}},
+					},
+				},
+			},
+			startedAt:     now,
+			expectedCount: 0,
+		},
+	}
+
+	// Execute test cases
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			insights := parseAlertDataToInsights(tt.alertData, tt.startedAt)
+			if got := len(insights); got != tt.expectedCount {
+				t.Errorf("parseAlertDataToInsights() = %v, want %v", got, tt.expectedCount)
+			}
+		})
+	}
+}
+
+func TestParseAlertDataToInsightsWithData(t *testing.T) {
+	now := time.Now()
+
+	tests := []struct {
+		name             string
+		alertData        AlertData
+		startedAt        time.Time
+		expectedInsights []updateInsight
+	}{
+		{
+			name: "Alert Active After Start Time",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(10 * time.Minute), Labels: AlertLabels{Severity: "critical", Namespace: "default", AlertName: "NodeDown"}, Annotations: AlertAnnotations{Summary: "Node is down"}},
+					},
+				},
+			},
+			startedAt: now,
+			expectedInsights: []updateInsight{
+				{
+					startedAt: now.Add(10 * time.Minute),
+					impact: updateInsightImpact{
+						level:      alertImpactLevel("critical"),
+						impactType: unknownImpactType,
+						summary:    "Alert: Node is down",
+					},
+					scope: updateInsightScope{
+						scopeType: scopeTypeCluster,
+						resources: []scopeResource{
+							{
+								kind:      scopeGroupKind{group: configv1.GroupName, kind: "Alert"},
+								namespace: "default",
+								name:      "NodeDown",
+							},
+						},
+					},
+				},
+			},
+		},
+		{
+			name: "Alert Active Before Start Time",
+			alertData: AlertData{
+				Data: Data{
+					Alerts: []Alert{
+						{ActiveAt: now.Add(-10 * time.Minute), Labels: AlertLabels{Severity: "warning", Namespace: "default", AlertName: "DiskSpaceLow"}, Annotations: AlertAnnotations{Summary: "Disk space low"}},
+					},
+				},
+			},
+			startedAt:        now,
+			expectedInsights: []updateInsight{},
+		},
+	}
+
+	// Execute test cases
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			insights := parseAlertDataToInsights(tt.alertData, tt.startedAt)
+			if !reflect.DeepEqual(insights, tt.expectedInsights) {
+				t.Errorf("parseAlertDataToInsights() got %#v, want %#v", insights, tt.expectedInsights)
+			}
+		})
+	}
+
+}
diff --git a/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m-alerts.json b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m-alerts.json
new file mode 100644
index 0000000000..071965738b
--- /dev/null
+++ b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m-alerts.json
@@ -0,0 +1,224 @@
+{
+  "status": "success",
+  "data": {
+    "alerts": [
+      {
+        "labels": {
+          "alertname": "ClusterOperatorDegraded",
+          "name": "kube-apiserver",
+          "namespace": "openshift-cluster-version",
+          "reason": "NodeInstaller_InstallerPodFailed",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "The kube-apiserver operator is degraded because NodeInstaller_InstallerPodFailed, and the components it manages may have reduced quality of service. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator kube-apiserver' or https://console-openshift-console.apps.ci-ln-kwszvwk-76ef8.aws-2.ci.openshift.org/settings/cluster/.",
+          "summary": "Cluster operator has been degraded for 30 minutes."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:43:23.156620559Z",
+        "value": "1e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "ClusterOperatorDown",
+          "name": "authentication",
+          "namespace": "openshift-cluster-version",
+          "reason": "WellKnown_NotReady",
+          "severity": "critical"
+        },
+        "annotations": {
+          "description": "The authentication operator may be down or disabled because WellKnown_NotReady, and the components it manages may be unavailable or degraded. Cluster upgrades may not complete. For more information refer to 'oc get -o yaml clusteroperator authentication' or https://console-openshift-console.apps.ci-ln-kwszvwk-76ef8.aws-2.ci.openshift.org/settings/cluster/.",
+          "summary": "Cluster operator has not been available for 10 minutes."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:53:23.156620559Z",
+        "value": "0e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "UpdateAvailable",
+          "channel": "candidate-4.15",
+          "namespace": "openshift-cluster-version",
+          "severity": "info",
+          "upstream": "https://api.integration.openshift.com/api/upgrades_info/graph"
+        },
+        "annotations": {
+          "description": "For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.ci-ln-kwszvwk-76ef8.aws-2.ci.openshift.org/settings/cluster/.",
+          "summary": "Your upstream update recommendation service recommends you update your cluster."
+        },
+        "state": "firing",
+        "activeAt": "2023-11-24T15:31:53.183007399Z",
+        "value": "1e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "Watchdog",
+          "namespace": "openshift-monitoring",
+          "severity": "none"
+        },
+        "annotations": {
+          "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n",
+          "summary": "An alert that should always be firing to certify that Alertmanager is working properly."
+        },
+        "state": "firing",
+        "activeAt": "2023-11-24T15:27:54.164800319Z",
+        "value": "1e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "TargetDown",
+          "job": "marketplace-operator-metrics",
+          "namespace": "openshift-marketplace",
+          "service": "marketplace-operator-metrics",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "100% of the marketplace-operator-metrics/marketplace-operator-metrics targets in openshift-marketplace namespace have been unreachable for more than 15 minutes. This may be a symptom of network connectivity issues, down nodes, or failures within these components. Assess the health of the infrastructure and nodes running these targets and then contact support.",
+          "summary": "Some targets were not reachable from the monitoring server for an extended period of time."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:53:23.163229601Z",
+        "value": "1e+02",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "AlertmanagerReceiversNotConfigured",
+          "namespace": "openshift-monitoring",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur. Check the OpenShift documentation to learn how to configure notifications with Alertmanager.",
+          "summary": "Receivers (notification integrations) are not configured on Alertmanager",
+          "runbook_url": "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/AlertManager.md"
+        },
+        "state": "firing",
+        "activeAt": "2023-11-23T15:47:42Z",
+        "value": "0e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "KubeStateMetricsWatchErrors",
+          "namespace": "openshift-monitoring",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.",
+          "summary": "kube-state-metrics is experiencing errors in watch operations."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:42:13.192504153Z",
+        "value": "4.166666666666665e-01",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "KubePodNotReady",
+          "namespace": "openshift-kube-apiserver",
+          "pod": "kube-apiserver-startup-monitor-ip-10-0-60-26.us-west-1.compute.internal",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "Pod openshift-kube-apiserver/kube-apiserver-startup-monitor-ip-10-0-60-26.us-west-1.compute.internal has been in a non-ready state for longer than 15 minutes.",
+          "runbook_url": "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md",
+          "summary": "Pod has been in a non-ready state for more than 15 minutes."
+        },
+        "state": "firing",
+        "activeAt": "2023-11-24T15:41:52.75038242Z",
+        "value": "1e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "KubeAPIDown",
+          "severity": "critical"
+        },
+        "annotations": {
+          "description": "KubeAPI has disappeared from Prometheus target discovery.",
+          "runbook_url": "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubeAPIDown.md",
+          "summary": "Target disappeared from Prometheus target discovery."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:45:26.218594457Z",
+        "value": "1e+00",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertmanager": "https://10.128.0.127:9095/api/v2/alerts",
+          "alertname": "PrometheusErrorSendingAlertsToSomeAlertmanagers",
+          "container": "kube-rbac-proxy",
+          "endpoint": "metrics",
+          "instance": "10.128.0.132:9092",
+          "job": "prometheus-k8s",
+          "namespace": "openshift-monitoring",
+          "pod": "prometheus-k8s-0",
+          "service": "prometheus-k8s",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "21.7% errors while sending alerts from Prometheus openshift-monitoring/prometheus-k8s-0 to Alertmanager https://10.128.0.127:9095/api/v2/alerts.",
+          "summary": "Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:42:43.329697712Z",
+        "value": "2.1701925925925927e+01",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "PodDisruptionBudgetAtLimit",
+          "controller": "alertmanager",
+          "namespace": "openshift-monitoring",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "PodDisruptionBudgetAtLimit. in namespace <>",
+          "runbook_url": "https:///PDB.md",
+          "summary": "PodDisruptionBudgetAtLimit for pods <>"
+        },
+        "state": "firing",
+        "activeAt": "2023-11-23T15:39:33.014999722Z",
+        "value": "6.708842592592593e-01",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "PrometheusOperatorWatchErrors",
+          "controller": "prometheus",
+          "namespace": "openshift-monitoring",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "Errors while performing watch operations in controller prometheus in openshift-monitoring namespace.",
+          "summary": "Errors while performing watch operations in controller."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:39:33.014999722Z",
+        "value": "7.103480392156862e-01",
+        "partialResponseStrategy": "WARN"
+      },
+      {
+        "labels": {
+          "alertname": "PrometheusOperatorWatchErrors",
+          "controller": "thanos",
+          "namespace": "openshift-monitoring",
+          "severity": "warning"
+        },
+        "annotations": {
+          "description": "Errors while performing watch operations in controller thanos in openshift-monitoring namespace.",
+          "summary": "Errors while performing watch operations in controller."
+        },
+        "state": "pending",
+        "activeAt": "2023-11-24T15:39:33.014999722Z",
+        "value": "6.557058823529411e-01",
+        "partialResponseStrategy": "WARN"
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.detailed-output b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.detailed-output
index 6225bf52a2..a7381ef276 100644
--- a/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.detailed-output
+++ b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.detailed-output
@@ -33,3 +33,21 @@ Message: Cluster Operator machine-config is unavailable (MachineConfigController
   Resources:
     clusteroperators.config.openshift.io: machine-config
   Description: Cluster not available for [{operator 4.14.0-rc.3}]: ControllerConfig.machineconfiguration.openshift.io "machine-config-controller" is invalid: [status.controllerCertificates[0].notAfter: Required value, status.controllerCertificates[0].notBefore: Required value, status.controllerCertificates[1].notAfter: Required value, status.controllerCertificates[1].notBefore: Required value, status.controllerCertificates[2].notAfter: Required value, status.controllerCertificates[2].notBefore: Required value, status.controllerCertificates[3].notAfter: Required value, status.controllerCertificates[3].notBefore: Required value, status.controllerCertificates[4].notAfter: Required value, status.controllerCertificates[4].notBefore: Required value, status.controllerCertificates[5].notAfter: Required value, status.controllerCertificates[5].notBefore: Required value, status.controllerCertificates[6].notAfter: Required value, status.controllerCertificates[6].notBefore: Required value, status.controllerCertificates[7].notAfter: Required value, status.controllerCertificates[7].notBefore: Required value, status.controllerCertificates[8].notAfter: Required value, status.controllerCertificates[8].notBefore: Required value, status.controllerCertificates[9].notAfter: Required value, status.controllerCertificates[9].notBefore: Required value, : Invalid value: "null": some validation rules were not checked because the object was invalid; correct the existing errors to complete validation]
+
+Message: Alert: Pod has been in a non-ready state for more than 15 minutes.
+  Since:       6m35s
+  Level:       Warning
+  Impact:      Unknown
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/KubePodNotReady.md
+  Resources:
+    alerts.config.openshift.io: openshift-kube-apiserver/KubePodNotReady
+  Description: Pod openshift-kube-apiserver/kube-apiserver-startup-monitor-ip-10-0-60-26.us-west-1.compute.internal has been in a non-ready state for longer than 15 minutes.
+
+Message: Alert: PodDisruptionBudgetAtLimit for pods <>
+  Since:       24h8m54s
+  Level:       Warning
+  Impact:      Unknown
+  Reference:   https:///PDB.md
+  Resources:
+    alerts.config.openshift.io: openshift-monitoring/PodDisruptionBudgetAtLimit
+  Description: PodDisruptionBudgetAtLimit. in namespace <>
diff --git a/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.output b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.output
index 5289e9e559..c40a74b8c4 100644
--- a/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.output
+++ b/pkg/cli/admin/upgrade/status/examples/4.15.0-ec2-unavailable-mco-20m.output
@@ -25,7 +25,9 @@ ip-10-0-4-159.us-east-2.compute.internal   Outdated    Pending   4.14.0-rc.3
 ip-10-0-99-40.us-east-2.compute.internal   Outdated    Pending   4.14.0-rc.3   ?
 
 = Update Health =
-SINCE    LEVEL   IMPACT             MESSAGE
-20m24s   Error   API Availability   Cluster Operator machine-config is unavailable (MachineConfigControllerFailed)
+SINCE      LEVEL     IMPACT             MESSAGE
+20m24s     Error     API Availability   Cluster Operator machine-config is unavailable (MachineConfigControllerFailed)
+6m35s      Warning   Unknown            Alert: Pod has been in a non-ready state for more than 15 minutes.
+24h8m54s   Warning   Unknown            Alert: PodDisruptionBudgetAtLimit for pods <>
 
 Run with --details=health for additional description and links to related online documentation
diff --git a/pkg/cli/admin/upgrade/status/health.go b/pkg/cli/admin/upgrade/status/health.go
index ef047b0c4a..07e8953526 100644
--- a/pkg/cli/admin/upgrade/status/health.go
+++ b/pkg/cli/admin/upgrade/status/health.go
@@ -14,6 +14,7 @@ type scopeType string
 const (
 	scopeTypeControlPlane scopeType = "ControlPlane"
 	scopeTypeWorkerPool   scopeType = "WorkerPool"
+	scopeTypeCluster      scopeType = "Cluster"
 )
 
 type scopeGroupKind struct {
@@ -83,6 +84,7 @@ type impactType string
 // considered whether these are exactly the ones that we need.
 const (
 	noneImpactType                    impactType = "None"
+	unknownImpactType                 impactType = "Unknown"
 	apiAvailabilityImpactType         impactType = "API Availability"
 	clusterCapacityImpactType         impactType = "Cluster Capacity"
 	applicationAvailabilityImpactType impactType = "Application Availability"
diff --git a/pkg/cli/admin/upgrade/status/mockresources.go b/pkg/cli/admin/upgrade/status/mockresources.go
index 161740285e..5e41f00f71 100644
--- a/pkg/cli/admin/upgrade/status/mockresources.go
+++ b/pkg/cli/admin/upgrade/status/mockresources.go
@@ -17,6 +17,7 @@ type mockData struct {
 	machineConfigPoolsPath string
 	machineConfigsPath     string
 	nodesPath              string
+	alertsPath             string
 	clusterVersion         *configv1.ClusterVersion
 	clusterOperators       *configv1.ClusterOperatorList
 	machineConfigPools     *machineconfigv1.MachineConfigPoolList
diff --git a/pkg/cli/admin/upgrade/status/status.go b/pkg/cli/admin/upgrade/status/status.go
index a60470e991..af45e644e4 100644
--- a/pkg/cli/admin/upgrade/status/status.go
+++ b/pkg/cli/admin/upgrade/status/status.go
@@ -3,7 +3,9 @@ package status
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
+	"os"
 	"strings"
 	"time"
 
@@ -18,8 +21,11 @@ import (
 
 	configv1 "github.com/openshift/api/config/v1"
 	machineconfigv1 "github.com/openshift/api/machineconfiguration/v1"
+	routev1 "github.com/openshift/api/route/v1"
 	configv1client "github.com/openshift/client-go/config/clientset/versioned"
 	machineconfigv1client "github.com/openshift/client-go/machineconfiguration/clientset/versioned"
+	routev1client "github.com/openshift/client-go/route/clientset/versioned/typed/route/v1"
+	"github.com/openshift/oc/pkg/cli/admin/inspectalerts"
 	"github.com/openshift/oc/pkg/cli/admin/upgrade/status/mco"
 )
 
@@ -67,6 +73,8 @@ type options struct {
 	ConfigClient        configv1client.Interface
 	CoreClient          corev1client.CoreV1Interface
 	MachineConfigClient machineconfigv1client.Interface
+	RouteClient         routev1client.RouteV1Interface
+	getAlerts           func(ctx context.Context) ([]byte, error)
 }
 
 func (o *options) enabledDetailed(what string) bool {
@@ -88,6 +96,7 @@ func (o *options) Complete(f kcmdutil.Factory, cmd *cobra.Command, args []string
 		o.mockData.machineConfigPoolsPath = strings.Replace(o.mockData.cvPath, cvSuffix, "-mcp.yaml", 1)
 		o.mockData.machineConfigsPath = strings.Replace(o.mockData.cvPath, cvSuffix, "-mc.yaml", 1)
 		o.mockData.nodesPath = strings.Replace(o.mockData.cvPath, cvSuffix, "-node.yaml", 1)
+		o.mockData.alertsPath = strings.Replace(o.mockData.cvPath, cvSuffix, "-alerts.json", 1)
 	}
 
 	if o.mockData.cvPath == "" {
@@ -110,6 +119,19 @@ func (o *options) Complete(f kcmdutil.Factory, cmd *cobra.Command, args []string
 			return err
 		}
 		o.CoreClient = coreClient
+
+		routeClient, err := routev1client.NewForConfig(cfg)
+		if err != nil {
+			return err
+		}
+		o.RouteClient = routeClient
+
+		routeGetter := func(ctx context.Context, namespace string, name string, opts metav1.GetOptions) (*routev1.Route, error) {
+			return routeClient.Routes(namespace).Get(ctx, name, opts)
+		}
+		o.getAlerts = func(ctx context.Context) ([]byte, error) {
+			return inspectalerts.GetAlerts(ctx, routeGetter, cfg.BearerToken)
+		}
 	} else {
 		err := o.mockData.load()
 		if err != nil {
@@ -249,6 +271,26 @@
 	}
 	updatingFor := now.Sub(startedAt).Round(time.Second)
+
+	// Get the alerts for the cluster. If the alerts cannot be fetched, tell the user that alerts
+	// are not being included in 'Update Health'; the rest of the command should still work.
+	var alertData AlertData
+	var alertBytes []byte
+	var err error
+	if ap := o.mockData.alertsPath; ap != "" {
+		alertBytes, err = os.ReadFile(ap)
+	} else {
+		alertBytes, err = o.getAlerts(ctx)
+	}
+	if err != nil {
+		fmt.Fprintf(o.Out, "Unable to fetch alerts, ignoring alerts in 'Update Health': %v\n", err)
+	} else {
+		// Unmarshal the JSON payload returned by the alerts endpoint
+		if err := json.Unmarshal(alertBytes, &alertData); err != nil {
+			fmt.Fprintf(o.Out, "Ignoring alerts in 'Update Health'. Error unmarshaling alerts: %v\n", err)
+		}
+		updateInsights = append(updateInsights, parseAlertDataToInsights(alertData, startedAt)...)
+	}
+
 	controlPlaneStatusData, insights := assessControlPlaneStatus(cv, operators.Items, now)
 	updateInsights = append(updateInsights, insights...)
 	_ = controlPlaneStatusData.Write(o.Out)
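
Reviewer note: the filtering that parseAlertDataToInsights applies (drop "pending" alerts, and drop alerts that became active before the update started unless they are on the allowedAlerts list) can be tried outside the command. The sketch below is a minimal, self-contained approximation of that behavior against a hand-written Thanos-style payload; the trimmed alert/alertData types and the sample timestamps here are illustrative stand-ins, not part of this change.

package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Trimmed stand-ins for the PR's Alert/AlertData structs, kept only to show the
// shape of the alerts payload; labels are collapsed into a plain map here.
type alert struct {
	Labels   map[string]string `json:"labels"`
	State    string            `json:"state"`
	ActiveAt time.Time         `json:"activeAt"`
}

type alertData struct {
	Status string `json:"status"`
	Data   struct {
		Alerts []alert `json:"alerts"`
	} `json:"data"`
}

func main() {
	payload := []byte(`{
	  "status": "success",
	  "data": {"alerts": [
	    {"labels": {"alertname": "KubePodNotReady", "severity": "warning"}, "state": "firing", "activeAt": "2023-11-24T15:41:52Z"},
	    {"labels": {"alertname": "KubeAPIDown", "severity": "critical"}, "state": "pending", "activeAt": "2023-11-24T15:45:26Z"}
	  ]}
	}`)

	var ad alertData
	if err := json.Unmarshal(payload, &ad); err != nil {
		panic(err)
	}

	// Hypothetical update start time for the example.
	startedAt := time.Date(2023, 11, 24, 15, 30, 0, 0, time.UTC)
	for _, a := range ad.Data.Alerts {
		// Mirror the PR's filtering: skip pending alerts and alerts that fired
		// before the update started (the allow-list exception is omitted here).
		if a.State == "pending" || startedAt.After(a.ActiveAt) {
			continue
		}
		fmt.Printf("would report insight for %s (severity %s)\n", a.Labels["alertname"], a.Labels["severity"])
	}
}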