Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Revert "Revert "Merge pull request #30269 from hongkailiu/OTA-1626""
This reverts commit aba036a.
  • Loading branch information
hongkailiu committed Oct 27, 2025
commit b66a7e41d08c761fb4777bc26f06821038c7d41c
2 changes: 2 additions & 0 deletions pkg/defaultmonitortests/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/openshift/origin/pkg/monitortests/authentication/requiredsccmonitortests"
admupgradestatus "github.com/openshift/origin/pkg/monitortests/cli/adm_upgrade/status"
azuremetrics "github.com/openshift/origin/pkg/monitortests/cloud/azure/metrics"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/clusterversionchecker"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/legacycvomonitortests"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/operatorstateanalyzer"
"github.com/openshift/origin/pkg/monitortests/clusterversionoperator/terminationmessagepolicy"
Expand Down Expand Up @@ -172,6 +173,7 @@ func newUniversalMonitorTests(info monitortestframework.MonitorTestInitializatio
monitorTestRegistry.AddMonitorTestOrDie("termination-message-policy", "Cluster Version Operator", terminationmessagepolicy.NewAnalyzer())
monitorTestRegistry.AddMonitorTestOrDie("operator-state-analyzer", "Cluster Version Operator", operatorstateanalyzer.NewAnalyzer())
monitorTestRegistry.AddMonitorTestOrDie("required-scc-annotation-checker", "Cluster Version Operator", requiredsccmonitortests.NewAnalyzer())
monitorTestRegistry.AddMonitorTestOrDie("cluster-version-checker", "Cluster Version Operator", clusterversionchecker.NewClusterVersionChecker())

monitorTestRegistry.AddMonitorTestOrDie("etcd-log-analyzer", "etcd", etcdloganalyzer.NewEtcdLogAnalyzer())
monitorTestRegistry.AddMonitorTestOrDie("legacy-etcd-invariants", "etcd", legacyetcdmonitortests.NewLegacyTests())
Expand Down
17 changes: 17 additions & 0 deletions pkg/monitor/monitorapi/identification_operator.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package monitorapi

import (
"fmt"

configv1 "github.com/openshift/api/config/v1"
)

Expand All @@ -25,3 +27,18 @@ func GetOperatorConditionStatus(interval Interval) *configv1.ClusterOperatorStat
condition.Message = interval.Message.HumanMessage
return condition
}

// GetOperatorConditionHumanMessage constructs a human-readable message from a given ClusterOperatorStatusCondition with a given prefix
func GetOperatorConditionHumanMessage(s *configv1.ClusterOperatorStatusCondition, prefix string) string {
if s == nil {
return ""
}
switch {
case len(s.Reason) > 0 && len(s.Message) > 0:
return fmt.Sprintf("%s%s=%s: %s: %s", prefix, s.Type, s.Status, s.Reason, s.Message)
case len(s.Message) > 0:
return fmt.Sprintf("%s%s=%s: %s", prefix, s.Type, s.Status, s.Message)
default:
return fmt.Sprintf("%s%s=%s", prefix, s.Type, s.Status)
}
}
6 changes: 2 additions & 4 deletions pkg/monitortests/cli/adm_upgrade/status/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,23 +225,21 @@ func (w *monitor) noFailures() *junitapi.JUnitTestCase {
}

var failures []string
var total int
for _, snap := range w.ocAdmUpgradeStatus {
total++
if snap.err != nil {
failures = append(failures, fmt.Sprintf("- %s: %v", snap.when.Format(time.RFC3339), snap.err))
}
}

if total == 0 {
if len(w.ocAdmUpgradeStatus) == 0 {
noFailures.SkipMessage = &junitapi.SkipMessage{
Message: "Test skipped because no oc adm upgrade status output was collected",
}
return noFailures
}

// Zero failures is too strict for at least SNO clusters
p := (float32(len(failures)) / float32(total)) * 100
p := (float32(len(failures)) / float32(len(w.ocAdmUpgradeStatus))) * 100
if (!w.isSNO && p > 0) || (w.isSNO && p > 10) {
noFailures.FailureOutput = &junitapi.FailureOutput{
Message: fmt.Sprintf("oc adm upgrade status failed %d times (of %d)", len(failures), len(w.ocAdmUpgradeStatus)),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
package clusterversionchecker

import (
"context"
"encoding/json"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"time"

configv1 "github.com/openshift/api/config/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/openshift/origin/pkg/monitor/monitorapi"
"github.com/openshift/origin/pkg/monitortestframework"
"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
exutil "github.com/openshift/origin/test/extended/util"
)

type monitor struct {
notSupportedReason error
summary map[string]int
}

func NewClusterVersionChecker() monitortestframework.MonitorTest {
return &monitor{summary: map[string]int{}}
}

func (w *monitor) PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
kubeClient, err := kubernetes.NewForConfig(adminRESTConfig)
if err != nil {
return err
}
isMicroShift, err := exutil.IsMicroShiftCluster(kubeClient)
if err != nil {
return fmt.Errorf("unable to determine if cluster is MicroShift: %v", err)
}
if isMicroShift {
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: "platform MicroShift not supported"}
return w.notSupportedReason
}

nodes, err := kubeClient.CoreV1().Nodes().List(ctx, metav1.ListOptions{LabelSelector: "node-role.kubernetes.io/worker"})
if err != nil {
return fmt.Errorf("unable to list nodes: %v", err)
}

if s := len(nodes.Items); s > 250 {
w.notSupportedReason = &monitortestframework.NotSupportedError{Reason: fmt.Sprintf("cluster with %d worker nodes (over 250) not supported", s)}
return w.notSupportedReason
}

return nil
}

func (w *monitor) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error {
return w.notSupportedReason
}

func (w *monitor) CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
return nil, nil, w.notSupportedReason
}

func (w *monitor) ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error) {
return nil, w.notSupportedReason
}

func (w *monitor) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
if w.notSupportedReason != nil {
return nil, w.notSupportedReason
}
return w.noFailingUnknownCondition(finalIntervals), nil
}

func (w *monitor) WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResourceState monitorapi.ResourcesMap) error {
outputFile := filepath.Join(storageDir, fmt.Sprintf("cluster-version-checker%s.json", timeSuffix))
data, err := json.Marshal(w.summary)
if err != nil {
return fmt.Errorf("unable to marshal summary: %w", err)
}
if err := os.WriteFile(outputFile, data, 0644); err != nil {
return fmt.Errorf("unable to write summary to %q: %w", outputFile, err)
}
return nil
}

func (*monitor) Cleanup(ctx context.Context) error {
return nil
}

func (w *monitor) noFailingUnknownCondition(intervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
var start, stop time.Time
for _, event := range intervals {
if start.IsZero() || event.From.Before(start) {
start = event.From
}
if stop.IsZero() || event.To.After(stop) {
stop = event.To
}
}
duration := stop.Sub(start).Seconds()

noFailures := &junitapi.JUnitTestCase{
Name: "[bz-Cluster Version Operator] cluster version should not report Failing=Unknown during a normal cluster upgrade",
Duration: duration,
}

var failures []string
violations := sets.New[string]()

for _, interval := range intervals {
if interval.Locator.Type != monitorapi.LocatorTypeClusterVersion {
continue
}
cvName, ok := interval.Locator.Keys[monitorapi.LocatorClusterVersionKey]
if !ok || cvName != "version" {
continue
}

c := monitorapi.GetOperatorConditionStatus(interval)
if c == nil {
continue
}
key := fmt.Sprintf("%s-%s-%s", string(c.Type), string(c.Status), c.Reason)
if _, ok := w.summary[key]; ok {
w.summary[key]++
} else {
w.summary[key] = 1
}
// https://github.com/openshift/cluster-version-operator/blob/28a376a13ad1daec926f6174ac37ada2bd32c071/pkg/cvo/status.go#L332-L333
if c.Type == "Failing" && c.Status == configv1.ConditionUnknown && c.Reason == "SlowClusterOperator" {
// This is too hacky, but we do not have API to expose the CO names that took long to upgrade
coNames, err := parseClusterOperatorNames(c.Message)
if err != nil {
failures = append(failures, fmt.Sprintf("failed to parse cluster operator names from message %q: %v", c.Message, err))
continue
}
violations = violations.Union(coNames)
}
}

if len(failures) > 0 {
noFailures.FailureOutput = &junitapi.FailureOutput{
Message: fmt.Sprintf("Checking cluster version failed %d times (of %d intervals) from %s to %s", len(failures), len(intervals), start.Format(time.RFC3339), stop.Format(time.RFC3339)),
Output: strings.Join(failures, "\n"),
}
} else {
noFailures.SystemOut = fmt.Sprintf("Checking cluster version succussfully checked %d intervals from %s to %s", len(intervals), start.Format(time.RFC3339), stop.Format(time.RFC3339))
}

ret := []*junitapi.JUnitTestCase{noFailures}

for _, coName := range sets.List(violations) {
bzComponent := platformidentification.GetBugzillaComponentForOperator(coName)
if bzComponent == "Unknown" {
bzComponent = coName
}

m := 30
if coName == "machine-config" {
m = 3 * m
}
output := fmt.Sprintf("Cluster Operator %s has not completed version change after %d minutes", coName, m)
ret = append(ret, &junitapi.JUnitTestCase{
Name: fmt.Sprintf("[bz-%v] clusteroperator/%v must complete version change under %d minutes", bzComponent, coName, m),
Duration: duration,
FailureOutput: &junitapi.FailureOutput{
Output: output,
Message: output,
},
})
}

return ret
}

var (
// we have to modify the keyword here accordingly if CVO changes the message
regWaitingLong = regexp.MustCompile(`waiting on.*which is longer than expected`)
regWaitingMCOOver90m = regexp.MustCompile(`machine-config over 90 minutes`)
regWaitingCOOver30m = regexp.MustCompile(`.*waiting on (.+) over 30 minutes`)
)

func parseClusterOperatorNames(message string) (sets.Set[string], error) {
if !regWaitingLong.MatchString(message) {
return nil, fmt.Errorf("failed to parse cluster operator names from %q", message)
}
ret := sets.Set[string]{}
if regWaitingMCOOver90m.MatchString(message) {
ret.Insert("machine-config")
}
matches := regWaitingCOOver30m.FindStringSubmatch(message)
if len(matches) > 1 {
coNames := strings.Split(matches[1], ",")
for _, coName := range coNames {
ret.Insert(strings.TrimSpace(coName))
}
}
if len(ret) == 0 {
return nil, fmt.Errorf("failed to parse cluster operator names from %q", message)
}
return ret, nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package clusterversionchecker

import (
"fmt"
"testing"

"github.com/google/go-cmp/cmp"
configv1 "github.com/openshift/api/config/v1"
"k8s.io/apimachinery/pkg/util/sets"

"github.com/openshift/origin/pkg/monitor/monitorapi"
)

func Test_parseClusterOperatorNames(t *testing.T) {
t.Parallel()

testCases := []struct {
name string
reason string
message string
expected sets.Set[string]
expectedErr error
}{
{
name: "unexpected",
reason: "reason",
message: "unexpected",
expectedErr: fmt.Errorf("failed to parse cluster operator names from %q", "changed to Some=Unknown: reason: unexpected"),
},
{
name: "legit waiting on",
message: "Working towards 1.2.3: waiting on co-not-timeout",
expectedErr: fmt.Errorf("failed to parse cluster operator names from %q", "changed to Some=Unknown: Working towards 1.2.3: waiting on co-not-timeout"),
},
{
name: "one CO timeout",
reason: "SlowClusterOperator",
message: "waiting on co-timeout over 30 minutes which is longer than expected",
expected: sets.New[string]("co-timeout"),
},
{
name: "mco timeout",
reason: "SlowClusterOperator",
message: "waiting on machine-config over 90 minutes which is longer than expected",
expected: sets.New[string]("machine-config"),
},
{
name: "mco timeout",
reason: "SlowClusterOperator",
message: "waiting on machine-config over 90 minutes which is longer than expected",
expected: sets.New[string]("machine-config"),
},
{
name: "two COs timeout",
reason: "SlowClusterOperator",
message: "waiting on co-timeout, co-bar-timeout over 30 minutes which is longer than expected",
expected: sets.New[string]("co-timeout", "co-bar-timeout"),
},
{
name: "one CO and mco timeout",
reason: "SlowClusterOperator",
message: "waiting on co-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
expected: sets.New[string]("machine-config", "co-timeout"),
},
{
name: "three COs timeout",
reason: "SlowClusterOperator",
message: "waiting on co-timeout, co-bar-timeout over 30 minutes and machine-config over 90 minutes which is longer than expected",
expected: sets.New[string]("machine-config", "co-timeout", "co-bar-timeout"),
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
t.Parallel()
msg := monitorapi.GetOperatorConditionHumanMessage(&configv1.ClusterOperatorStatusCondition{
Type: "Some",
Status: configv1.ConditionUnknown,
Message: tc.message,
Reason: tc.reason,
}, "changed to ")
actual, actuallErr := parseClusterOperatorNames(msg)
if diff := cmp.Diff(tc.expectedErr, actuallErr, cmp.FilterValues(func(x, y interface{}) bool {
_, ok1 := x.(error)
_, ok2 := y.(error)
return ok1 && ok2
}, cmp.Comparer(func(x, y interface{}) bool {
xe := x.(error)
ye := y.(error)
if xe == nil || ye == nil {
return xe == nil && ye == nil
}
return xe.Error() == ye.Error()
}))); diff != "" {
t.Errorf("unexpected error (-want +got):\n%s", diff)
}

if actuallErr == nil {
if diff := cmp.Diff(tc.expected, actual); diff != "" {
t.Errorf("unexpected result (-want +got):\n%s", diff)
}
}
})
}
}
Loading