diff --git a/.openshift-tests-extension/openshift_payload_cluster-version-operator.json b/.openshift-tests-extension/openshift_payload_cluster-version-operator.json index e9be44724..c2a80fe72 100644 --- a/.openshift-tests-extension/openshift_payload_cluster-version-operator.json +++ b/.openshift-tests-extension/openshift_payload_cluster-version-operator.json @@ -1,4 +1,17 @@ [ + { + "name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator should work with risks from alerts", + "labels": { + "OTA-1813": {}, + "Serial": {} + }, + "resources": { + "isolation": {} + }, + "source": "openshift:payload:cluster-version-operator", + "lifecycle": "blocking", + "environmentSelector": {} + }, { "name": "[Jira:\"Cluster Version Operator\"] cluster-version-operator should work with accept risks", "labels": { diff --git a/go.mod b/go.mod index ec68b7efb..fa61e5c00 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/operator-framework/api v0.17.1 github.com/operator-framework/operator-lifecycle-manager v0.22.0 github.com/pkg/errors v0.9.1 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.0 github.com/prometheus-operator/prometheus-operator/pkg/client v0.86.0 github.com/prometheus/client_golang v1.22.0 github.com/prometheus/client_model v0.6.1 @@ -70,7 +71,6 @@ require ( github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.86.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/robfig/cron v1.2.0 // indirect github.com/sirupsen/logrus v1.9.3 // indirect diff --git a/pkg/clusterconditions/promql/alerts.go b/pkg/clusterconditions/promql/alerts.go new file mode 100644 index 000000000..672f593b7 --- /dev/null +++ 
b/pkg/clusterconditions/promql/alerts.go @@ -0,0 +1,95 @@ +package promql + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/prometheus/client_golang/api" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/config" + + "k8s.io/klog/v2" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" +) + +type Getter interface { + Get(ctx context.Context) (prometheusv1.AlertsResult, error) +} + +func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter { + p := NewPromQL(promQLTarget) + condition := p.Condition + v, ok := condition.(*PromQL) + if !ok { + panic("invalid condition type") + } + return &ocAlertGetter{promQL: v, expiration: 1 * time.Minute} +} + +type ocAlertGetter struct { + promQL *PromQL + + mutex sync.Mutex + cached prometheusv1.AlertsResult + expiration time.Duration + lastRefresh time.Time +} + +func (o *ocAlertGetter) Get(ctx context.Context) (prometheusv1.AlertsResult, error) { + if time.Now().After(o.lastRefresh.Add(o.expiration)) { + if err := o.refresh(ctx); err != nil { + klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err) + } + } + return o.cached, nil +} + +func (o *ocAlertGetter) refresh(ctx context.Context) error { + o.mutex.Lock() + defer o.mutex.Unlock() + + klog.Info("refresh alerts ...") + p := o.promQL + host, err := p.Host(ctx) + if err != nil { + return fmt.Errorf("failure determine thanos IP: %w", err) + } + p.url.Host = host + clientConfig := api.Config{Address: p.url.String()} + + if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil { + clientConfig.RoundTripper = roundTripper + } else { + return fmt.Errorf("creating PromQL round-tripper: %w", err) + } + + promqlClient, err := api.NewClient(clientConfig) + if err != nil { + return fmt.Errorf("creating PromQL client: %w", err) + } + + client := &statusCodeNotImplementedForPostClient{ + client: promqlClient, + } 
+ + v1api := prometheusv1.NewAPI(client) + + queryContext := ctx + if p.QueryTimeout > 0 { + var cancel context.CancelFunc + queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout) + defer cancel() + } + + r, err := v1api.Alerts(queryContext) + if err != nil { + return fmt.Errorf("failed to get alerts: %w", err) + } + o.cached = r + o.lastRefresh = time.Now() + klog.Infof("refreshed: %d alerts", len(o.cached.Alerts)) + return nil +} diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index b9155c64e..0e95b7f50 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -15,6 +15,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" @@ -145,6 +146,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks optrAvailableUpdates.AcceptRisks = acceptRisks optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry + optrAvailableUpdates.AlertGetter = optr.AlertGetter optrAvailableUpdates.Condition = condition responseFailed := (condition.Type == configv1.RetrievedUpdates && @@ -159,6 +161,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 } } + if optrAvailableUpdates.ShouldReconcileAcceptRisks() { + if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil { + klog.Errorf("Failed to evaluate alert conditions: %v", err) + } + } optrAvailableUpdates.evaluateConditionalUpdates(ctx) queueKey := optr.queueKey() @@ -212,6 +219,11 @@ type availableUpdates struct { // RiskConditions stores the condition for every risk (name, url, message, matchingRules). 
RiskConditions map[string][]metav1.Condition + + AlertGetter AlertGetter + + // AlertRisks stores the condition for every alert that might have impact on cluster update + AlertRisks []configv1.ConditionalUpdateRisk } func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool { @@ -317,6 +329,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks, AcceptRisks: optr.availableUpdates.AcceptRisks, RiskConditions: optr.availableUpdates.RiskConditions, + AlertRisks: optr.availableUpdates.AlertRisks, + AlertGetter: optr.availableUpdates.AlertGetter, LastAttempt: optr.availableUpdates.LastAttempt, LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange, Current: *optr.availableUpdates.Current.DeepCopy(), @@ -509,6 +523,170 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran } } +type AlertGetter interface { + Get(ctx context.Context) (prometheusv1.AlertsResult, error) +} + +func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error { + if u == nil || u.AlertGetter == nil { + if u != nil { u.AlertRisks = nil } + return nil + } + alertsResult, err := u.AlertGetter.Get(ctx) + if err != nil { + return fmt.Errorf("failed to get alerts: %w", err) + } + + u.AlertRisks = alertsToRisks(alertsResult.Alerts) + return nil +} + +func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { + klog.V(2).Infof("Found %d alerts", len(alerts)) + risks := map[string]configv1.ConditionalUpdateRisk{} + for _, alert := range alerts { + var alertName string + if alertName = string(alert.Labels["alertname"]); alertName == "" { + continue + } + if alert.State == "pending" { + continue + } + + var summary string + if summary = string(alert.Annotations["summary"]); summary == "" { + summary = alertName + } + if !strings.HasSuffix(summary, ".") { + summary += "."
+ } + + var description string + alertMessage := string(alert.Annotations["message"]) + alertDescription := string(alert.Annotations["description"]) + switch { + case alertMessage != "" && alertDescription != "": + description += " The alert description is: " + alertDescription + " | " + alertMessage + case alertDescription != "": + description += " The alert description is: " + alertDescription + case alertMessage != "": + description += " The alert description is: " + alertMessage + default: + description += " The alert has no description." + } + + var runbook string + if runbook = string(alert.Annotations["runbook"]); runbook == "" { + runbook = "" + } + + details := fmt.Sprintf("%s%s %s", summary, description, runbook) + + // TODO (hongkliu): + alertURL := "https://console.com/monitoring/alertrules/id" + alertPromQL := "todo-expression" + + severity := string(alert.Labels["severity"]) + if severity == "critical" { + if alertName == internal.AlertNamePodDisruptionBudgetLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + } + risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. 
%s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if internal.HavePullWaiting.Has(alertName) || + internal.HaveNodes.Has(alertName) || + alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || + alertName == internal.AlertNameVMCannotBeEvicted { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) + if updatePrecheck == "true" { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + } + + klog.V(2).Infof("Got %d risks", len(risks)) + if len(risks) == 0 { + return nil + } + + var ret []configv1.ConditionalUpdateRisk + var keys []string + for k := range risks { + keys = 
append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + ret = append(ret, risks[k]) + } + return ret +} + +func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk { + risk, ok := risks[riskName] + if !ok { + return configv1.ConditionalUpdateRisk{ + Name: riskName, + Message: message, + URL: url, + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: promQL, + }, + }, + }, + Conditions: []metav1.Condition{condition}, + } + } + + if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { + c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) + if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { + c.LastTransitionTime = condition.LastTransitionTime + } + meta.SetStatusCondition(&risk.Conditions, *c) + } + + return risk +} + func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { if u == nil { return @@ -518,7 +696,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { risks := risksInOrder(riskVersions) u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry) - if err := sanityCheck(u.ConditionalUpdates); err != nil { + if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil { klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err) } for i, conditionalUpdate := range u.ConditionalUpdates { @@ -536,7 +714,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { } } -func sanityCheck(updates []configv1.ConditionalUpdate) error { +func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error { risks := map[string]configv1.ConditionalUpdateRisk{} var errs []error for _, update := range updates { @@ -552,6 +730,11 @@ func sanityCheck(updates 
[]configv1.ConditionalUpdate) error { } } } + for _, alertRisk := range alertRisks { + if _, ok := risks[alertRisk.Name]; ok { + errs = append(errs, fmt.Errorf("found alert risk and conditional update risk share the name: %s", alertRisk.Name)) + } + } return utilerrors.NewAggregate(errs) } @@ -596,6 +779,7 @@ const ( recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted" recommendedReasonEvaluationFailed = "EvaluationFailed" recommendedReasonMultiple = "MultipleReasons" + //recommendedReasonAlertFiring = "AlertFiring" // recommendedReasonExposed is used instead of the original name if it does // not match the pattern for a valid k8s condition reason. @@ -613,10 +797,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ func newRecommendedReason(now, want string) string { switch { case now == recommendedReasonRisksNotExposed || - now == recommendedReasonAllExposedRisksAccepted || + now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed || + now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed || now == want: return want - case want == recommendedReasonRisksNotExposed: + case want == recommendedReasonRisksNotExposed || + (now == recommendedReasonEvaluationFailed) && want == recommendedReasonAllExposedRisksAccepted || + now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed): return now default: return recommendedReasonMultiple @@ -673,7 +860,6 @@ func evaluateConditionalUpdate( if len(errorMessages) > 0 { recommended.Message = strings.Join(errorMessages, "\n\n") } - return recommended } diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 525e9e136..cb11edaa1 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -2,11 +2,14 @@ package cvo import ( "context" + "encoding/json" "errors" "fmt" "net/http" "net/http/httptest" 
"net/url" + "os" + "path/filepath" "runtime" "testing" "time" @@ -14,6 +17,8 @@ import ( "github.com/blang/semver/v4" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" @@ -216,6 +221,7 @@ var availableUpdatesCmpOpts = []cmp.Option{ cmpopts.IgnoreTypes(time.Time{}), cmpopts.IgnoreInterfaces(struct { clusterconditions.ConditionRegistry + AlertGetter }{}), } @@ -780,9 +786,10 @@ func TestSyncAvailableUpdatesDesiredUpdate(t *testing.T) { func Test_sanityCheck(t *testing.T) { tests := []struct { - name string - updates []configv1.ConditionalUpdate - expected error + name string + updates []configv1.ConditionalUpdate + alertRisks []configv1.ConditionalUpdateRisk + expected error }{ { name: "good", @@ -790,6 +797,7 @@ func Test_sanityCheck(t *testing.T) { {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskB"}}}, }, + alertRisks: []configv1.ConditionalUpdateRisk{{Name: "SomeAlert"}}, }, { name: "invalid risk name", @@ -813,10 +821,19 @@ func Test_sanityCheck(t *testing.T) { }, expected: utilerrors.NewAggregate([]error{fmt.Errorf("found collision on risk riskA: {[] riskA []} and {[] some riskA []}")}), }, + { + name: "alert risk and conditional update risk conflict", + updates: []configv1.ConditionalUpdate{ + {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, + {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskB"}}}, + }, + alertRisks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}, + expected: utilerrors.NewAggregate([]error{fmt.Errorf("found alert risk and conditional update risk share the name: riskA")}), + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - actual := sanityCheck(tt.updates) + actual := sanityCheck(tt.updates, tt.alertRisks) if diff := cmp.Diff(tt.expected, 
actual, cmp.Comparer(func(x, y error) bool { if x == nil || y == nil { return x == nil && y == nil @@ -950,3 +967,334 @@ func Test_loadRiskConditions(t *testing.T) { }) } } + +type mockAlertGetter struct { + ret prometheusv1.AlertsResult + jsonFile string +} + +func (m *mockAlertGetter) Get(_ context.Context) (prometheusv1.AlertsResult, error) { + var ret prometheusv1.AlertsResult + if m.jsonFile != "" { + data, err := os.ReadFile(m.jsonFile) + if err != nil { + return ret, err + } + err = json.Unmarshal(data, &ret) + if err != nil { + return ret, err + } + return ret, nil + } + return m.ret, nil +} + +func Test_evaluateAlertConditions(t *testing.T) { + t1 := time.Now() + t2 := time.Now().Add(-3 * time.Minute) + t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } + tests := []struct { + name string + u *availableUpdates + expected error + expectedAlertRisks []configv1.ConditionalUpdateRisk + }{ + { + name: "basic case", + u: &availableUpdates{ + AlertGetter: &mockAlertGetter{ + ret: prometheusv1.AlertsResult{ + Alerts: []prometheusv1.Alert{ + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), + model.LabelName("severity"): model.LabelValue("critical"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("not-important"), + }, + 
State: prometheusv1.AlertStatePending, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t2, + }, + }, + }, + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "https://console.com/monitoring/alertrules/id", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: "todo-expression", + }, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert 
PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t2), + }}, + }, + { + Name: "PodDisruptionBudgetLimit", + Message: "summary.", + URL: "https://console.com/monitoring/alertrules/id", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: "todo-expression", + }, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + }, + { + name: "from file", + u: &availableUpdates{ + AlertGetter: &mockAlertGetter{ + jsonFile: filepath.Join("testdata", "alerts.json"), + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://console.com/monitoring/alertrules/id", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: "todo-expression", + }, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. 
", + LastTransitionTime: metav1.NewTime(t3), + }}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := tt.u.evaluateAlertRisks(context.TODO()) + if diff := cmp.Diff(tt.expected, actual, cmp.Comparer(func(x, y error) bool { + if x == nil || y == nil { + return x == nil && y == nil + } + return x.Error() == y.Error() + })); diff != "" { + t.Errorf("evaluateAlertConditions() mismatch (-want +got):\n%s", diff) + } + + if actual == nil { + if diff := cmp.Diff(tt.expectedAlertRisks, tt.u.AlertRisks); diff != "" { + t.Errorf("AlertRisks mismatch (-want +got):\n%s", diff) + } + } + }) + } +} + +func Test_newRecommendedReason(t *testing.T) { + tests := []struct { + name string + now string + want string + expected string + }{ + { + name: "recommendedReasonRisksNotExposed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonAllExposedRisksAccepted, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonEvaluationFailed", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonMultiple", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonRisksNotExposed to recommendedReasonExposed", + now: recommendedReasonRisksNotExposed, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonRisksNotExposed", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonAllExposedRisksAccepted, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonEvaluationFailed", + now: 
recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonMultiple", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonAllExposedRisksAccepted to recommendedReasonExposed", + now: recommendedReasonAllExposedRisksAccepted, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonRisksNotExposed", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonEvaluationFailed, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonMultiple", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonEvaluationFailed to recommendedReasonExposed", + now: recommendedReasonEvaluationFailed, + want: recommendedReasonExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonMultiple to recommendedReasonRisksNotExposed", + now: recommendedReasonMultiple, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonMultiple, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonEvaluationFailed", + now: recommendedReasonMultiple, + want: recommendedReasonEvaluationFailed, + expected: 
recommendedReasonMultiple, + }, + { + name: "recommendedReasonMultiple to recommendedReasonExposed", + now: recommendedReasonMultiple, + want: recommendedReasonExposed, + expected: recommendedReasonMultiple, + }, + { + name: "recommendedReasonExposed to recommendedReasonRisksNotExposed", + now: recommendedReasonExposed, + want: recommendedReasonRisksNotExposed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonAllExposedRisksAccepted", + now: recommendedReasonExposed, + want: recommendedReasonAllExposedRisksAccepted, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonEvaluationFailed", + now: recommendedReasonExposed, + want: recommendedReasonEvaluationFailed, + expected: recommendedReasonExposed, + }, + { + name: "recommendedReasonExposed to recommendedReasonMultiple", + now: recommendedReasonExposed, + want: recommendedReasonMultiple, + expected: recommendedReasonMultiple, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + actual := newRecommendedReason(tt.now, tt.want) + if diff := cmp.Diff(tt.expected, actual); diff != "" { + t.Errorf("newRecommendedReason mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 1a2afaf21..9866fe80c 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -43,6 +43,7 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcebuilder" "github.com/openshift/cluster-version-operator/lib/validation" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" "github.com/openshift/cluster-version-operator/pkg/customsignaturestore" "github.com/openshift/cluster-version-operator/pkg/cvo/configuration" @@ -207,6 +208,8 @@ type Operator struct { // configuration, if enabled, reconciles the 
ClusterVersionOperator configuration. configuration *configuration.ClusterVersionOperatorConfiguration + + AlertGetter AlertGetter } // New returns a new cluster version operator. @@ -267,6 +270,7 @@ func New( exclude: exclude, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), + AlertGetter: promql.NewAlertGetter(promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, requiredFeatureSet: featureSet, diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index dda43f0aa..4ed520e15 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -2742,6 +2742,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) { optr.cvLister = &clientCVLister{client: optr.client} optr.cmConfigManagedLister = &cmConfigLister{} optr.eventRecorder = record.NewFakeRecorder(100) + optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version") var updateServiceURI string if tt.handler != nil { diff --git a/pkg/cvo/status.go b/pkg/cvo/status.go index afa8f9509..fc9d1834c 100644 --- a/pkg/cvo/status.go +++ b/pkg/cvo/status.go @@ -177,7 +177,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 original = config.DeepCopy() } - updateClusterVersionStatus(&config.Status, status, optr.release, optr.getAvailableUpdates, optr.enabledCVOFeatureGates, validationErrs, optr.shouldReconcileAcceptRisks) + updateClusterVersionStatus(ctx, &config.Status, status, optr.release, optr.getAvailableUpdates, optr.enabledCVOFeatureGates, validationErrs, optr.shouldReconcileAcceptRisks) if klog.V(6).Enabled() { klog.Infof("Apply config: %s", cmp.Diff(original, config)) @@ -189,6 +189,7 @@ func (optr *Operator) syncStatus(ctx context.Context, original, config *configv1 // updateClusterVersionStatus updates the passed cvStatus with the latest status information func updateClusterVersionStatus( + ctx context.Context, cvStatus *configv1.ClusterVersionStatus, status *SyncWorkerStatus, release configv1.Release, @@ 
-227,8 +228,18 @@ func updateClusterVersionStatus( var riskNamesForDesiredImage []string if shouldReconcileAcceptRisks() { - cvStatus.ConditionalUpdates, riskNamesForDesiredImage = conditionalUpdateWithRiskNamesAndRiskConditions(cvStatus.ConditionalUpdates, getAvailableUpdates, desired.Image) - cvStatus.ConditionalUpdateRisks = conditionalUpdateRisks(cvStatus.ConditionalUpdates) + updates := getAvailableUpdates() + var riskConditions map[string][]metav1.Condition + var alertRisks []configv1.ConditionalUpdateRisk + if updates != nil { + riskConditions = updates.RiskConditions + if err := updates.evaluateAlertRisks(ctx); err != nil { + klog.Errorf("Failed to evaluate alert conditions: %v", err) + } + alertRisks = updates.AlertRisks + } + cvStatus.ConditionalUpdates, riskNamesForDesiredImage = conditionalUpdateWithRiskNamesAndRiskConditions(cvStatus.ConditionalUpdates, riskConditions, desired.Image) + cvStatus.ConditionalUpdateRisks = conditionalUpdateRisks(cvStatus.ConditionalUpdates, alertRisks) } risksMsg := "" @@ -422,14 +433,9 @@ func updateClusterVersionStatus( } } -func conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates []configv1.ConditionalUpdate, getAvailableUpdates func() *availableUpdates, desiredImage string) ([]configv1.ConditionalUpdate, []string) { +func conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates []configv1.ConditionalUpdate, riskConditions map[string][]metav1.Condition, desiredImage string) ([]configv1.ConditionalUpdate, []string) { var result []configv1.ConditionalUpdate var riskNamesForDesiredImage []string - var riskConditions map[string][]metav1.Condition - updates := getAvailableUpdates() - if updates != nil { - riskConditions = updates.RiskConditions - } for _, conditionalUpdate := range conditionalUpdates { riskNames := sets.New[string]() var risks []configv1.ConditionalUpdateRisk @@ -474,7 +480,8 @@ func conditionalUpdateWithRiskNamesAndRiskConditions(conditionalUpdates []config return result,
riskNamesForDesiredImage } -func conditionalUpdateRisks(conditionalUpdates []configv1.ConditionalUpdate) []configv1.ConditionalUpdateRisk { +func conditionalUpdateRisks(conditionalUpdates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) []configv1.ConditionalUpdateRisk { + klog.V(2).Infof("Got %d alert risks", len(alertRisks)) var result []configv1.ConditionalUpdateRisk riskNames := sets.New[string]() for _, conditionalUpdate := range conditionalUpdates { @@ -486,9 +493,11 @@ func conditionalUpdateRisks(conditionalUpdates []configv1.ConditionalUpdate) []c result = append(result, risk) } } + result = append(result, alertRisks...) sort.Slice(result, func(i, j int) bool { return result[i].Name < result[j].Name }) + klog.V(2).Infof("Got %d conditional update risks", len(result)) return result } diff --git a/pkg/cvo/status_test.go b/pkg/cvo/status_test.go index 1c7aa1983..518b9b5e2 100644 --- a/pkg/cvo/status_test.go +++ b/pkg/cvo/status_test.go @@ -752,7 +752,7 @@ func TestUpdateClusterVersionStatus_FilteringMultipleErrorsForFailingCondition(t if tc.shouldModifyWhenNotReconcilingAndHistoryNotEmpty && !c.isReconciling && !c.isHistoryEmpty { expectedCondition = tc.expectedConditionModified } - updateClusterVersionStatus(cvStatus, tc.args.syncWorkerStatus, release, getAvailableUpdates, gates, noErrors, func() bool { + updateClusterVersionStatus(context.TODO(), cvStatus, tc.args.syncWorkerStatus, release, getAvailableUpdates, gates, noErrors, func() bool { return false }) condition := resourcemerge.FindOperatorStatusCondition(cvStatus.Conditions, internal.ClusterStatusFailing) @@ -1078,10 +1078,11 @@ func Test_conditionalUpdateWithRiskNamesAndRiskConditions(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - getAvailableUpdates := func() *availableUpdates { - return tt.availableUpdates + var riskConditions map[string][]metav1.Condition + if tt.availableUpdates != nil { + riskConditions = 
tt.availableUpdates.RiskConditions } - actual, actualNames := conditionalUpdateWithRiskNamesAndRiskConditions(tt.conditionalUpdates, getAvailableUpdates, tt.desiredImage) + actual, actualNames := conditionalUpdateWithRiskNamesAndRiskConditions(tt.conditionalUpdates, riskConditions, tt.desiredImage) if difference := cmp.Diff(tt.expected, actual, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")); difference != "" { t.Errorf("conditional updates differ from expected:\n%s", difference) } @@ -1093,9 +1094,11 @@ func Test_conditionalUpdateWithRiskNamesAndRiskConditions(t *testing.T) { } func Test_conditionalUpdateRisks(t *testing.T) { + t1 := time.Now() tests := []struct { name string conditionalUpdates []configv1.ConditionalUpdate + alertRisks []configv1.ConditionalUpdateRisk expected []configv1.ConditionalUpdateRisk }{ { @@ -1129,27 +1132,70 @@ func Test_conditionalUpdateRisks(t *testing.T) { }, }}}, }}, - expected: []configv1.ConditionalUpdateRisk{{Name: "Risk1", - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: metav1.ConditionTrue, - }, - }}, {Name: "Risk2", - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: metav1.ConditionFalse, + alertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "https://console.com/monitoring/alertrules/id", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: "todo-expression", + }, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. 
The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t1), + }}, }, - }}, {Name: "Risk3", - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: metav1.ConditionTrue, + }, + expected: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "https://console.com/monitoring/alertrules/id", + MatchingRules: []configv1.ClusterCondition{ + { + Type: "PromQL", + PromQL: &configv1.PromQLClusterCondition{ + PromQL: "todo-expression", + }, + }, + }, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. 
The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t1), + }}, }, - }}}, + {Name: "Risk1", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionTrue, + }, + }}, {Name: "Risk2", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionFalse, + }, + }}, {Name: "Risk3", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionTrue, + }, + }}}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - actual := conditionalUpdateRisks(tt.conditionalUpdates) + actual := conditionalUpdateRisks(tt.conditionalUpdates, tt.alertRisks) if difference := cmp.Diff(tt.expected, actual, cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime")); difference != "" { t.Errorf("actual differ from expected:\n%s", difference) } diff --git a/pkg/cvo/testdata/alerts.json b/pkg/cvo/testdata/alerts.json new file mode 100644 index 000000000..236878ae2 --- /dev/null +++ b/pkg/cvo/testdata/alerts.json @@ -0,0 +1,137 @@ +{ + "alerts": [ + { + "labels": { + "alertname": "ClusterNotUpgradeable", + "condition": "Upgradeable", + "endpoint": "metrics", + "name": "version", + "namespace": "openshift-cluster-version", + "severity": "info" + }, + "annotations": { + "description": "In most cases, you will still be able to apply patch releases. Reason MultipleReasons. For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.ci-ln-rwc3xi2-76ef8.aws-2.ci.openshift.org/settings/cluster/.", + "summary": "One or more cluster operators have been blocking minor or major version cluster updates for at least an hour." 
+ }, + "state": "pending", + "activeAt": "2026-03-04T00:06:31.900150556Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "UpdateAvailable", + "channel": "simple", + "namespace": "openshift-cluster-version", + "severity": "info", + "upstream": "https://fauxinnati-fauxinnati.apps.ota-stage.q2z4.p1.openshiftapps.com/api/upgrades_info/graph" + }, + "annotations": { + "description": "For more information refer to 'oc adm upgrade' or https://console-openshift-console.apps.ci-ln-rwc3xi2-76ef8.aws-2.ci.openshift.org/settings/cluster/.", + "summary": "Your upstream update recommendation service recommends you update your cluster." + }, + "state": "firing", + "activeAt": "2026-03-04T00:36:33.844767047Z", + "value": "2e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "TestAlert", + "container": "cluster-version-operator", + "endpoint": "metrics", + "instance": "10.0.61.171:9099", + "job": "cluster-version-operator", + "namespace": "openshift-cluster-version", + "openShiftUpdatePrecheck": "true", + "pod": "cluster-version-operator-dcb5d56cc-5jc94", + "service": "cluster-version-operator", + "severity": "critical" + }, + "annotations": { + "description": "Test description.", + "summary": "Test summary." + }, + "state": "firing", + "activeAt": "2026-03-04T00:38:19.02109776Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "InsightsRecommendationActive", + "container": "insights-operator", + "description": "Enabling the **TechPreviewNoUpgrade** feature set on your cluster\ncan not be undone and prevents minor version updates. 
Please do\nnot enable this feature set on production clusters.\n", + "endpoint": "https", + "info_link": "https://console.redhat.com/openshift/insights/advisor/clusters/efe476a4-97ad-4c07-bf46-d3da03a5ff6a?first=ccx_rules_ocp.external.rules.upgrade_is_blocked_due_to_tpfg%7CTECH_PREVIEW_NO_UPGRADE_FEATURE_SET_IS_ENABLED", + "instance": "10.129.0.36:8443", + "job": "metrics", + "namespace": "openshift-insights", + "pod": "insights-operator-65f69d8d84-l6f69", + "service": "metrics", + "severity": "info", + "total_risk": "Important" + }, + "annotations": { + "description": "Insights recommendation \"Enabling the **TechPreviewNoUpgrade** feature set on your cluster\ncan not be undone and prevents minor version updates. Please do\nnot enable this feature set on production clusters.\n\" with total risk \"Important\" was detected on the cluster. More information is available at https://console.redhat.com/openshift/insights/advisor/clusters/efe476a4-97ad-4c07-bf46-d3da03a5ff6a?first=ccx_rules_ocp.external.rules.upgrade_is_blocked_due_to_tpfg%7CTECH_PREVIEW_NO_UPGRADE_FEATURE_SET_IS_ENABLED.", + "summary": "An Insights recommendation is active for this cluster." + }, + "state": "firing", + "activeAt": "2026-03-04T00:07:08.820367488Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "TechPreviewNoUpgrade", + "container": "kube-apiserver-operator", + "endpoint": "https", + "instance": "10.129.0.11:8443", + "job": "kube-apiserver-operator", + "name": "TechPreviewNoUpgrade", + "namespace": "openshift-kube-apiserver-operator", + "pod": "kube-apiserver-operator-75c85dd77-zg8fc", + "service": "metrics", + "severity": "warning" + }, + "annotations": { + "description": "Cluster has enabled Technology Preview features that cannot be undone and will prevent upgrades. The TechPreviewNoUpgrade feature set is not recommended on production clusters.", + "summary": "Cluster has enabled tech preview features that will prevent upgrades." 
+ }, + "state": "firing", + "activeAt": "2026-03-04T00:06:25.787986891Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "Watchdog", + "namespace": "openshift-monitoring", + "severity": "none" + }, + "annotations": { + "description": "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty.\n", + "summary": "An alert that should always be firing to certify that Alertmanager is working properly." + }, + "state": "firing", + "activeAt": "2026-03-04T00:06:23.165306226Z", + "value": "1e+00", + "partialResponseStrategy": "WARN" + }, + { + "labels": { + "alertname": "AlertmanagerReceiversNotConfigured", + "namespace": "openshift-monitoring", + "severity": "warning" + }, + "annotations": { + "description": "Alerts are not configured to be sent to a notification system, meaning that you may not be notified in a timely fashion when important failures occur. 
Check the OpenShift documentation to learn how to configure notifications with Alertmanager.", + "summary": "Receivers (notification integrations) are not configured on Alertmanager" + }, + "state": "firing", + "activeAt": "2026-03-04T00:06:46.778977652Z", + "value": "0e+00", + "partialResponseStrategy": "WARN" + } + ] +} diff --git a/pkg/external/constants.go b/pkg/external/constants.go index 4cbdd95e6..21f343058 100644 --- a/pkg/external/constants.go +++ b/pkg/external/constants.go @@ -1,6 +1,8 @@ package external -import "github.com/openshift/cluster-version-operator/pkg/internal" +import ( + "github.com/openshift/cluster-version-operator/pkg/internal" +) // The constants defined here are used by the components, e.g., e2e tests that have no access // to github.com/openshift/cluster-version-operator/pkg/internal @@ -18,4 +20,13 @@ const ( // ConditionalUpdateConditionTypeRecommended is a type of the condition present on a conditional update // that indicates whether the conditional update is recommended or not ConditionalUpdateConditionTypeRecommended = internal.ConditionalUpdateConditionTypeRecommended + + // ConditionalUpdateRiskConditionTypeApplies is a type of the condition present on a conditional update risk + // that indicates whether the conditional update risk applies to the cluster + ConditionalUpdateRiskConditionTypeApplies = internal.ConditionalUpdateRiskConditionTypeApplies ) + +// IsAlertConditionReason checks if the given reason is legit for a condition of a risk from an alert that applies +func IsAlertConditionReason(reason string) bool { + return internal.IsAlertConditionReason(reason) +} diff --git a/pkg/internal/constants.go b/pkg/internal/constants.go index 3c0dd2823..eac0f36af 100644 --- a/pkg/internal/constants.go +++ b/pkg/internal/constants.go @@ -1,6 +1,10 @@ package internal import ( + "fmt" + "strings" + + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" configv1 "github.com/openshift/api/config/v1" @@ -89,4 +93,41 @@ const ( 
ConditionalUpdateRiskConditionTypeApplies = "Applies" ConditionalUpdateRiskConditionReasonInternalErrorFoundNoRiskCondition = "InternalErrorFoundNoRiskCondition" + + AlertNamePodDisruptionBudgetLimit = "PodDisruptionBudgetLimit" + AlertNamePodDisruptionBudgetAtLimit = "PodDisruptionBudgetAtLimit" + AlertNameKubeContainerWaiting = "KubeContainerWaiting" + + AlertNameKubeletHealthState = "KubeletHealthState" + AlertNameKubeNodeNotReady = "KubeNodeNotReady" + AlertNameKubeNodeReadinessFlapping = "KubeNodeReadinessFlapping" + AlertNameKubeNodeUnreachable = "KubeNodeUnreachable" + + AlertNameVirtHandlerDaemonSetRolloutFailing = "VirtHandlerDaemonSetRolloutFailing" + AlertNameVMCannotBeEvicted = "VMCannotBeEvicted" ) + +var ( + HavePodDisruptionBudget = sets.New[string](AlertNamePodDisruptionBudgetLimit, AlertNamePodDisruptionBudgetAtLimit) + HavePullWaiting = sets.New[string](AlertNameKubeContainerWaiting) + HaveNodes = sets.New[string]( + AlertNameKubeletHealthState, + AlertNameKubeNodeNotReady, + AlertNameKubeNodeReadinessFlapping, + AlertNameKubeNodeUnreachable, + ) +) + +const alertConditionReasonPrefix = "Alert:" + +func AlertConditionReason(state string) string { + return fmt.Sprintf("%s%s", alertConditionReasonPrefix, state) +} + +func IsAlertConditionReason(reason string) bool { + return strings.HasPrefix(reason, alertConditionReasonPrefix) +} + +func AlertConditionMessage(alertName, severity, state, impact, details string) string { + return fmt.Sprintf("%s alert %s %s, %s. 
%s", severity, alertName, state, impact, details) +} diff --git a/pkg/payload/precondition/clusterversion/recommendedupdate.go b/pkg/payload/precondition/clusterversion/recommendedupdate.go index 24958ab6b..dda3a3614 100644 --- a/pkg/payload/precondition/clusterversion/recommendedupdate.go +++ b/pkg/payload/precondition/clusterversion/recommendedupdate.go @@ -3,10 +3,12 @@ package clusterversion import ( "context" "fmt" + "strings" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" configv1 "github.com/openshift/api/config/v1" configv1listers "github.com/openshift/client-go/config/listers/config/v1" @@ -44,9 +46,41 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio NonBlockingWarning: true, } } + // This function should be guarded by shouldReconcileAcceptRisks() + // https://github.com/openshift/cluster-version-operator/blob/f4b9dfa6f6968d117b919089c6b32918f20843c9/pkg/cvo/cvo.go#L1187 + // However, here we do not have a handler for the operator + // Now it is guarded by clusterVersion.Status.ConditionalUpdateRisks which is guarded by the above function + unAcceptRisks := sets.New[string]() + acceptRisks := sets.New[string]() + if clusterVersion.Spec.DesiredUpdate != nil { + for _, r := range clusterVersion.Spec.DesiredUpdate.AcceptRisks { + acceptRisks.Insert(r.Name) + } + } + for _, risk := range clusterVersion.Status.ConditionalUpdateRisks { + if acceptRisks.Has(risk.Name) { + continue + } + if condition := meta.FindStatusCondition(risk.Conditions, internal.ConditionalUpdateRiskConditionTypeApplies); condition != nil && + condition.Status == metav1.ConditionTrue && + internal.IsAlertConditionReason(condition.Reason) { + unAcceptRisks.Insert(risk.Name) + } + } + var alertError *precondition.Error + if unAcceptRisks.Len() > 0 { + alertError = &precondition.Error{ + Reason: "alertMightImpactUpdate", + Message: 
fmt.Sprintf("Update from %s to %s is not recommended:\n\n%s", + clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion, + fmt.Sprintf("Those alerts have to be accepted as risks before updates: %s", strings.Join(sets.List(unAcceptRisks), ", "))), + Name: ru.Name(), + NonBlockingWarning: true, + } + } for _, recommended := range clusterVersion.Status.AvailableUpdates { if recommended.Version == releaseContext.DesiredVersion { - return nil + return aggregate(alertError, nil) } } @@ -56,45 +90,45 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio if condition.Type == internal.ConditionalUpdateConditionTypeRecommended { switch condition.Status { case metav1.ConditionTrue: - return nil + return aggregate(alertError, nil) case metav1.ConditionFalse: - return &precondition.Error{ + return aggregate(alertError, &precondition.Error{ Reason: condition.Reason, Message: fmt.Sprintf("Update from %s to %s is not recommended:\n\n%s", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion, condition.Message), Name: ru.Name(), NonBlockingWarning: true, - } + }) default: - return &precondition.Error{ + return aggregate(alertError, &precondition.Error{ Reason: condition.Reason, Message: fmt.Sprintf("Update from %s to %s is %s=%s: %s: %s", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion, condition.Type, condition.Status, condition.Reason, condition.Message), Name: ru.Name(), NonBlockingWarning: true, - } + }) } } } - return &precondition.Error{ + return aggregate(alertError, &precondition.Error{ Reason: "UnknownConditionType", Message: fmt.Sprintf("Update from %s to %s has a status.conditionalUpdates entry, but no Recommended condition.", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion), Name: ru.Name(), NonBlockingWarning: true, - } + }) } } if clusterVersion.Spec.Channel == "" { - return &precondition.Error{ + return aggregate(alertError, &precondition.Error{ Reason: "NoChannel", 
Message: fmt.Sprintf("Configured channel is unset, so the recommended status of updating from %s to %s is unknown.", clusterVersion.Status.Desired.Version, releaseContext.DesiredVersion), Name: ru.Name(), NonBlockingWarning: true, - } + }) } reason := "UnknownUpdate" @@ -111,14 +145,42 @@ func (ru *RecommendedUpdate) Run(ctx context.Context, releaseContext preconditio } if msg != "" { - return &precondition.Error{ + return aggregate(alertError, &precondition.Error{ Reason: reason, Message: msg, Name: ru.Name(), NonBlockingWarning: true, - } + }) + } + return aggregate(alertError, nil) +} + +func aggregate(e1, e2 *precondition.Error) error { + if e1 == nil && e2 == nil { + return nil + } + if e1 == nil { + return e2 + } + if e2 == nil { + return e1 + } + return &precondition.Error{ + Reason: aggregateString(e1.Reason, e2.Reason, "|"), + Message: aggregateString(e1.Message, e2.Message, "\n"), + Name: aggregateString(e1.Name, e2.Name, "|"), + NonBlockingWarning: e1.NonBlockingWarning && e2.NonBlockingWarning, + } +} + +func aggregateString(s1, s2, delimiter string) string { + if s1 == "" { + return s2 + } + if s2 == "" { + return s1 } - return nil + return s1 + delimiter + s2 } // Name returns the name of the precondition. 
diff --git a/pkg/payload/precondition/clusterversion/recommendedupdate_test.go b/pkg/payload/precondition/clusterversion/recommendedupdate_test.go index aa59f69cd..f13d49ff6 100644 --- a/pkg/payload/precondition/clusterversion/recommendedupdate_test.go +++ b/pkg/payload/precondition/clusterversion/recommendedupdate_test.go @@ -16,12 +16,14 @@ func TestRecommendedUpdate(t *testing.T) { targetVersion := "4.3.2" tests := []struct { - name string - channel string - availableUpdates []configv1.Release - conditionalUpdates []configv1.ConditionalUpdate - conditions []configv1.ClusterOperatorStatusCondition - expected string + name string + channel string + acceptRisks []configv1.AcceptRisk + availableUpdates []configv1.Release + conditionalUpdates []configv1.ConditionalUpdate + conditionalUpdateRisks []configv1.ConditionalUpdateRisk + conditions []configv1.ClusterOperatorStatusCondition + expected string }{ { name: "recommended", @@ -57,6 +59,73 @@ func TestRecommendedUpdate(t *testing.T) { }}, expected: "Update from 4.3.0 to 4.3.2 is not recommended:\n\nFor some reason, this update is not recommended.", }, + { + name: "Recommended=False and alert risk ABC is accepted", + acceptRisks: []configv1.AcceptRisk{{Name: "ABC"}}, + conditionalUpdateRisks: []configv1.ConditionalUpdateRisk{{ + Name: "ABC", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionTrue, + Reason: "Alert:firing", + Message: "message", + }}, + }}, + conditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{Version: targetVersion}, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FalseReason", + Message: "For some reason, this update is not recommended.", + }}, + }}, + expected: "Update from 4.3.0 to 4.3.2 is not recommended:\n\nFor some reason, this update is not recommended.", + }, + { + name: "Recommended=False and bug risk ABC is not accepted", + conditionalUpdateRisks: []configv1.ConditionalUpdateRisk{{ 
+ Name: "ABC", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionTrue, + Reason: "not-alert", + }}, + }}, + conditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{Version: targetVersion}, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FalseReason", + Message: "For some reason, this update is not recommended.", + }}, + }}, + expected: "Update from 4.3.0 to 4.3.2 is not recommended:\n\nFor some reason, this update is not recommended.", + }, + { + name: "Recommended=False and alert risk ABC is not accepted", + acceptRisks: []configv1.AcceptRisk{{Name: "BBB"}}, + conditionalUpdateRisks: []configv1.ConditionalUpdateRisk{{ + Name: "ABC", + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: metav1.ConditionTrue, + Reason: "Alert:firing", + Message: "message", + }}, + }}, + conditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{Version: targetVersion}, + Conditions: []metav1.Condition{{ + Type: "Recommended", + Status: metav1.ConditionFalse, + Reason: "FalseReason", + Message: "For some reason, this update is not recommended.", + }}, + }}, + expected: "Update from 4.3.0 to 4.3.2 is not recommended:\n\nThose alerts have to be accepted as risks before updates: ABC\nUpdate from 4.3.0 to 4.3.2 is not recommended:\n\nFor some reason, this update is not recommended.", + }, { name: "Recommended=Unknown", conditionalUpdates: []configv1.ConditionalUpdate{{ @@ -111,12 +180,13 @@ func TestRecommendedUpdate(t *testing.T) { t.Run(testCase.name, func(t *testing.T) { clusterVersion := &configv1.ClusterVersion{ ObjectMeta: metav1.ObjectMeta{Name: "version"}, - Spec: configv1.ClusterVersionSpec{Channel: testCase.channel}, + Spec: configv1.ClusterVersionSpec{Channel: testCase.channel, DesiredUpdate: &configv1.Update{AcceptRisks: testCase.acceptRisks}}, Status: configv1.ClusterVersionStatus{ - Desired: configv1.Release{Version: "4.3.0"}, - 
AvailableUpdates: testCase.availableUpdates, - ConditionalUpdates: testCase.conditionalUpdates, - Conditions: testCase.conditions, + Desired: configv1.Release{Version: "4.3.0"}, + AvailableUpdates: testCase.availableUpdates, + ConditionalUpdates: testCase.conditionalUpdates, + Conditions: testCase.conditions, + ConditionalUpdateRisks: testCase.conditionalUpdateRisks, }, } cvLister := fakeClusterVersionLister(t, clusterVersion) diff --git a/test/cvo/accept_risks.go b/test/cvo/accept_risks.go index b0a65f904..3760c5fb9 100644 --- a/test/cvo/accept_risks.go +++ b/test/cvo/accept_risks.go @@ -6,10 +6,13 @@ import ( g "github.com/onsi/ginkgo/v2" o "github.com/onsi/gomega" + monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" prometheusoperatorv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned/typed/monitoring/v1" + "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/rest" @@ -65,6 +68,61 @@ var _ = g.Describe(`[Jira:"Cluster Version Operator"] cluster-version-operator`, } }) + // Serial may not be essential here as it does not modify CV. + // However, there is no compliance job with TP enabled, and the + // alert we create in the test might be noise for other cases. 
+ g.It("should work with risks from alerts", g.Label("OTA-1813"), g.Label("Serial"), func() { + g.By("Create a critical alert for testing") + prometheusRule := &monitoringv1.PrometheusRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "testing", + Namespace: external.DefaultCVONamespace, + }, + Spec: monitoringv1.PrometheusRuleSpec{ + Groups: []monitoringv1.RuleGroup{ + { + Name: "test", + Rules: []monitoringv1.Rule{ + { + Alert: "TestAlert", + Annotations: map[string]string{"summary": "Test summary.", "description": "Test description."}, + Expr: intstr.IntOrString{ + Type: intstr.String, + StrVal: `up{job="cluster-version-operator"} == 1`, + }, + Labels: map[string]string{"severity": "critical", "namespace": "openshift-cluster-version", "openShiftUpdatePrecheck": "true"}, + }, + }, + }, + }, + }, + } + created, err := monitoringClient.PrometheusRules(external.DefaultCVONamespace).Create(ctx, prometheusRule, metav1.CreateOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + defer func() { + err := monitoringClient.PrometheusRules(external.DefaultCVONamespace).Delete(ctx, created.Name, metav1.DeleteOptions{}) + if !errors.IsNotFound(err) { + o.Expect(err).To(o.BeNil()) + } + }() + + g.By("Checking if the risk shows up in ClusterVersion's status") + o.Expect(wait.PollUntilContextTimeout(ctx, 30*time.Second, 5*time.Minute, true, func(ctx context.Context) (done bool, err error) { + cv, err := configClient.ClusterVersions().Get(ctx, external.DefaultClusterVersionName, metav1.GetOptions{}) + o.Expect(err).NotTo(o.HaveOccurred()) + for _, risk := range cv.Status.ConditionalUpdateRisks { + if risk.Name == "TestAlert" { + if c := meta.FindStatusCondition(risk.Conditions, external.ConditionalUpdateRiskConditionTypeApplies); c != nil { + if c.Status == metav1.ConditionTrue && external.IsAlertConditionReason(c.Reason) { + return true, nil + } + } + } + } + return false, nil + })).NotTo(o.HaveOccurred(), "no conditional update risk from alert found in ClusterVersion's status") + }) 
+ g.It("should work with accept risks", g.Label("Serial"), func() { cv, err := configClient.ClusterVersions().Get(ctx, external.DefaultClusterVersionName, metav1.GetOptions{}) o.Expect(err).NotTo(o.HaveOccurred())