From 1632b29d2d7db2622f4ee71baec3ea300e2dda77 Mon Sep 17 00:00:00 2001 From: Forrest Babcock <36795216+neisw@users.noreply.github.com> Date: Sun, 8 Mar 2026 15:09:12 -0400 Subject: [PATCH] Revert "OTA-1546: Add a metric and an alert for conditional risk conditions" --- ...2_prometheusrule-TechPreviewNoUpgrade.yaml | 26 ---- pkg/cvo/metrics.go | 30 ----- pkg/cvo/metrics_test.go | 113 ------------------ 3 files changed, 169 deletions(-) delete mode 100644 install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml diff --git a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml b/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml deleted file mode 100644 index f4736ca85..000000000 --- a/install/0000_90_cluster-version-operator_02_prometheusrule-TechPreviewNoUpgrade.yaml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - k8s-app: cluster-version-operator - name: cluster-version-operator-accept-risks - namespace: openshift-cluster-version - annotations: - kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks. - exclude.release.openshift.io/internal-openshift-hosted: "true" - include.release.openshift.io/self-managed-high-availability: "true" - release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks" -spec: - groups: - - name: cluster-version-tech-preview - rules: - - alert: OpenShiftUpdateRiskMightApply - annotations: - summary: The cluster might have been exposed to the conditional update risk for 10 minutes. - description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'. - runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-version-operator/OpenShiftUpdateRiskMightApply.md - expr: | - max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0) - for: 10m - labels: - severity: warning diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index 30d04302f..21d2718df 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -15,7 +15,6 @@ import ( corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apiserver/pkg/server/dynamiccertificates" @@ -55,7 +54,6 @@ type operatorMetrics struct { capability *prometheus.GaugeVec clusterOperatorUp *prometheus.GaugeVec clusterOperatorConditions *prometheus.GaugeVec - clusterVersionRiskConditions *prometheus.GaugeVec clusterOperatorConditionTransitions *prometheus.GaugeVec clusterInstaller *prometheus.GaugeVec clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec @@ -106,10 +104,6 @@ penultimate completed version for 'completed'. Name: "cluster_operator_conditions", Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.", }, []string{"name", "condition", "reason"}), - clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cluster_version_risk_conditions", - Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.", - }, []string{"condition", "risk", "reason"}), clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "cluster_operator_condition_transitions", Help: "Reports the number of times that a condition on a cluster operator changes status", @@ -495,7 +489,6 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) { ch <- m.capability.WithLabelValues("").Desc() ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc() - ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc() ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc() ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc() ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc() @@ -517,26 +510,6 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric, } } -func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) { - for _, risk := range risks { - for _, condition := range risk.Conditions { - if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies { - continue - } - - g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason) - switch condition.Status { - case metav1.ConditionTrue: - g.Set(1) - case metav1.ConditionUnknown: - g.Set(-1) - } - // We do not need to do g.Set(0) as it is done when g is initialized - ch <- g - } - } -} - // Collect collects metrics from the operator into the channel ch func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { current := m.optr.currentVersion() @@ -682,9 +655,6 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) { } m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates) - if m.optr.shouldReconcileAcceptRisks() { - m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks) - } } g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version) diff --git a/pkg/cvo/metrics_test.go b/pkg/cvo/metrics_test.go index f12852be3..63b9c5f98 100644 --- a/pkg/cvo/metrics_test.go +++ b/pkg/cvo/metrics_test.go @@ -25,7 +25,6 @@ import ( configv1 "github.com/openshift/api/config/v1" "github.com/openshift/library-go/pkg/crypto" - "github.com/openshift/cluster-version-operator/pkg/featuregates" "github.com/openshift/cluster-version-operator/pkg/internal" ) @@ -670,7 +669,6 @@ func Test_operatorMetrics_Collect(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version") tt.optr.eventRecorder = record.NewFakeRecorder(100) if tt.optr.cvLister == nil { tt.optr.cvLister = &cvLister{} @@ -977,117 +975,6 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) { } } -func Test_collectConditionalUpdateRisks(t *testing.T) { - type valueWithLabels struct { - value float64 - labels map[string]string - } - testCases := []struct { - name string - risks []configv1.ConditionalUpdateRisk - expected []valueWithLabels - }{ - { - name: "no conditional updates", - expected: []valueWithLabels{}, - }, - { - name: "unknown type", - risks: []configv1.ConditionalUpdateRisk{ - { - Name: "RiskX", - Conditions: []metav1.Condition{{ - Type: internal.ConditionalUpdateConditionTypeRecommended, - Status: metav1.ConditionFalse, - Reason: "ReasonA", - Message: "Risk does not apply", - }}, - }, - }, - }, - { - name: "apply false", - risks: []configv1.ConditionalUpdateRisk{ - { - Name: "RiskX", - Conditions: []metav1.Condition{{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionFalse, - Reason: "ReasonA", - Message: "Risk does not apply", - }}, - }, - }, - expected: []valueWithLabels{{ - labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, - }}, - }, - { - name: "apply true", - risks: []configv1.ConditionalUpdateRisk{ - { - Name: "RiskX", - Conditions: []metav1.Condition{{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: "ReasonA", - Message: "Risk does not apply", - }}, - }, - }, - expected: []valueWithLabels{{ - value: 1, - labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, - }}, - }, - { - name: "apply unknown", - risks: []configv1.ConditionalUpdateRisk{ - { - Name: "RiskX", - Conditions: []metav1.Condition{{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionUnknown, - Reason: "ReasonA", - Message: "Risk does not apply", - }}, - }, - }, - expected: []valueWithLabels{{ - value: -1, - labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"}, - }}, - }, - } - - for _, tc := range testCases { - tc := tc - t.Run(tc.name, func(t *testing.T) { - optr := &Operator{} - m := newOperatorMetrics(optr) - ch := make(chan prometheus.Metric) - - go func() { - m.collectConditionalUpdateRisks(ch, tc.risks) - close(ch) - }() - - var collected []prometheus.Metric - for item := range ch { - collected = append(collected, item) - } - - if lenC, lenE := len(collected), len(tc.expected); lenC != lenE { - - t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected)) - } - for i := range tc.expected { - expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels) - } - }) - } -} - func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) { t.Helper() var d dto.Metric