-
Notifications
You must be signed in to change notification settings - Fork 75
OCPBUGS-42837: Do not set Degraded=True on transient errors #436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ba392a3
165ff0a
8e7f3f9
2265858
a538e81
3bf2b21
6dfb25e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ import ( | |
| "context" | ||
| "fmt" | ||
| "reflect" | ||
| "time" | ||
|
|
||
| corev1 "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/api/errors" | ||
|
|
@@ -29,40 +30,78 @@ const ( | |
| // Controller conditions for the Cluster Operator resource | ||
| cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" | ||
| cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" | ||
|
|
||
| // transientDegradedThreshold is how long transient errors must persist before | ||
| // the controller sets Degraded=True. This prevents brief | ||
| // API server blips during upgrades from immediately degrading the operator. | ||
| // Applies to both CloudConfigController and TrustedCAController. | ||
| transientDegradedThreshold = 2 * time.Minute | ||
| ) | ||
|
|
||
| type CloudConfigReconciler struct { | ||
| ClusterOperatorStatusClient | ||
| Scheme *runtime.Scheme | ||
| FeatureGateAccess featuregates.FeatureGateAccess | ||
| Scheme *runtime.Scheme | ||
| FeatureGateAccess featuregates.FeatureGateAccess | ||
| consecutiveFailureSince *time.Time // nil when the last reconcile succeeded | ||
| } | ||
|
|
||
| // permanentError marks an error as a configuration problem that will not resolve | ||
| // without operator or user intervention. The deferred dispatcher in Reconcile | ||
| // uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue) | ||
| // rather than handleTransientError (failure window, requeue). | ||
| type permanentError struct{ cause error } | ||
|
|
||
| func (e *permanentError) Error() string { return e.cause.Error() } | ||
| func (e *permanentError) Unwrap() error { return e.cause } | ||
|
|
||
| // permanent wraps err to signal that the condition is not retriable. | ||
| func permanent(err error) error { return &permanentError{cause: err} } | ||
|
|
||
| func isPermanent(err error) bool { | ||
| _, ok := err.(*permanentError) | ||
| return ok | ||
|
Comment on lines
+48
to
+62
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Controller runtime has a TerminalError. Is this what you want here? It allows you to skip the "requeue on error" semantic that you would normally get in CR controllers. You could use that in a similar way without creating a separate error type |
||
| } | ||
|
|
||
| func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { | ||
| func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { | ||
| klog.V(1).Infof("Syncing cloud-conf ConfigMap") | ||
|
|
||
| // Deferred dispatcher: classifies the returned error and calls the right handler. | ||
| // Permanent errors (wrapped with permanent()) degrade immediately without requeue. | ||
| // Transient errors enter the failure window and only degrade after the threshold. | ||
| // All nil-error paths clear the failure window. | ||
| defer func() { | ||
| if retErr == nil { | ||
| r.clearFailureWindow() | ||
| return | ||
| } | ||
| if isPermanent(retErr) { | ||
| result, retErr = r.handleDegradeError(ctx, retErr) | ||
| } else { | ||
| result, retErr = r.handleTransientError(ctx, retErr) | ||
| } | ||
| }() | ||
|
|
||
| infra := &configv1.Infrastructure{} | ||
| if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { | ||
| klog.Errorf("infrastructure resource not found") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { | ||
| // No cloud platform: mirror the main controller's behaviour of returning Available. | ||
| klog.Infof("Infrastructure cluster does not exist. Skipping...") | ||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, nil | ||
| } else if err != nil { | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| network := &configv1.Network{} | ||
| if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // nil platformStatus is a permanent misconfiguration. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
| if !syncNeeded { | ||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
|
|
@@ -74,11 +113,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) | ||
| if err != nil { | ||
| // Unsupported platform won't change without a cluster reconfigure. | ||
| klog.Errorf("unable to get cloud config transformer function; unsupported platform") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
|
|
||
| sourceCM := &corev1.ConfigMap{} | ||
|
|
@@ -101,12 +138,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| managedConfigFound = true | ||
| } else if errors.IsNotFound(err) { | ||
| klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") | ||
| } else if err != nil { | ||
| klog.Errorf("unable to get managed cloud-config for sync") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| } else { | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -119,38 +152,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { | ||
| klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") | ||
| } else if err != nil { | ||
| klog.Errorf("unable to get cloud-config for sync: %v", err) | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
| } | ||
|
|
||
| sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
|
|
||
| // Check if FeatureGateAccess is configured | ||
| if r.FeatureGateAccess == nil { | ||
| klog.Errorf("FeatureGateAccess is not configured") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") | ||
| // Operator misconfiguration at startup: permanent. | ||
| return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured")) | ||
| } | ||
|
|
||
| features, err := r.FeatureGateAccess.CurrentFeatureGates() | ||
| if err != nil { | ||
| // The feature-gate informer may not have synced yet: transient. | ||
| klog.Errorf("unable to get feature gates: %v", err) | ||
| if errD := r.setDegradedCondition(ctx); errD != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| if cloudConfigTransformerFn != nil { | ||
|
|
@@ -159,10 +180,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| // we're not expecting users to put their data in the former. | ||
| output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // Platform-specific transform failed on the current config data: permanent. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
| sourceCM.Data[defaultConfigKey] = output | ||
| } | ||
|
|
@@ -175,11 +194,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| // If the config does not exist, it will be created later, so we can ignore a Not Found error | ||
| if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { | ||
| klog.Errorf("unable to get target cloud-config for sync") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| // Note that the source config map is actually a *transformed* source config map | ||
|
|
@@ -193,10 +208,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { | ||
| klog.Errorf("unable to sync cloud config") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
|
|
@@ -206,6 +218,47 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| return ctrl.Result{}, nil | ||
| } | ||
|
|
||
| // clearFailureWindow resets the transient-error tracking. Called by the deferred | ||
| // dispatcher in Reconcile on every successful (nil-error) return path. | ||
| func (r *CloudConfigReconciler) clearFailureWindow() { | ||
| r.consecutiveFailureSince = nil | ||
| } | ||
|
|
||
| // handleTransientError records the start of a failure window and degrades the | ||
| // controller only after transientDegradedThreshold has elapsed. It always | ||
| // returns a non-nil error so controller-runtime requeues with exponential backoff. | ||
| // Called only from the deferred dispatcher in Reconcile. | ||
| func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { | ||
| now := r.Clock.Now() | ||
| if r.consecutiveFailureSince == nil { | ||
| r.consecutiveFailureSince = &now | ||
|
Comment on lines
+233
to
+234
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Multiple concurrent reconciles would race on this; should this be using some sort of atomic/lock semantic? |
||
| klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) | ||
| return ctrl.Result{}, err | ||
| } | ||
| elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we always reconcile the same object? Do we care about errors for different objects having different limits here? |
||
| if elapsed < transientDegradedThreshold { | ||
| klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) | ||
| return ctrl.Result{}, err | ||
| } | ||
| klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) | ||
| if setErr := r.setDegradedCondition(ctx); setErr != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) | ||
| } | ||
| return ctrl.Result{}, err | ||
| } | ||
|
|
||
| // handleDegradeError sets CloudConfigControllerDegraded=True immediately and | ||
| // returns nil so controller-runtime does NOT requeue. An existing watch on the | ||
| // relevant resource will re-trigger reconciliation when the problem is fixed. | ||
| // Called only from the deferred dispatcher in Reconcile. | ||
| func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { | ||
| klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) | ||
| if setErr := r.setDegradedCondition(ctx); setErr != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) | ||
| } | ||
| return ctrl.Result{}, nil | ||
| } | ||
|
|
||
| func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { | ||
| if platformStatus == nil { | ||
| return false, fmt.Errorf("platformStatus is required") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { | |
| }, timeout).Should(Succeed()) | ||
| initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion | ||
|
|
||
| // Introducing the consecutiveFailureWindow means that there's a field that could be racy | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't love this; definitely open to alternatives.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using some sort of atomic timestamp or lock?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was hoping to not have to do locks, but it's not the worst thing in the world here. |
||
| // between the manager calling Reconcile and the test calling Reconcile. | ||
| // In production, we only have 1 instance of the reconciler running. | ||
| // Create a fresh reconciler that is NOT registered with the manager. | ||
| // It shares the same API client (thread-safe) but has its own | ||
| // consecutiveFailureSince field, so no data race with the manager's copy. | ||
| freshReconciler := &CloudConfigReconciler{ | ||
| ClusterOperatorStatusClient: ClusterOperatorStatusClient{ | ||
| Client: cl, | ||
| Clock: clocktesting.NewFakePassiveClock(time.Now()), | ||
| ManagedNamespace: targetNamespaceName, | ||
| }, | ||
| Scheme: scheme.Scheme, | ||
| FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( | ||
| nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, | ||
| ), | ||
| } | ||
|
|
||
| request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} | ||
| _, err := reconciler.Reconcile(ctx, request) | ||
| _, err := freshReconciler.Reconcile(ctx, request) | ||
| Expect(err).Should(Succeed()) | ||
|
|
||
| Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) | ||
|
|
@@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) | ||
| }) | ||
|
|
||
| It("should error if a user-specified configmap key isn't present", func() { | ||
| It("should degrade immediately if a user-specified configmap key isn't present", func() { | ||
| infraResource := makeInfrastructureResource(configv1.AWSPlatformType) | ||
| infraResource.Spec.CloudConfig.Key = "notfound" | ||
| Expect(cl.Create(ctx, infraResource)).To(Succeed()) | ||
|
|
@@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) | ||
|
|
||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) | ||
|
|
||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var degradedCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { | ||
| degradedCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(degradedCond).NotTo(BeNil()) | ||
| Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
|
|
||
| It("should continue with reconcile when feature gates are available", func() { | ||
|
|
@@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| }) | ||
| }) | ||
|
|
||
| It("reconcile should fail if no infra resource found", func() { | ||
| It("reconcile should succeed and be available if no infra resource found", func() { | ||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) | ||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var availCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { | ||
| availCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(availCond).NotTo(BeNil()) | ||
| Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
|
|
||
| It("should fail if no PlatformStatus in infra resource presented ", func() { | ||
| It("should degrade immediately if no PlatformStatus in infra resource", func() { | ||
| infraResource := makeInfrastructureResource(configv1.AWSPlatformType) | ||
| Expect(cl.Create(ctx, infraResource)).To(Succeed()) | ||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) | ||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var degradedCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { | ||
| degradedCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(degradedCond).NotTo(BeNil()) | ||
| Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
| }) | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: This won't work if permanentError is not the top-level error. errors.As() should be used instead.