Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 110 additions & 57 deletions pkg/controllers/cloud_config_sync_controller.go
Original file line number Diff line number Diff line change
import (
	"context"
	goerrors "errors"
	"fmt"
	"reflect"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
Expand All @@ -29,40 +30,78 @@ const (
// Controller conditions for the Cluster Operator resource
cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable"
cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded"

// transientDegradedThreshold is how long transient errors must persist before
// the controller sets Degraded=True. This prevents brief
// API server blips during upgrades from immediately degrading the operator.
// Applies to both CloudConfigController and TrustedCAController.
transientDegradedThreshold = 2 * time.Minute
)

// CloudConfigReconciler syncs the cloud provider configuration ConfigMap into
// the managed namespace and reports Available/Degraded conditions on the
// cluster operator.
type CloudConfigReconciler struct {
	ClusterOperatorStatusClient
	Scheme            *runtime.Scheme
	FeatureGateAccess featuregates.FeatureGateAccess
	// consecutiveFailureSince marks the start of the current run of transient
	// reconcile failures; nil when the last reconcile succeeded.
	// NOTE(review): accessed without synchronization — safe only while the
	// controller runs a single concurrent reconcile worker (the
	// controller-runtime default); confirm MaxConcurrentReconciles == 1.
	consecutiveFailureSince *time.Time
}

// permanentError marks an error as a configuration problem that will not resolve
// without operator or user intervention. The deferred dispatcher in Reconcile
// uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue)
// rather than handleTransientError (failure window, requeue).
type permanentError struct{ cause error }

func (e *permanentError) Error() string { return e.cause.Error() }
func (e *permanentError) Unwrap() error { return e.cause }

// permanent wraps err to signal that the condition is not retriable.
func permanent(err error) error { return &permanentError{cause: err} }

func isPermanent(err error) bool {
_, ok := err.(*permanentError)
Copy link
Copy Markdown
Contributor

@RadekManak RadekManak Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: This won't work if permanentError is not on the top level error. errors.As() should be used instead.

return ok
Comment on lines +48 to +62
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Controller runtime has a TerminalError. Is this what you want here? It allows you to skip the "requeue on error" semantic that you would normally get in CR controllers

You could use that in a similar way without creating a separate error type

}

func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) {
klog.V(1).Infof("Syncing cloud-conf ConfigMap")

// Deferred dispatcher: classifies the returned error and calls the right handler.
// Permanent errors (wrapped with permanent()) degrade immediately without requeue.
// Transient errors enter the failure window and only degrade after the threshold.
// All nil-error paths clear the failure window.
defer func() {
if retErr == nil {
r.clearFailureWindow()
return
}
if isPermanent(retErr) {
result, retErr = r.handleDegradeError(ctx, retErr)
} else {
result, retErr = r.handleTransientError(ctx, retErr)
}
}()

infra := &configv1.Infrastructure{}
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil {
klog.Errorf("infrastructure resource not found")
if err := r.setDegradedCondition(ctx); err != nil {
if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) {
// No cloud platform: mirror the main controller's behaviour of returning Available.
klog.Infof("Infrastructure cluster does not exist. Skipping...")
if err := r.setAvailableCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, nil
} else if err != nil {
return ctrl.Result{}, err // transient
}

network := &configv1.Network{}
if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}

syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// nil platformStatus is a permanent misconfiguration.
return ctrl.Result{}, permanent(err)
}
if !syncNeeded {
if err := r.setAvailableCondition(ctx); err != nil {
Expand All @@ -74,11 +113,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus)
if err != nil {
// Unsupported platform won't change without a cluster reconfigure.
klog.Errorf("unable to get cloud config transformer function; unsupported platform")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, permanent(err)
}

sourceCM := &corev1.ConfigMap{}
Expand All @@ -101,12 +138,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
managedConfigFound = true
} else if errors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to infrastructure config")
} else if err != nil {
klog.Errorf("unable to get managed cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
} else {
return ctrl.Result{}, err // transient
}
}

Expand All @@ -119,38 +152,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) {
klog.Warningf("managed cloud-config is not found, falling back to default cloud config.")
} else if err != nil {
klog.Errorf("unable to get cloud-config for sync: %v", err)
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}
}

sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes.
return ctrl.Result{}, permanent(err)
}

// Check if FeatureGateAccess is configured
if r.FeatureGateAccess == nil {
klog.Errorf("FeatureGateAccess is not configured")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured")
// Operator misconfiguration at startup: permanent.
return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured"))
}

features, err := r.FeatureGateAccess.CurrentFeatureGates()
if err != nil {
// The feature-gate informer may not have synced yet: transient.
klog.Errorf("unable to get feature gates: %v", err)
if errD := r.setDegradedCondition(ctx); errD != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}

if cloudConfigTransformerFn != nil {
Expand All @@ -159,10 +180,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
// we're not expecting users to put their data in the former.
output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features)
if err != nil {
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
// Platform-specific transform failed on the current config data: permanent.
return ctrl.Result{}, permanent(err)
}
sourceCM.Data[defaultConfigKey] = output
}
Expand All @@ -175,11 +194,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// If the config does not exist, it will be created later, so we can ignore a Not Found error
if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) {
klog.Errorf("unable to get target cloud-config for sync")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}

// Note that the source config map is actually a *transformed* source config map
Expand All @@ -193,10 +208,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)

if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil {
klog.Errorf("unable to sync cloud config")
if err := r.setDegradedCondition(ctx); err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err)
}
return ctrl.Result{}, err
return ctrl.Result{}, err // transient
}

if err := r.setAvailableCondition(ctx); err != nil {
Expand All @@ -206,6 +218,47 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request)
return ctrl.Result{}, nil
}

// clearFailureWindow resets the transient-error tracking. Called by the deferred
// dispatcher in Reconcile on every successful (nil-error) return path, so the
// next transient failure opens a fresh window instead of inheriting an old one.
func (r *CloudConfigReconciler) clearFailureWindow() {
	// nil means "no ongoing failure run"; handleTransientError re-seeds it.
	r.consecutiveFailureSince = nil
}

// handleTransientError records the start of a failure window and degrades the
// controller only after transientDegradedThreshold has elapsed. It always
// returns a non-nil error so controller-runtime requeues with exponential backoff.
// Called only from the deferred dispatcher in Reconcile.
func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) {
now := r.Clock.Now()
if r.consecutiveFailureSince == nil {
r.consecutiveFailureSince = &now
Comment on lines +233 to +234
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiple concurrent reconciles would race on this, should this be using some sort of atomic/lock semantic?

klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold)
return ctrl.Result{}, err
}
elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we always reconcile the same object? Do we care about errors for different objects as different limits here?

if elapsed < transientDegradedThreshold {
klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err)
return ctrl.Result{}, err
}
klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err)
if setErr := r.setDegradedCondition(ctx); setErr != nil {
return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr)
}
return ctrl.Result{}, err
}

// handleDegradeError sets CloudConfigControllerDegraded=True immediately and
// returns nil so controller-runtime does NOT requeue. An existing watch on the
// relevant resource will re-trigger reconciliation when the problem is fixed.
// Called only from the deferred dispatcher in Reconcile.
func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) {
	klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err)
	setErr := r.setDegradedCondition(ctx)
	if setErr == nil {
		// Swallow the original error: returning nil suppresses the automatic
		// requeue, and a watch event brings us back once the config changes.
		return ctrl.Result{}, nil
	}
	return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr)
}

func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) {
if platformStatus == nil {
return false, fmt.Errorf("platformStatus is required")
Expand Down
69 changes: 61 additions & 8 deletions pkg/controllers/cloud_config_sync_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() {
}, timeout).Should(Succeed())
initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion

// Introducing the consecutiveFailureWindow means that there's a field that could be racy
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't love this; definitely open to alternatives.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using some sort of atomic timestamp or lock?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was hoping to not have to do locks, but it's not the worse thing in the world here.

// between the manager calling Reconcile and the test calling Reconcile.
// In production, we only have 1 instance of the reconciler running.
// Create a fresh reconciler that is NOT registered with the manager.
// It shares the same API client (thread-safe) but has its own
// consecutiveFailureSince field, so no data race with the manager's copy.
freshReconciler := &CloudConfigReconciler{
ClusterOperatorStatusClient: ClusterOperatorStatusClient{
Client: cl,
Clock: clocktesting.NewFakePassiveClock(time.Now()),
ManagedNamespace: targetNamespaceName,
},
Scheme: scheme.Scheme,
FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting(
nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil,
),
}

request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}}
_, err := reconciler.Reconcile(ctx, request)
_, err := freshReconciler.Reconcile(ctx, request)
Expect(err).Should(Succeed())

Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed())
Expand Down Expand Up @@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(len(allCMs.Items)).To(BeEquivalentTo(1))
})

It("should error if a user-specified configmap key isn't present", func() {
It("should degrade immediately if a user-specified configmap key isn't present", func() {
infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
infraResource.Spec.CloudConfig.Key = "notfound"
Expect(cl.Create(ctx, infraResource)).To(Succeed())
Expand All @@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() {
Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed())

_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
// The permanent error is consumed by the dispatcher (degrade, no requeue),
// so Reconcile itself returns nil.
Expect(err).To(Succeed())

// Verify the Degraded condition was set on the cluster operator.
co := &configv1.ClusterOperator{}
Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
var degradedCond *configv1.ClusterOperatorStatusCondition
for i := range co.Status.Conditions {
	if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
		degradedCond = &co.Status.Conditions[i]
		break
	}
}
Expect(degradedCond).NotTo(BeNil())
Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should continue with reconcile when feature gates are available", func() {
Expand Down Expand Up @@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() {
})
})

It("reconcile should succeed and be available if no infra resource found", func() {
	_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
	// A missing Infrastructure object is treated as "no cloud platform":
	// the controller reports Available rather than erroring.
	Expect(err).To(Succeed())

	co := &configv1.ClusterOperator{}
	Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
	var availCond *configv1.ClusterOperatorStatusCondition
	for i := range co.Status.Conditions {
		if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition {
			availCond = &co.Status.Conditions[i]
			break
		}
	}
	Expect(availCond).NotTo(BeNil())
	Expect(availCond.Status).To(Equal(configv1.ConditionTrue))
})

It("should degrade immediately if no PlatformStatus in infra resource", func() {
	infraResource := makeInfrastructureResource(configv1.AWSPlatformType)
	Expect(cl.Create(ctx, infraResource)).To(Succeed())
	_, err := reconciler.Reconcile(context.TODO(), ctrl.Request{})
	// Missing PlatformStatus is a permanent misconfiguration: the dispatcher
	// degrades the operator and returns nil (no requeue).
	Expect(err).To(Succeed())

	co := &configv1.ClusterOperator{}
	Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed())
	var degradedCond *configv1.ClusterOperatorStatusCondition
	for i := range co.Status.Conditions {
		if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition {
			degradedCond = &co.Status.Conditions[i]
			break
		}
	}
	Expect(degradedCond).NotTo(BeNil())
	Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue))
})
})
Loading