-
Notifications
You must be signed in to change notification settings - Fork 75
OCPBUGS-42837: Do not set Degraded=True on transient errors #436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ba392a3
165ff0a
8e7f3f9
2265858
a538e81
3bf2b21
6dfb25e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ import ( | |
| "context" | ||
| "fmt" | ||
| "reflect" | ||
| "time" | ||
|
|
||
| corev1 "k8s.io/api/core/v1" | ||
| "k8s.io/apimachinery/pkg/api/errors" | ||
|
|
@@ -29,40 +30,78 @@ const ( | |
| // Controller conditions for the Cluster Operator resource | ||
| cloudConfigControllerAvailableCondition = "CloudConfigControllerAvailable" | ||
| cloudConfigControllerDegradedCondition = "CloudConfigControllerDegraded" | ||
|
|
||
| // transientDegradedThreshold is how long transient errors must persist before | ||
| // the controller sets Degraded=True. This prevents brief | ||
| // API server blips during upgrades from immediately degrading the operator. | ||
| // Applies to both CloudConfigController and TrustedCAController. | ||
| transientDegradedThreshold = 2 * time.Minute | ||
| ) | ||
|
|
||
| type CloudConfigReconciler struct { | ||
| ClusterOperatorStatusClient | ||
| Scheme *runtime.Scheme | ||
| FeatureGateAccess featuregates.FeatureGateAccess | ||
| Scheme *runtime.Scheme | ||
| FeatureGateAccess featuregates.FeatureGateAccess | ||
| consecutiveFailureSince *time.Time // nil when the last reconcile succeeded | ||
| } | ||
|
|
||
| // permanentError marks an error as a configuration problem that will not resolve | ||
| // without operator or user intervention. The deferred dispatcher in Reconcile | ||
| // uses isPermanent to route these to handleDegradeError (immediate degrade, no requeue) | ||
| // rather than handleTransientError (failure window, requeue). | ||
| type permanentError struct{ cause error } | ||
|
|
||
| func (e *permanentError) Error() string { return e.cause.Error() } | ||
| func (e *permanentError) Unwrap() error { return e.cause } | ||
|
|
||
| // permanent wraps err to signal that the condition is not retriable. | ||
| func permanent(err error) error { return &permanentError{cause: err} } | ||
|
|
||
| func isPermanent(err error) bool { | ||
| _, ok := err.(*permanentError) | ||
| return ok | ||
|
Comment on lines
+48
to
+62
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Controller runtime has a TerminalError. Is this what you want here? It allows you to skip the "requeue on error" semantic that you would normally get in CR controllers. You could use that in a similar way without creating a separate error type |
||
| } | ||
|
|
||
| func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { | ||
| func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { | ||
| klog.V(1).Infof("Syncing cloud-conf ConfigMap") | ||
|
|
||
| // Deferred dispatcher: classifies the returned error and calls the right handler. | ||
| // Permanent errors (wrapped with permanent()) degrade immediately without requeue. | ||
| // Transient errors enter the failure window and only degrade after the threshold. | ||
| // All nil-error paths clear the failure window. | ||
| defer func() { | ||
| if retErr == nil { | ||
| r.clearFailureWindow() | ||
| return | ||
| } | ||
| if isPermanent(retErr) { | ||
| result, retErr = r.handleDegradeError(ctx, retErr) | ||
| } else { | ||
| result, retErr = r.handleTransientError(ctx, retErr) | ||
| } | ||
| }() | ||
|
|
||
| infra := &configv1.Infrastructure{} | ||
| if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); err != nil { | ||
| klog.Errorf("infrastructure resource not found") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| if err := r.Get(ctx, client.ObjectKey{Name: infrastructureResourceName}, infra); errors.IsNotFound(err) { | ||
| // No cloud platform: mirror the main controller's behaviour of returning Available. | ||
| klog.Infof("Infrastructure cluster does not exist. Skipping...") | ||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, nil | ||
| } else if err != nil { | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| network := &configv1.Network{} | ||
| if err := r.Get(ctx, client.ObjectKey{Name: "cluster"}, network); err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller when getting cluster Network object: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| syncNeeded, err := r.isCloudConfigSyncNeeded(infra.Status.PlatformStatus, infra.Spec.CloudConfig) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // nil platformStatus is a permanent misconfiguration. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
| if !syncNeeded { | ||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
|
|
@@ -74,11 +113,9 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| cloudConfigTransformerFn, needsManagedConfigLookup, err := cloud.GetCloudConfigTransformer(infra.Status.PlatformStatus) | ||
| if err != nil { | ||
| // Unsupported platform won't change without a cluster reconfigure. | ||
| klog.Errorf("unable to get cloud config transformer function; unsupported platform") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
|
|
||
| sourceCM := &corev1.ConfigMap{} | ||
|
|
@@ -101,12 +138,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| managedConfigFound = true | ||
| } else if errors.IsNotFound(err) { | ||
| klog.Warningf("managed cloud-config is not found, falling back to infrastructure config") | ||
| } else if err != nil { | ||
| klog.Errorf("unable to get managed cloud-config for sync") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| } else { | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -119,38 +152,26 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| if err := r.Get(ctx, openshiftUnmanagedCMKey, sourceCM); errors.IsNotFound(err) { | ||
| klog.Warningf("managed cloud-config is not found, falling back to default cloud config.") | ||
| } else if err != nil { | ||
| klog.Errorf("unable to get cloud-config for sync: %v", err) | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
| } | ||
|
|
||
| sourceCM, err = r.prepareSourceConfigMap(sourceCM, infra) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // User-supplied key mismatch: permanent until the ConfigMap or Infrastructure changes. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
|
|
||
| // Check if FeatureGateAccess is configured | ||
| if r.FeatureGateAccess == nil { | ||
| klog.Errorf("FeatureGateAccess is not configured") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, fmt.Errorf("FeatureGateAccess is not configured") | ||
| // Operator misconfiguration at startup: permanent. | ||
| return ctrl.Result{}, permanent(fmt.Errorf("FeatureGateAccess is not configured")) | ||
| } | ||
|
|
||
| features, err := r.FeatureGateAccess.CurrentFeatureGates() | ||
| if err != nil { | ||
| // The feature-gate informer may not have synced yet: transient. | ||
| klog.Errorf("unable to get feature gates: %v", err) | ||
| if errD := r.setDegradedCondition(ctx); errD != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", errD) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| if cloudConfigTransformerFn != nil { | ||
|
|
@@ -159,10 +180,8 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| // we're not expecting users to put their data in the former. | ||
| output, err := cloudConfigTransformerFn(sourceCM.Data[defaultConfigKey], infra, network, features) | ||
| if err != nil { | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| // Platform-specific transform failed on the current config data: permanent. | ||
| return ctrl.Result{}, permanent(err) | ||
| } | ||
| sourceCM.Data[defaultConfigKey] = output | ||
| } | ||
|
|
@@ -175,11 +194,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| // If the config does not exist, it will be created later, so we can ignore a Not Found error | ||
| if err := r.Get(ctx, targetConfigMapKey, targetCM); err != nil && !errors.IsNotFound(err) { | ||
| klog.Errorf("unable to get target cloud-config for sync") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| // Note that the source config map is actually a *transformed* source config map | ||
|
|
@@ -193,10 +208,7 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
|
|
||
| if err := r.syncCloudConfigData(ctx, sourceCM, targetCM); err != nil { | ||
| klog.Errorf("unable to sync cloud config") | ||
| if err := r.setDegradedCondition(ctx); err != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set conditions for cloud config controller: %v", err) | ||
| } | ||
| return ctrl.Result{}, err | ||
| return ctrl.Result{}, err // transient | ||
| } | ||
|
|
||
| if err := r.setAvailableCondition(ctx); err != nil { | ||
|
|
@@ -206,6 +218,47 @@ func (r *CloudConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) | |
| return ctrl.Result{}, nil | ||
| } | ||
|
|
||
| // clearFailureWindow resets the transient-error tracking. Called by the deferred | ||
| // dispatcher in Reconcile on every successful (nil-error) return path. | ||
| func (r *CloudConfigReconciler) clearFailureWindow() { | ||
| r.consecutiveFailureSince = nil | ||
| } | ||
|
|
||
| // handleTransientError records the start of a failure window and degrades the | ||
| // controller only after transientDegradedThreshold has elapsed. It always | ||
| // returns a non-nil error so controller-runtime requeues with exponential backoff. | ||
| // Called only from the deferred dispatcher in Reconcile. | ||
| func (r *CloudConfigReconciler) handleTransientError(ctx context.Context, err error) (ctrl.Result, error) { | ||
| now := r.Clock.Now() | ||
| if r.consecutiveFailureSince == nil { | ||
| r.consecutiveFailureSince = &now | ||
|
Comment on lines
+233
to
+234
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Multiple concurrent reconciles would race on this; should this be using some sort of atomic/lock semantic? |
||
| klog.V(4).Infof("CloudConfigReconciler: transient failure started (%v), will degrade after %s", err, transientDegradedThreshold) | ||
| return ctrl.Result{}, err | ||
| } | ||
| elapsed := r.Clock.Now().Sub(*r.consecutiveFailureSince) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we always reconcile the same object? Do we care about errors for different objects having different limits here? |
||
| if elapsed < transientDegradedThreshold { | ||
| klog.V(4).Infof("CloudConfigReconciler: transient failure ongoing for %s (threshold %s): %v", elapsed, transientDegradedThreshold, err) | ||
| return ctrl.Result{}, err | ||
| } | ||
| klog.Warningf("CloudConfigReconciler: transient failure exceeded threshold (%s), setting degraded: %v", elapsed, err) | ||
| if setErr := r.setDegradedCondition(ctx); setErr != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) | ||
| } | ||
| return ctrl.Result{}, err | ||
| } | ||
|
|
||
| // handleDegradeError sets CloudConfigControllerDegraded=True immediately and | ||
| // returns nil so controller-runtime does NOT requeue. An existing watch on the | ||
| // relevant resource will re-trigger reconciliation when the problem is fixed. | ||
| // Called only from the deferred dispatcher in Reconcile. | ||
| func (r *CloudConfigReconciler) handleDegradeError(ctx context.Context, err error) (ctrl.Result, error) { | ||
| klog.Errorf("CloudConfigReconciler: permanent error, setting degraded: %v", err) | ||
| if setErr := r.setDegradedCondition(ctx); setErr != nil { | ||
| return ctrl.Result{}, fmt.Errorf("failed to set degraded condition: %v", setErr) | ||
| } | ||
| return ctrl.Result{}, nil | ||
| } | ||
|
|
||
| func (r *CloudConfigReconciler) isCloudConfigSyncNeeded(platformStatus *configv1.PlatformStatus, infraCloudConfigRef configv1.ConfigMapFileReference) (bool, error) { | ||
| if platformStatus == nil { | ||
| return false, fmt.Errorf("platformStatus is required") | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -348,8 +348,26 @@ var _ = Describe("Cloud config sync controller", func() { | |
| }, timeout).Should(Succeed()) | ||
| initialCMresourceVersion := syncedCloudConfigMap.ResourceVersion | ||
|
|
||
| // Introducing the consecutiveFailureWindow means that there's a field that could be racy | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't love this; definitely open to alternatives.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using some sort of atomic timestamp or lock?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was hoping to not have to do locks, but it's not the worst thing in the world here. |
||
| // between the manager calling Reconcile and the test calling Reconcile. | ||
| // In production, we only have 1 instance of the reconciler running. | ||
| // Create a fresh reconciler that is NOT registered with the manager. | ||
| // It shares the same API client (thread-safe) but has its own | ||
| // consecutiveFailureSince field, so no data race with the manager's copy. | ||
| freshReconciler := &CloudConfigReconciler{ | ||
| ClusterOperatorStatusClient: ClusterOperatorStatusClient{ | ||
| Client: cl, | ||
| Clock: clocktesting.NewFakePassiveClock(time.Now()), | ||
| ManagedNamespace: targetNamespaceName, | ||
| }, | ||
| Scheme: scheme.Scheme, | ||
| FeatureGateAccess: featuregates.NewHardcodedFeatureGateAccessForTesting( | ||
| nil, []configv1.FeatureGateName{"AWSServiceLBNetworkSecurityGroup"}, nil, nil, | ||
| ), | ||
| } | ||
|
|
||
| request := reconcile.Request{NamespacedName: client.ObjectKey{Name: "foo", Namespace: "bar"}} | ||
| _, err := reconciler.Reconcile(ctx, request) | ||
| _, err := freshReconciler.Reconcile(ctx, request) | ||
| Expect(err).Should(Succeed()) | ||
|
|
||
| Expect(cl.Get(ctx, syncedConfigMapKey, syncedCloudConfigMap)).Should(Succeed()) | ||
|
|
@@ -509,7 +527,7 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| Expect(len(allCMs.Items)).To(BeEquivalentTo(1)) | ||
| }) | ||
|
|
||
| It("should error if a user-specified configmap key isn't present", func() { | ||
| It("should degrade immediately if a user-specified configmap key isn't present", func() { | ||
| infraResource := makeInfrastructureResource(configv1.AWSPlatformType) | ||
| infraResource.Spec.CloudConfig.Key = "notfound" | ||
| Expect(cl.Create(ctx, infraResource)).To(Succeed()) | ||
|
|
@@ -518,8 +536,19 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| Expect(cl.Status().Update(ctx, infraResource.DeepCopy())).To(Succeed()) | ||
|
|
||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).To(ContainSubstring("specified in infra resource does not exist in source configmap")) | ||
|
|
||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var degradedCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { | ||
| degradedCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(degradedCond).NotTo(BeNil()) | ||
| Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
|
|
||
| It("should continue with reconcile when feature gates are available", func() { | ||
|
|
@@ -584,15 +613,39 @@ var _ = Describe("Cloud config sync reconciler", func() { | |
| }) | ||
| }) | ||
|
|
||
| It("reconcile should fail if no infra resource found", func() { | ||
| It("reconcile should succeed and be available if no infra resource found", func() { | ||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).Should(BeEquivalentTo("infrastructures.config.openshift.io \"cluster\" not found")) | ||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var availCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerAvailableCondition { | ||
| availCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(availCond).NotTo(BeNil()) | ||
| Expect(availCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
|
|
||
| It("should fail if no PlatformStatus in infra resource presented ", func() { | ||
| It("should degrade immediately if no PlatformStatus in infra resource", func() { | ||
| infraResource := makeInfrastructureResource(configv1.AWSPlatformType) | ||
| Expect(cl.Create(ctx, infraResource)).To(Succeed()) | ||
| _, err := reconciler.Reconcile(context.TODO(), ctrl.Request{}) | ||
| Expect(err.Error()).Should(BeEquivalentTo("platformStatus is required")) | ||
| Expect(err).To(Succeed()) | ||
|
|
||
| co := &configv1.ClusterOperator{} | ||
| Expect(cl.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co)).To(Succeed()) | ||
| var degradedCond *configv1.ClusterOperatorStatusCondition | ||
| for i := range co.Status.Conditions { | ||
| if co.Status.Conditions[i].Type == cloudConfigControllerDegradedCondition { | ||
| degradedCond = &co.Status.Conditions[i] | ||
| break | ||
| } | ||
| } | ||
| Expect(degradedCond).NotTo(BeNil()) | ||
| Expect(degradedCond.Status).To(Equal(configv1.ConditionTrue)) | ||
| }) | ||
| }) | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: This won't work if permanentError is not the top-level error. errors.As() should be used instead.