@@ -27,12 +27,14 @@ import (
2727 corev1 "k8s.io/api/core/v1"
2828 apierrors "k8s.io/apimachinery/pkg/api/errors"
2929 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+ "k8s.io/apimachinery/pkg/labels"
3031 "k8s.io/apimachinery/pkg/runtime"
3132 "k8s.io/apimachinery/pkg/types"
3233 "k8s.io/client-go/util/workqueue"
3334 ctrl "sigs.k8s.io/controller-runtime"
3435 "sigs.k8s.io/controller-runtime/pkg/client"
3536 "sigs.k8s.io/controller-runtime/pkg/controller"
37+ "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3638 "sigs.k8s.io/controller-runtime/pkg/event"
3739 "sigs.k8s.io/controller-runtime/pkg/handler"
3840 "sigs.k8s.io/controller-runtime/pkg/log"
@@ -49,6 +51,11 @@ import (
4951 "github.com/NVIDIA/gpu-operator/internal/validator"
5052)
5153
// nvidiaDriverNodeLabelFinalizer is the finalizer added to NVIDIADriver
// resources so that the node labels written on their behalf can be removed
// before the resource is deleted.
const nvidiaDriverNodeLabelFinalizer = "nvidia.com/nvidiadriver-node-labels"

// managedByLabel is the node label key whose value is the name of the
// NVIDIADriver resource whose node selector matches the node.
const managedByLabel = "nvidia.com/gpu.driver.managed-by"
58+
5259// NVIDIADriverReconciler reconciles a NVIDIADriver object
5360type NVIDIADriverReconciler struct {
5461 client.Client
@@ -97,6 +104,16 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
97104 return reconcile.Result {}, wrappedErr
98105 }
99106
107+ // Handle deletion: cleanup labels and finalizer
108+ if ! instance .ObjectMeta .DeletionTimestamp .IsZero () {
109+ return r .reconcileDelete (ctx , instance )
110+ }
111+
112+ // Add finalizer if not present
113+ if ! controllerutil .ContainsFinalizer (instance , nvidiaDriverNodeLabelFinalizer ) {
114+ return r .addFinalizer (ctx , instance )
115+ }
116+
100117 // Get the singleton NVIDIA ClusterPolicy object in the cluster.
101118 clusterPolicyList := & gpuv1.ClusterPolicyList {}
102119 if err := r .List (ctx , clusterPolicyList ); err != nil {
@@ -151,6 +168,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
151168 return reconcile.Result {}, nil
152169 }
153170
171+ // Reconcile node labels
172+ if err := r .reconcileNodeLabels (ctx , instance ); err != nil {
173+ logger .Error (err , "failed to reconcile node labels" )
174+ return reconcile.Result {}, err
175+ }
176+
154177 if instance .Spec .UsePrecompiledDrivers () && (instance .Spec .IsGDSEnabled () || instance .Spec .IsGDRCopyEnabled ()) {
155178 err := errors .New ("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers" )
156179 logger .Error (err , "unsupported driver combination detected" )
@@ -220,6 +243,146 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
220243 return reconcile.Result {}, nil
221244}
222245
246+ // addFinalizer adds a finalizer to the NVIDIADriver resource
247+ func (r * NVIDIADriverReconciler ) addFinalizer (ctx context.Context , instance * nvidiav1alpha1.NVIDIADriver ) (reconcile.Result , error ) {
248+ logger := log .FromContext (ctx )
249+ logger .Info ("Adding finalizer to NVIDIADriver" )
250+ patch := client .MergeFrom (instance .DeepCopy ())
251+ controllerutil .AddFinalizer (instance , nvidiaDriverNodeLabelFinalizer )
252+ if err := r .Patch (ctx , instance , patch ); err != nil {
253+ return reconcile.Result {}, err
254+ }
255+ return reconcile.Result {}, nil
256+ }
257+
258+ // reconcileDelete handles the deletion of a NVIDIADriver resource
259+ // It ensures that any node labels managed by this NVIDIADriver are cleaned up
260+ func (r * NVIDIADriverReconciler ) reconcileDelete (ctx context.Context , instance * nvidiav1alpha1.NVIDIADriver ) (reconcile.Result , error ) {
261+ logger := log .FromContext (ctx )
262+
263+ if controllerutil .ContainsFinalizer (instance , nvidiaDriverNodeLabelFinalizer ) {
264+ logger .Info ("NVIDIADriver is being deleted, cleaning up node labels" )
265+
266+ // Remove node labels before deleting
267+ if err := r .cleanupNodeLabels (ctx , instance ); err != nil {
268+ logger .Error (err , "failed to cleanup node labels" )
269+ return reconcile.Result {}, err
270+ }
271+
272+ // Remove the finalizer
273+ patch := client .MergeFrom (instance .DeepCopy ())
274+ controllerutil .RemoveFinalizer (instance , nvidiaDriverNodeLabelFinalizer )
275+ if err := r .Patch (ctx , instance , patch ); err != nil {
276+ return reconcile.Result {}, err
277+ }
278+ logger .Info ("Finalizer removed, NVIDIADriver will be deleted" )
279+ }
280+ return reconcile.Result {}, nil
281+ }
282+
283+ // reconcileNodeLabels ensures that the node labels for the NVIDIADriver resource are correctly set
284+ func (r * NVIDIADriverReconciler ) reconcileNodeLabels (ctx context.Context , nvd * nvidiav1alpha1.NVIDIADriver ) error {
285+ logger := log .FromContext (ctx )
286+
287+ var nodes corev1.NodeList
288+ if err := r .List (ctx , & nodes ); err != nil {
289+ logger .Error (err , "failed to list nodes" )
290+ return err
291+ }
292+
293+ selector := labels .SelectorFromSet (nvd .Spec .NodeSelector )
294+
295+ for i := range nodes .Items {
296+ node := & nodes .Items [i ]
297+
298+ nodeLabels := node .GetLabels ()
299+ if nodeLabels == nil {
300+ nodeLabels = make (map [string ]string )
301+ }
302+
303+ matches := selector .Matches (labels .Set (nodeLabels ))
304+ current , exists := nodeLabels [managedByLabel ]
305+
306+ var desired * string
307+ if matches {
308+ desired = & nvd .Name
309+ }
310+
311+ // Only update if:
312+ // 1. We want to add/change label and it doesn't exist or is different, OR
313+ // 2. We want to remove label and it exists with OUR value
314+ needsUpdate :=
315+ (desired != nil && (! exists || current != * desired )) ||
316+ (desired == nil && exists && current == nvd .Name )
317+
318+ if ! needsUpdate {
319+ continue
320+ }
321+
322+ nodeCopy := node .DeepCopy ()
323+ newLabels := maps .Clone (nodeLabels )
324+
325+ if desired != nil {
326+ logger .Info ("Setting driver management node label" ,
327+ "node" , node .Name ,
328+ "label" , managedByLabel ,
329+ "desired" , * desired ,
330+ )
331+ newLabels [managedByLabel ] = * desired
332+ } else {
333+ logger .Info ("Removing driver management node label" ,
334+ "node" , node .Name ,
335+ "label" , managedByLabel ,
336+ )
337+ delete (newLabels , managedByLabel )
338+ }
339+
340+ node .SetLabels (newLabels )
341+ if err := r .Patch (ctx , node , client .MergeFrom (nodeCopy )); err != nil {
342+ logger .Error (err , "failed to update node label" , "node" , node .Name )
343+ return err
344+ }
345+ }
346+ return nil
347+ }
348+
349+ // cleanupNodeLabels removes the managed-by label from all nodes managed by the given NVIDIADriver
350+ func (r * NVIDIADriverReconciler ) cleanupNodeLabels (ctx context.Context , nvd * nvidiav1alpha1.NVIDIADriver ) error {
351+ logger := log .FromContext (ctx )
352+
353+ nodeList := & corev1.NodeList {}
354+ if err := r .List (ctx , nodeList ); err != nil {
355+ logger .Error (err , "failed to list nodes during cleanup" )
356+ return err
357+ }
358+
359+ for i := range nodeList .Items {
360+ node := & nodeList .Items [i ]
361+ nodeLabels := node .GetLabels ()
362+ if nodeLabels == nil {
363+ continue
364+ }
365+
366+ currentValue , hasLabel := nodeLabels [managedByLabel ]
367+ if hasLabel && currentValue == nvd .Name {
368+ logger .Info ("Removing driver management label from node during cleanup" , "node" , node .Name )
369+ nodeCopy := node .DeepCopy ()
370+ // Clone the labels map to avoid modifying the original
371+ newLabels := maps .Clone (nodeLabels )
372+ delete (newLabels , managedByLabel )
373+ node .SetLabels (newLabels )
374+ patch := client .MergeFrom (nodeCopy )
375+ if err := r .Patch (ctx , node , patch ); err != nil {
376+ logger .Error (err , "failed to remove label from node" , "node" , node .Name )
377+ return err
378+ }
379+ }
380+ }
381+
382+ logger .Info (fmt .Sprintf ("Successfully cleaned up %s node labels for NVIDIADriver %s" , managedByLabel , nvd .Name ))
383+ return nil
384+ }
385+
223386func (r * NVIDIADriverReconciler ) updateCrStatus (
224387 ctx context.Context , cr * nvidiav1alpha1.NVIDIADriver , status state.Results ) error {
225388 reqLogger := log .FromContext (ctx )
@@ -282,7 +445,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
282445 mgr .GetCache (),
283446 & nvidiav1alpha1.NVIDIADriver {},
284447 & handler.TypedEnqueueRequestForObject [* nvidiav1alpha1.NVIDIADriver ]{},
285- predicate.TypedGenerationChangedPredicate [* nvidiav1alpha1.NVIDIADriver ]{},
448+ predicate.TypedResourceVersionChangedPredicate [* nvidiav1alpha1.NVIDIADriver ]{},
286449 ),
287450 )
288451 if err != nil {
0 commit comments