Skip to content

Commit fc2a5c8

Browse files
committed
use managed-by label with nvidiadriver cr
Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
1 parent 23e5272 commit fc2a5c8

File tree

4 files changed

+167
-4
lines changed

4 files changed

+167
-4
lines changed

controllers/nvidiadriver_controller.go

Lines changed: 164 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,14 @@ import (
2727
corev1 "k8s.io/api/core/v1"
2828
apierrors "k8s.io/apimachinery/pkg/api/errors"
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30+
"k8s.io/apimachinery/pkg/labels"
3031
"k8s.io/apimachinery/pkg/runtime"
3132
"k8s.io/apimachinery/pkg/types"
3233
"k8s.io/client-go/util/workqueue"
3334
ctrl "sigs.k8s.io/controller-runtime"
3435
"sigs.k8s.io/controller-runtime/pkg/client"
3536
"sigs.k8s.io/controller-runtime/pkg/controller"
37+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3638
"sigs.k8s.io/controller-runtime/pkg/event"
3739
"sigs.k8s.io/controller-runtime/pkg/handler"
3840
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -49,6 +51,11 @@ import (
4951
"github.com/NVIDIA/gpu-operator/internal/validator"
5052
)
5153

54+
// Identifiers used for node ownership tracking by the NVIDIADriver controller:
//   - nvidiaDriverNodeLabelFinalizer is the finalizer placed on NVIDIADriver
//     objects (see addFinalizer) and released by reconcileDelete once the node
//     labels have been cleaned up.
//   - managedByLabel is the node label key whose value is set to the name of
//     the NVIDIADriver whose node selector matches the node (see
//     reconcileNodeLabels).
const (
55+
nvidiaDriverNodeLabelFinalizer = "nvidia.com/nvidiadriver-node-labels"
56+
managedByLabel = "nvidia.com/gpu.driver.managed-by"
57+
)
58+
5259
// NVIDIADriverReconciler reconciles a NVIDIADriver object
5360
type NVIDIADriverReconciler struct {
5461
client.Client
@@ -97,6 +104,16 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
97104
return reconcile.Result{}, wrappedErr
98105
}
99106

107+
// Handle deletion: cleanup labels and finalizer
108+
if !instance.ObjectMeta.DeletionTimestamp.IsZero() {
109+
return r.reconcileDelete(ctx, instance)
110+
}
111+
112+
// Add finalizer if not present
113+
if !controllerutil.ContainsFinalizer(instance, nvidiaDriverNodeLabelFinalizer) {
114+
return r.addFinalizer(ctx, instance)
115+
}
116+
100117
// Get the singleton NVIDIA ClusterPolicy object in the cluster.
101118
clusterPolicyList := &gpuv1.ClusterPolicyList{}
102119
if err := r.List(ctx, clusterPolicyList); err != nil {
@@ -151,6 +168,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
151168
return reconcile.Result{}, nil
152169
}
153170

171+
// Reconcile node labels
172+
if err := r.reconcileNodeLabels(ctx, instance); err != nil {
173+
logger.Error(err, "failed to reconcile node labels")
174+
return reconcile.Result{}, err
175+
}
176+
154177
if instance.Spec.UsePrecompiledDrivers() && (instance.Spec.IsGDSEnabled() || instance.Spec.IsGDRCopyEnabled()) {
155178
err := errors.New("GPUDirect Storage driver (nvidia-fs) and/or GDRCopy driver is not supported along with pre-compiled NVIDIA drivers")
156179
logger.Error(err, "unsupported driver combination detected")
@@ -220,6 +243,146 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
220243
return reconcile.Result{}, nil
221244
}
222245

246+
// addFinalizer adds a finalizer to the NVIDIADriver resource
247+
func (r *NVIDIADriverReconciler) addFinalizer(ctx context.Context, instance *nvidiav1alpha1.NVIDIADriver) (reconcile.Result, error) {
248+
logger := log.FromContext(ctx)
249+
logger.Info("Adding finalizer to NVIDIADriver")
250+
patch := client.MergeFrom(instance.DeepCopy())
251+
controllerutil.AddFinalizer(instance, nvidiaDriverNodeLabelFinalizer)
252+
if err := r.Patch(ctx, instance, patch); err != nil {
253+
return reconcile.Result{}, err
254+
}
255+
return reconcile.Result{}, nil
256+
}
257+
258+
// reconcileDelete handles the deletion of a NVIDIADriver resource
259+
// It ensures that any node labels managed by this NVIDIADriver are cleaned up
260+
func (r *NVIDIADriverReconciler) reconcileDelete(ctx context.Context, instance *nvidiav1alpha1.NVIDIADriver) (reconcile.Result, error) {
261+
logger := log.FromContext(ctx)
262+
263+
if controllerutil.ContainsFinalizer(instance, nvidiaDriverNodeLabelFinalizer) {
264+
logger.Info("NVIDIADriver is being deleted, cleaning up node labels")
265+
266+
// Remove node labels before deleting
267+
if err := r.cleanupNodeLabels(ctx, instance); err != nil {
268+
logger.Error(err, "failed to cleanup node labels")
269+
return reconcile.Result{}, err
270+
}
271+
272+
// Remove the finalizer
273+
patch := client.MergeFrom(instance.DeepCopy())
274+
controllerutil.RemoveFinalizer(instance, nvidiaDriverNodeLabelFinalizer)
275+
if err := r.Patch(ctx, instance, patch); err != nil {
276+
return reconcile.Result{}, err
277+
}
278+
logger.Info("Finalizer removed, NVIDIADriver will be deleted")
279+
}
280+
return reconcile.Result{}, nil
281+
}
282+
283+
// reconcileNodeLabels ensures that the node labels for the NVIDIADriver resource are correctly set
284+
func (r *NVIDIADriverReconciler) reconcileNodeLabels(ctx context.Context, nvd *nvidiav1alpha1.NVIDIADriver) error {
285+
logger := log.FromContext(ctx)
286+
287+
var nodes corev1.NodeList
288+
if err := r.List(ctx, &nodes); err != nil {
289+
logger.Error(err, "failed to list nodes")
290+
return err
291+
}
292+
293+
selector := labels.SelectorFromSet(nvd.Spec.NodeSelector)
294+
295+
for i := range nodes.Items {
296+
node := &nodes.Items[i]
297+
298+
nodeLabels := node.GetLabels()
299+
if nodeLabels == nil {
300+
nodeLabels = make(map[string]string)
301+
}
302+
303+
matches := selector.Matches(labels.Set(nodeLabels))
304+
current, exists := nodeLabels[managedByLabel]
305+
306+
var desired *string
307+
if matches {
308+
desired = &nvd.Name
309+
}
310+
311+
// Only update if:
312+
// 1. We want to add/change label and it doesn't exist or is different, OR
313+
// 2. We want to remove label and it exists with OUR value
314+
needsUpdate :=
315+
(desired != nil && (!exists || current != *desired)) ||
316+
(desired == nil && exists && current == nvd.Name)
317+
318+
if !needsUpdate {
319+
continue
320+
}
321+
322+
nodeCopy := node.DeepCopy()
323+
newLabels := maps.Clone(nodeLabels)
324+
325+
if desired != nil {
326+
logger.Info("Setting driver management node label",
327+
"node", node.Name,
328+
"label", managedByLabel,
329+
"desired", *desired,
330+
)
331+
newLabels[managedByLabel] = *desired
332+
} else {
333+
logger.Info("Removing driver management node label",
334+
"node", node.Name,
335+
"label", managedByLabel,
336+
)
337+
delete(newLabels, managedByLabel)
338+
}
339+
340+
node.SetLabels(newLabels)
341+
if err := r.Patch(ctx, node, client.MergeFrom(nodeCopy)); err != nil {
342+
logger.Error(err, "failed to update node label", "node", node.Name)
343+
return err
344+
}
345+
}
346+
return nil
347+
}
348+
349+
// cleanupNodeLabels removes the managed-by label from all nodes managed by the given NVIDIADriver
350+
func (r *NVIDIADriverReconciler) cleanupNodeLabels(ctx context.Context, nvd *nvidiav1alpha1.NVIDIADriver) error {
351+
logger := log.FromContext(ctx)
352+
353+
nodeList := &corev1.NodeList{}
354+
if err := r.List(ctx, nodeList); err != nil {
355+
logger.Error(err, "failed to list nodes during cleanup")
356+
return err
357+
}
358+
359+
for i := range nodeList.Items {
360+
node := &nodeList.Items[i]
361+
nodeLabels := node.GetLabels()
362+
if nodeLabels == nil {
363+
continue
364+
}
365+
366+
currentValue, hasLabel := nodeLabels[managedByLabel]
367+
if hasLabel && currentValue == nvd.Name {
368+
logger.Info("Removing driver management label from node during cleanup", "node", node.Name)
369+
nodeCopy := node.DeepCopy()
370+
// Clone the labels map to avoid modifying the original
371+
newLabels := maps.Clone(nodeLabels)
372+
delete(newLabels, managedByLabel)
373+
node.SetLabels(newLabels)
374+
patch := client.MergeFrom(nodeCopy)
375+
if err := r.Patch(ctx, node, patch); err != nil {
376+
logger.Error(err, "failed to remove label from node", "node", node.Name)
377+
return err
378+
}
379+
}
380+
}
381+
382+
logger.Info(fmt.Sprintf("Successfully cleaned up %s node labels for NVIDIADriver %s", managedByLabel, nvd.Name))
383+
return nil
384+
}
385+
223386
func (r *NVIDIADriverReconciler) updateCrStatus(
224387
ctx context.Context, cr *nvidiav1alpha1.NVIDIADriver, status state.Results) error {
225388
reqLogger := log.FromContext(ctx)
@@ -282,7 +445,7 @@ func (r *NVIDIADriverReconciler) SetupWithManager(ctx context.Context, mgr ctrl.
282445
mgr.GetCache(),
283446
&nvidiav1alpha1.NVIDIADriver{},
284447
&handler.TypedEnqueueRequestForObject[*nvidiav1alpha1.NVIDIADriver]{},
285-
predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{},
448+
predicate.TypedResourceVersionChangedPredicate[*nvidiav1alpha1.NVIDIADriver]{},
286449
),
287450
)
288451
if err != nil {

internal/state/driver.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,7 @@ func getDriverSpec(cr *nvidiav1alpha1.NVIDIADriver, nodePool nodePool) (*driverS
573573
Spec: spec,
574574
AppName: nvidiaDriverAppName,
575575
Name: nvidiaDriverName,
576+
CRName: cr.Name,
576577
ImagePath: imagePath,
577578
ManagerImagePath: managerImagePath,
578579
OSVersion: nodePool.getOS(),

internal/state/types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type driverSpec struct {
3232
Spec *nvidiav1alpha1.NVIDIADriverSpec
3333
AppName string
3434
Name string
35+
CRName string
3536
ImagePath string
3637
ManagerImagePath string
3738
OSVersion string

manifests/state-driver/0500_daemonset.yaml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,14 +63,12 @@ spec:
6363
terminationGracePeriodSeconds: 120
6464
{{- end }}
6565
nodeSelector:
66+
# Quoted so CR names such as "123", "true" or "null" stay YAML strings; nodeSelector values must be strings.
nvidia.com/gpu.driver.managed-by: {{ .Driver.CRName | quote }}
6667
{{- if eq .Driver.Spec.DriverType "vgpu-host-manager" }}
6768
nvidia.com/gpu.deploy.vgpu-manager: "true"
6869
{{- else }}
6970
nvidia.com/gpu.deploy.driver: "true"
7071
{{- end }}
71-
{{- if .Driver.Spec.NodeSelector }}
72-
{{- .Driver.Spec.NodeSelector | yaml | nindent 8 }}
73-
{{- end }}
7472
{{- if and (.Openshift) (.Runtime.OpenshiftDriverToolkitEnabled) }}
7573
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: {{ .Openshift.RHCOSVersion | quote }}
7674
{{- end }}

0 commit comments

Comments
 (0)