Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ const (
defaultMetricsPort = 0
// hostDevCharPath indicates the path in the container where the host '/dev/char' directory is mounted to
hostDevCharPath = "/host-dev-char"
// nvidiaModuleRefcntPath is the path to check if the nvidia kernel module is loaded
nvidiaModuleRefcntPath = "/sys/module/nvidia/refcnt"
// defaultDriverInstallDir indicates the default path on the host where the driver container installation is made available
defaultDriverInstallDir = "/run/nvidia/driver"
// defaultDriverInstallDirCtrPath indicates the default path where the NVIDIA driver install dir is mounted in the container
Expand Down Expand Up @@ -916,6 +918,20 @@ func (d *Driver) createStatusFile(driverInfo driverInfo) error {
return createStatusFileWithContent(outputDirFlag+"/"+driverStatusFile, statusFileContent)
}

// isNvidiaModuleLoaded reports whether the NVIDIA kernel module is already
// loaded in kernel memory.
//
// The check reads nvidiaModuleRefcntPath. A successful read means the module
// is considered loaded: the trimmed file content is logged as the refcnt and
// true is returned. Any read failure yields false. A missing file is the
// expected "not loaded" case and is not logged; every other error is logged
// as a warning so that it is not silently swallowed.
func isNvidiaModuleLoaded() bool {
	// Read the file directly instead of Stat-then-Read: a single call avoids
	// a redundant syscall and the window between the existence check and the
	// read.
	refcntData, err := os.ReadFile(nvidiaModuleRefcntPath)
	if err != nil {
		if !os.IsNotExist(err) {
			log.Warnf("unable to read %s: %v", nvidiaModuleRefcntPath, err)
		}
		return false
	}

	refcnt := strings.TrimSpace(string(refcntData))
	log.Infof("NVIDIA kernel module already loaded in kernel memory (refcnt=%s)", refcnt)
	return true
}

// createDevCharSymlinks creates symlinks in /host-dev-char that point to all possible NVIDIA devices nodes.
func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation bool) error {
if disableDevCharSymlinkCreation {
Expand All @@ -926,8 +942,16 @@ func createDevCharSymlinks(driverInfo driverInfo, disableDevCharSymlinkCreation

log.Info("creating symlinks under /dev/char that correspond to NVIDIA character devices")

// Only attempt to load NVIDIA kernel modules when we can chroot into driverRoot
loadKernelModules := driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot)
// Check if NVIDIA module is already loaded in kernel memory.
// If it is, we don't need to run modprobe (which would fail if modules aren't in /lib/modules/).
// This handles the case where the driver container performed a userspace-only install
// after detecting that the module was already loaded from a previous boot.
moduleAlreadyLoaded := isNvidiaModuleLoaded()

// Only attempt to load NVIDIA kernel modules when:
// 1. Module is not already loaded in kernel memory, AND
// 2. We can chroot into driverRoot to run modprobe
loadKernelModules := !moduleAlreadyLoaded && (driverInfo.isHostDriver || (driverInfo.devRoot == driverInfo.driverRoot))

// driverRootCtrPath is the path of the driver install dir in the container. This will either be
// driverInstallDirCtrPathFlag or '/host'.
Expand Down
26 changes: 26 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,32 @@ func TransformDriver(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n C
return fmt.Errorf("ERROR: failed to transform the pre-compiled Driver Daemonset: %s", err)
}
}

// Compute driver configuration digest after all transformations are complete.
// This digest enables fast-path driver installation by detecting when configuration
// hasn't changed, avoiding unnecessary driver reinstalls and pod evictions.
// Used by k8s-driver-manager to decide if driver cleanup is needed and by
// nvidia-driver container to skip full reinstall for matching configurations.
configDigest := utils.GetObjectHash(obj.Spec)

// Set the computed digest in driver-manager initContainer
driverManagerContainer := findContainerByName(obj.Spec.Template.Spec.InitContainers, "k8s-driver-manager")
if driverManagerContainer != nil {
setContainerEnv(driverManagerContainer, "DRIVER_CONFIG_DIGEST", configDigest)
}

// Set the computed digest in nvidia-driver container
driverContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "nvidia-driver-ctr")
if driverContainer != nil {
setContainerEnv(driverContainer, "DRIVER_CONFIG_DIGEST", configDigest)
}

// Used by dtk-build-driver to determine if fast path should be used (skip rebuild)
driverToolkitContainer := findContainerByName(obj.Spec.Template.Spec.Containers, "openshift-driver-toolkit-ctr")
if driverToolkitContainer != nil {
setContainerEnv(driverToolkitContainer, "DRIVER_CONFIG_DIGEST", configDigest)
}

return nil
}

Expand Down
30 changes: 30 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,23 @@ func newBoolPtr(b bool) *bool {
return boolPtr
}

// removeDigestFromDaemonSet strips every DRIVER_CONFIG_DIGEST environment
// variable from the containers and init containers of the DaemonSet's pod
// template, modifying ds in place.
//
// Each container's Env is rebuilt from a nil slice, so a container left with
// no variables ends up with a nil Env rather than an empty one.
func removeDigestFromDaemonSet(ds *appsv1.DaemonSet) {
	const digestEnvName = "DRIVER_CONFIG_DIGEST"

	podSpec := &ds.Spec.Template.Spec
	// Regular containers are processed first, then init containers.
	for _, group := range [][]corev1.Container{podSpec.Containers, podSpec.InitContainers} {
		for idx := range group {
			ctr := &group[idx]
			var kept []corev1.EnvVar
			for _, envVar := range ctr.Env {
				if envVar.Name == digestEnvName {
					continue
				}
				kept = append(kept, envVar)
			}
			ctr.Env = kept
		}
	}
}

func TestTransformDriverManagerInitContainer(t *testing.T) {
testCases := []struct {
description string
Expand Down Expand Up @@ -2783,6 +2800,9 @@ func TestTransformDriver(t *testing.T) {
return
}
require.NoError(t, err)

// Remove dynamically generated digest before comparison
removeDigestFromDaemonSet(tc.ds.DaemonSet)
require.EqualValues(t, tc.expectedDs, tc.ds)
})
}
Expand Down Expand Up @@ -3149,6 +3169,9 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) {
return
}
require.NoError(t, err)

// Remove dynamically generated digest before comparison
removeDigestFromDaemonSet(tc.ds.DaemonSet)
require.EqualValues(t, tc.expectedDs, tc.ds)
})
}
Expand Down Expand Up @@ -3268,6 +3291,9 @@ func TestTransformDriverWithResources(t *testing.T) {
return
}
require.NoError(t, err)

// Remove dynamically generated digest before comparison
removeDigestFromDaemonSet(tc.ds.DaemonSet)
require.EqualValues(t, tc.expectedDs, tc.ds)
})
}
Expand Down Expand Up @@ -3348,6 +3374,9 @@ func TestTransformDriverRDMA(t *testing.T) {
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")})
require.NoError(t, err)

// Remove dynamically generated digest before comparison
removeDigestFromDaemonSet(ds.DaemonSet)
require.EqualValues(t, expectedDs, ds)
}

Expand Down Expand Up @@ -3416,5 +3445,6 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) {
ClusterPolicyController{client: mockClient, runtime: gpuv1.Containerd,
operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")})
require.NoError(t, err)
removeDigestFromDaemonSet(ds.DaemonSet)
require.EqualValues(t, expectedDs, ds)
}
3 changes: 3 additions & 0 deletions internal/render/render.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ import (
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
yamlDecoder "k8s.io/apimachinery/pkg/util/yaml"
yamlConverter "sigs.k8s.io/yaml"

"github.com/NVIDIA/gpu-operator/internal/utils"
)

const (
Expand Down Expand Up @@ -109,6 +111,7 @@ func (r *textTemplateRenderer) renderFile(filePath string, data *TemplatingData)
}
return *b
},
"getObjectHash": utils.GetObjectHash,
})

if data.Funcs != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "3041669332"
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
imagePullPolicy: IfNotPresent
lifecycle:
Expand Down Expand Up @@ -283,6 +285,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "3041669332"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-full-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "185029761"
- name: KERNEL_MODULE_TYPE
value: open
- name: OPEN_KERNEL_MODULES_ENABLED
Expand Down Expand Up @@ -292,6 +294,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "185029761"
- name: FOO
value: foo
- name: BAR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "2675269488"
- name: GDRCOPY_ENABLED
value: "true"
- name: OPENSHIFT_VERSION
Expand Down Expand Up @@ -392,6 +394,8 @@ spec:
value: 413.92.202304252344-0
- name: NVIDIA_VISIBLE_DEVICES
value: void
- name: DRIVER_CONFIG_DIGEST
value: "2675269488"
- name: GDRCOPY_ENABLED
value: "true"
image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:7fecaebc1d51b28bc3548171907e4d91823a031d7a6a694ab686999be2b4d867
Expand Down Expand Up @@ -454,6 +458,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "2675269488"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-gdrcopy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "1592674923"
- name: GDRCOPY_ENABLED
value: "true"
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
Expand Down Expand Up @@ -337,6 +339,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "1592674923"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-gds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "2626550354"
- name: GDS_ENABLED
value: "true"
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
Expand Down Expand Up @@ -337,6 +339,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "2626550354"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-minimal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "4164928953"
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
imagePullPolicy: IfNotPresent
lifecycle:
Expand Down Expand Up @@ -274,6 +276,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "4164928953"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "1738122983"
- name: OPENSHIFT_VERSION
value: "4.13"
- name: HTTP_PROXY
Expand Down Expand Up @@ -332,6 +334,8 @@ spec:
value: 413.92.202304252344-0
- name: NVIDIA_VISIBLE_DEVICES
value: void
- name: DRIVER_CONFIG_DIGEST
value: "1738122983"
image: quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:7fecaebc1d51b28bc3548171907e4d91823a031d7a6a694ab686999be2b4d867
imagePullPolicy: IfNotPresent
name: openshift-driver-toolkit-ctr
Expand Down Expand Up @@ -390,6 +394,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "1738122983"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "2677257823"
image: nvcr.io/nvidia/driver:535-5.4.0-150-generic-ubuntu22.04
imagePullPolicy: IfNotPresent
lifecycle:
Expand Down Expand Up @@ -276,6 +278,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "2677257823"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-rdma-hostmofed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "193112138"
- name: GPU_DIRECT_RDMA_ENABLED
value: "true"
- name: USE_HOST_MOFED
Expand Down Expand Up @@ -353,6 +355,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "193112138"
- name: GPU_DIRECT_RDMA_ENABLED
value: "true"
- name: USE_HOST_MOFED
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-rdma.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "1255584491"
- name: GPU_DIRECT_RDMA_ENABLED
value: "true"
image: nvcr.io/nvidia/driver:525.85.03-ubuntu22.04
Expand Down Expand Up @@ -349,6 +351,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "1255584491"
- name: GPU_DIRECT_RDMA_ENABLED
value: "true"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
Expand Down
4 changes: 4 additions & 0 deletions internal/state/testdata/golden/driver-secret-env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: DRIVER_CONFIG_DIGEST
value: "2171680191"
- name: GDS_ENABLED
value: "true"
- name: GDRCOPY_ENABLED
Expand Down Expand Up @@ -369,6 +371,8 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: DRIVER_CONFIG_DIGEST
value: "2171680191"
image: nvcr.io/nvidia/cloud-native/k8s-driver-manager:devel
imagePullPolicy: IfNotPresent
name: k8s-driver-manager
Expand Down
Loading