Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@ spec:
- name: Alert on node deployment failure
rules:
- alert: GPUOperatorNodeDeploymentFailed
# There is no GPU exposed on the node,
# There is no GPU exposed on the node.
# When the device plugin is intentionally disabled in the ClusterPolicy
# (devicePlugin.enabled: false), the metric is set to -1, so this
# alert will not fire in that case.
expr: |
gpu_operator_node_device_plugin_devices_total == 0
for: 30m
Expand Down
10 changes: 9 additions & 1 deletion cmd/nvidia-validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,15 @@ func (nm *NodeMetrics) Run() error {
go nm.watchStatusFile(&nm.cudaReady, cudaStatusFile)

go nm.watchDriverValidation()
go nm.watchDevicePluginValidation()
if os.Getenv("DEVICE_PLUGIN_ENABLED") != "false" {
go nm.watchDevicePluginValidation()
} else {
// Set to -1 so the alert (expr: == 0) does not fire.
// The gauge is auto-registered by promauto and defaults to 0,
// which would be a false positive.
nm.deviceCount.Set(-1)
log.Info("metrics: DevicePlugin is disabled in ClusterPolicy, skipping device plugin validation")
}
go nm.watchNVIDIAPCI()

log.Printf("Running the metrics server, listening on :%d/metrics", nm.port)
Expand Down
8 changes: 8 additions & 0 deletions controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ const (
NvidiaDisableRequireEnvName = "NVIDIA_DISABLE_REQUIRE"
// GDSEnabledEnvName is the env name to enable GDS support with device-plugin
GDSEnabledEnvName = "GDS_ENABLED"
// DevicePluginEnabledEnvName indicates whether the device plugin is enabled in the ClusterPolicy
DevicePluginEnabledEnvName = "DEVICE_PLUGIN_ENABLED"
// MOFEDEnabledEnvName is the env name to enable MOFED devices injection with device-plugin
MOFEDEnabledEnvName = "MOFED_ENABLED"
// GDRCopyEnabledEnvName is the envvar that enables injection of the GDRCopy device node with the device-plugin
Expand Down Expand Up @@ -2450,6 +2452,12 @@ func TransformNodeStatusExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol
obj.Spec.Template.Spec.Containers[0].Args = config.NodeStatusExporter.Args
}

devicePluginEnabled := "true"
if !config.DevicePlugin.IsEnabled() {
devicePluginEnabled = "false"
}
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DevicePluginEnabledEnvName, devicePluginEnabled)

// set/append environment variables for exporter container
if len(config.NodeStatusExporter.Env) > 0 {
for _, env := range config.NodeStatusExporter.Env {
Expand Down
29 changes: 29 additions & 0 deletions controllers/transforms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2891,6 +2891,35 @@ func TestTransformNodeStatusExporter(t *testing.T) {
Name: "dummy",
Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: DevicePluginEnabledEnvName, Value: "true"},
},
SecurityContext: &corev1.SecurityContext{
RunAsUser: rootUID,
},
}),
},
{
description: "node status exporter with device plugin disabled",
ds: NewDaemonset().
WithContainer(corev1.Container{Name: "dummy"}),
cpSpec: &gpuv1.ClusterPolicySpec{
NodeStatusExporter: gpuv1.NodeStatusExporterSpec{
Repository: "nvcr.io/nvidia/cloud-native",
Image: "node-status-exporter",
Version: "v1.0.0",
ImagePullPolicy: "IfNotPresent",
},
DevicePlugin: gpuv1.DevicePluginSpec{Enabled: newBoolPtr(false)},
},
expectedDs: NewDaemonset().
WithContainer(corev1.Container{
Name: "dummy",
Image: "nvcr.io/nvidia/cloud-native/node-status-exporter:v1.0.0",
ImagePullPolicy: corev1.PullIfNotPresent,
Env: []corev1.EnvVar{
{Name: DevicePluginEnabledEnvName, Value: "false"},
},
SecurityContext: &corev1.SecurityContext{
RunAsUser: rootUID,
},
Expand Down
12 changes: 12 additions & 0 deletions tests/e2e/helpers/clusterpolicy.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ func (h *ClusterPolicyClient) DisableGFD(ctx context.Context, name string) error
})
}

// EnableDevicePlugin sets Spec.DevicePlugin.Enabled to true on the named
// ClusterPolicy through h.modify and returns the error modify reports.
func (h *ClusterPolicyClient) EnableDevicePlugin(ctx context.Context, name string) error {
	turnOn := func(cp *nvidiav1.ClusterPolicy) {
		// Fresh bool per invocation so the stored pointer is never shared.
		enabled := true
		cp.Spec.DevicePlugin.Enabled = &enabled
	}
	return h.modify(ctx, name, turnOn)
}

// DisableDevicePlugin sets Spec.DevicePlugin.Enabled to false on the named
// ClusterPolicy through h.modify and returns the error modify reports.
func (h *ClusterPolicyClient) DisableDevicePlugin(ctx context.Context, name string) error {
	turnOff := func(cp *nvidiav1.ClusterPolicy) {
		// Fresh bool per invocation so the stored pointer is never shared.
		enabled := false
		cp.Spec.DevicePlugin.Enabled = &enabled
	}
	return h.modify(ctx, name, turnOff)
}

func (h *ClusterPolicyClient) SetMIGStrategy(ctx context.Context, name, strategy string) error {
return h.modify(ctx, name, func(clusterPolicy *nvidiav1.ClusterPolicy) {
clusterPolicy.Spec.MIG.Strategy = nvidiav1.MIGStrategy(strategy)
Expand Down
37 changes: 37 additions & 0 deletions tests/e2e/suites/clusterpolicy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,43 @@ var _ = Describe("ClusterPolicy Management", Label("clusterPolicy"), func() {
})
})

// test_device_plugin_disabled_env - Verify DEVICE_PLUGIN_ENABLED env var propagation
When("Disabling device plugin", Label("device-plugin", "toggle"), func() {
It("should set DEVICE_PLUGIN_ENABLED=false on node-status-exporter when device plugin is disabled", func(ctx context.Context) {
clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName)
originalState := clusterPolicy.Spec.DevicePlugin.Enabled
DeferCleanup(func(ctx context.Context) {
if originalState == nil || *originalState {
_ = clusterPolicyClient.EnableDevicePlugin(ctx, policyName)
waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset")
}
})

err := clusterPolicyClient.DisableDevicePlugin(ctx, policyName)
Expect(err).NotTo(HaveOccurred(), "Failed to disable device plugin in ClusterPolicy")

verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace,
"nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "false")
})

It("should set DEVICE_PLUGIN_ENABLED=true on node-status-exporter when device plugin is re-enabled", func(ctx context.Context) {
clusterPolicy := getClusterPolicyOrSkip(ctx, clusterPolicyClient, policyName)
originalState := clusterPolicy.Spec.DevicePlugin.Enabled
DeferCleanup(func(ctx context.Context) {
if originalState != nil && !*originalState {
_ = clusterPolicyClient.DisableDevicePlugin(ctx, policyName)
}
})

err := clusterPolicyClient.EnableDevicePlugin(ctx, policyName)
Expect(err).NotTo(HaveOccurred(), "Failed to enable device plugin in ClusterPolicy")

verifyEnvInDaemonSet(ctx, daemonSetClient, testNamespace,
"nvidia-node-status-exporter", "DEVICE_PLUGIN_ENABLED", "true")
waitForDaemonSetReady(ctx, daemonSetClient, testNamespace, "nvidia-device-plugin-daemonset")
})
})

// test_custom_labels_override - Test custom labels on daemonsets
When("Updating daemonset custom labels", Label("labels", "config"), func() {
It("should apply custom labels to all operand pods", func(ctx context.Context) {
Expand Down