Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,25 @@ KMM_BUILDER_IMG ?= gcr.io/kaniko-project/executor:v1.23.2
KMM_WEBHOOK_IMG_NAME ?= $(DOCKER_REGISTRY)/kernel-module-management-webhook-server
KMM_OPERATOR_IMG_NAME ?= $(DOCKER_REGISTRY)/kernel-module-management-operator

# Operand related images
#
# Image tags. `?=` only assigns when the variable is not already set, so each
# tag can be supplied by the caller; otherwise it falls back to "latest".
EXPORTER_IMAGE_TAG ?= latest
DEVICE_CONFIG_MANAGER_IMAGE_TAG ?= latest
TEST_RUNNER_IMAGE_TAG ?= latest
UTILS_IMAGE_TAG ?= latest

# Repository name of the utils image; defaults to IMAGE_NAME with a "-utils" suffix.
UTILS_IMAGE_NAME ?= $(IMAGE_NAME)-utils

# Full image references, composed as <registry>/<name>:<tag>.
# All of these are recursively expanded, so they pick up the final values of
# DOCKER_REGISTRY and the tag variables at the point of use.
METRICS_EXPORTER_IMG = $(DOCKER_REGISTRY)/device-metrics-exporter:$(EXPORTER_IMAGE_TAG)
DEVICE_CONFIG_MANAGER_IMG = $(DOCKER_REGISTRY)/device-config-manager:$(DEVICE_CONFIG_MANAGER_IMAGE_TAG)
TEST_RUNNER_IMG = $(DOCKER_REGISTRY)/test-runner:$(TEST_RUNNER_IMAGE_TAG)
UTILS_IMG ?= $(DOCKER_REGISTRY)/$(UTILS_IMAGE_NAME):$(UTILS_IMAGE_TAG)

#######################
# Helm Charts variables
#
# Each variable below is a whitespace-separated list of file names. The lists
# are written one entry per line; make joins the continuation lines with a
# single space, so the resulting values are plain space-separated lists.

# YAML manifests, samples and kustomization files.
YAML_FILES = bundle/manifests/amd-gpu-operator-node-metrics_rbac.authorization.k8s.io_v1_rolebinding.yaml \
	bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml \
	bundle/manifests/amd-gpu-operator-node-labeller_rbac.authorization.k8s.io_v1_clusterrolebinding.yaml \
	bundle/manifests/amd-gpu-operator-node-metrics_monitoring.coreos.com_v1_servicemonitor.yaml \
	config/samples/amd.com_deviceconfigs.yaml \
	config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml \
	example/deviceconfig_example.yaml \
	config/default/kustomization.yaml

# CRD file names of the operator itself.
CRD_YAML_FILES = deviceconfig-crd.yaml \
	remediationworkflowstatus-crd.yaml

# KMM CRD file names for the Kubernetes and OpenShift flavours.
K8S_KMM_CRD_YAML_FILES = module-crd.yaml \
	nodemodulesconfig-crd.yaml
OPENSHIFT_KMM_CRD_YAML_FILES = module-crd.yaml \
	nodemodulesconfig-crd.yaml

# NFD CRD file names for OpenShift clusters.
OPENSHIFT_CLUSTER_NFD_CRD_YAML_FILES = nodefeature-crd.yaml \
	nodefeaturediscovery-crd.yaml \
	nodefeaturerule-crd.yaml

# Helm values files whose operand image fields are rewritten with yq by the
# update-registry recipe.
DEFAULT_VALUES_FILES = helm-charts-k8s/values.yaml \
	helm-charts-openshift/values.yaml \
	hack/k8s-patch/metadata-patch/values.yaml \
	hack/openshift-patch/metadata-patch/values.yaml

ifdef OPENSHIFT
$(info selected openshift)
Expand Down Expand Up @@ -199,6 +211,13 @@ update-registry:
hack/k8s-patch/metadata-patch/values.yaml helm-charts-k8s/values.yaml \
hack/openshift-patch/metadata-patch/values.yaml helm-charts-openshift/values.yaml \
example/deviceconfig_example.yaml
# update operands image tags
@for file in $(DEFAULT_VALUES_FILES); do \
yq eval -i '.deviceConfig.spec.metricsExporter.image = "$(METRICS_EXPORTER_IMG)"' $$file; \
yq eval -i '.deviceConfig.spec.configManager.image = "$(DEVICE_CONFIG_MANAGER_IMG)"' $$file; \
yq eval -i '.deviceConfig.spec.testRunner.image = "$(TEST_RUNNER_IMG)"' $$file; \
yq eval -i '.deviceConfig.spec.commonConfig.utilsContainer.image = "$(UTILS_IMG)"' $$file; \
done
sed -i -e 's|tag:.*$$|tag: ${KMM_IMAGE_TAG}|' \
-e 's|repository:.*operator.*$$|repository: ${KMM_OPERATOR_IMG_NAME}|' \
-e 's|repository:.*webhook.*$$|repository: ${KMM_WEBHOOK_IMG_NAME}|' \
Expand Down Expand Up @@ -298,6 +317,18 @@ docker-push: ## Push docker image with the manager.
# Stream the image $(IMG) out of the local image store, compress it with gzip
# and write the result to $(DOCKER_CONTAINER_IMG).tar.gz.
docker-save: ## Save the container image with the manager.
	docker save $(IMG) \
		| gzip > $(DOCKER_CONTAINER_IMG).tar.gz

.PHONY: docker-build-utils
# Build the utils container image from internal/utils_container/Dockerfile,
# using the current directory as build context. The image is tagged
# $(UTILS_IMG) and labelled with HOURLY_TAG=$(HOURLY_TAG_LABEL).
docker-build-utils: ## Build docker image for utils container.
	DOCKER_BUILDKIT=1 docker build \
		-t $(UTILS_IMG) \
		--label HOURLY_TAG=$(HOURLY_TAG_LABEL) \
		-f internal/utils_container/Dockerfile \
		.

.PHONY: docker-push-utils
# Push the image tagged $(UTILS_IMG); the registry is whatever that reference
# names.
docker-push-utils: ## Push docker image for utils container.
	docker push $(UTILS_IMG)

.PHONY: docker-save-utils
# Stream the utils image $(UTILS_IMG) out of the local image store, compress it
# with gzip and write it to $(UTILS_IMAGE_NAME)-$(IMAGE_TAG).tar.gz.
#
# The archive name is derived from UTILS_IMAGE_NAME rather than a hard-coded
# "$(IMAGE_NAME)-utils", so that overriding UTILS_IMAGE_NAME cannot produce an
# archive named after a different image than the one actually saved. With the
# default UTILS_IMAGE_NAME ("$(IMAGE_NAME)-utils") the file name is unchanged.
#
# NOTE(review): the file name still uses IMAGE_TAG while the saved image is
# tagged with UTILS_IMAGE_TAG; confirm whether that is intentional before
# switching the suffix, since doing so would change the default file name.
docker-save-utils: ## Save the utils container image as tar.gz.
	docker save $(UTILS_IMG) | gzip > $(UTILS_IMAGE_NAME)-$(IMAGE_TAG).tar.gz

.PHONY: docker-build-env
docker-build-env: ## Build the docker shell container.
@echo "Building the Docker environment..."
Expand Down
8 changes: 4 additions & 4 deletions hack/k8s-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ deviceConfig:
initContainerImage: busybox:1.36
utilsContainer:
# -- gpu operator utility container image
image: docker.io/rocm/gpu-operator-utils:v1.4.0
image: docker.io/rocm/gpu-operator-utils:latest
# -- utility container image pull policy
imagePullPolicy: IfNotPresent
# -- utility container image pull secret, e.g. {"name": "mySecretName"}
Expand Down Expand Up @@ -129,7 +129,7 @@ deviceConfig:
# -- external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default)
nodePort: 32500
# -- metrics exporter image
image: rocm/device-metrics-exporter:latest
image: docker.io/rocm/device-metrics-exporter:latest
# -- metrics exporter image pull policy
imagePullPolicy: "IfNotPresent"
# -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"}
Expand Down Expand Up @@ -187,7 +187,7 @@ deviceConfig:
# -- enable / disable test runner
enable: false
# -- test runner image
image: rocm/test-runner:latest
image: docker.io/rocm/test-runner:latest
# -- test runner image pull policy
imagePullPolicy: "IfNotPresent"
# -- test runner config map, e.g. {"name": "myConfigMap"}
Expand All @@ -214,7 +214,7 @@ deviceConfig:
# -- enable/disable the config manager
enable: false
# -- config manager image
image: rocm/device-config-manager:latest
image: docker.io/rocm/device-config-manager:latest
# -- image pull policy for config manager image
imagePullPolicy: IfNotPresent
# -- image pull secret for config manager image, e.g. {"name": "myPullSecret"}
Expand Down
9 changes: 6 additions & 3 deletions hack/openshift-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,11 @@ metricsExporter:
deviceConfig:
spec:
metricsExporter:
image: rocm/device-metrics-exporter:latest
image: docker.io/rocm/device-metrics-exporter:latest
configManager:
image: rocm/device-config-manager:latest
image: docker.io/rocm/device-config-manager:latest
testRunner:
image: rocm/test-runner:latest
image: docker.io/rocm/test-runner:latest
commonConfig:
utilsContainer:
image: docker.io/rocm/gpu-operator-utils:latest
8 changes: 4 additions & 4 deletions helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,13 @@ Kubernetes: `>= 1.29.0-0`
| crds.defaultCR.install | bool | `true` | Deploy default DeviceConfig during helm chart installation |
| crds.defaultCR.upgrade | bool | `false` | Deploy / Patch default DeviceConfig during helm chart upgrade. Be careful about this option: 1. Your customized change on default DeviceConfig may be overwritten 2. Your existing DeviceConfig may conflict with upgraded default DeviceConfig |
| deviceConfig.spec.commonConfig.initContainerImage | string | `"busybox:1.36"` | init container image |
| deviceConfig.spec.commonConfig.utilsContainer.image | string | `"docker.io/rocm/gpu-operator-utils:v1.4.0"` | gpu operator utility container image |
| deviceConfig.spec.commonConfig.utilsContainer.image | string | `"docker.io/rocm/gpu-operator-utils:latest"` | gpu operator utility container image |
| deviceConfig.spec.commonConfig.utilsContainer.imagePullPolicy | string | `"IfNotPresent"` | utility container image pull policy |
| deviceConfig.spec.commonConfig.utilsContainer.imageRegistrySecret | object | `{}` | utility container image pull secret, e.g. {"name": "mySecretName"} |
| deviceConfig.spec.configManager.config | object | `{}` | config map for config manager, e.g. {"name": "myConfigMap"} |
| deviceConfig.spec.configManager.configManagerTolerations | list | `[]` | config manager tolerations |
| deviceConfig.spec.configManager.enable | bool | `false` | enable/disable the config manager |
| deviceConfig.spec.configManager.image | string | `"rocm/device-config-manager:latest"` | config manager image |
| deviceConfig.spec.configManager.image | string | `"docker.io/rocm/device-config-manager:latest"` | config manager image |
| deviceConfig.spec.configManager.imagePullPolicy | string | `"IfNotPresent"` | image pull policy for config manager image |
| deviceConfig.spec.configManager.imageRegistrySecret | object | `{}` | image pull secret for config manager image, e.g. {"name": "myPullSecret"} |
| deviceConfig.spec.configManager.selector | object | `{}` | node selector for config manager, if not specified it will reuse spec.selector |
Expand Down Expand Up @@ -206,7 +206,7 @@ Kubernetes: `>= 1.29.0-0`
| deviceConfig.spec.driver.version | string | `"30.20.1"` | specify an out-of-tree driver version to install |
| deviceConfig.spec.metricsExporter.config | object | `{}` | name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"} |
| deviceConfig.spec.metricsExporter.enable | bool | `true` | enable / disable device metrics exporter |
| deviceConfig.spec.metricsExporter.image | string | `"rocm/device-metrics-exporter:latest"` | metrics exporter image |
| deviceConfig.spec.metricsExporter.image | string | `"docker.io/rocm/device-metrics-exporter:latest"` | metrics exporter image |
| deviceConfig.spec.metricsExporter.imagePullPolicy | string | `"IfNotPresent"` | metrics exporter image pull policy |
| deviceConfig.spec.metricsExporter.imageRegistrySecret | object | `{}` | metrics exporter image pull secret, e.g. {"name": "pullSecretName"} |
| deviceConfig.spec.metricsExporter.nodePort | int | `32500` | external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default) |
Expand Down Expand Up @@ -238,7 +238,7 @@ Kubernetes: `>= 1.29.0-0`
| deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig |
| deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. {"name": "myConfigMap"} |
| deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner |
| deviceConfig.spec.testRunner.image | string | `"rocm/test-runner:latest"` | test runner image |
| deviceConfig.spec.testRunner.image | string | `"docker.io/rocm/test-runner:latest"` | test runner image |
| deviceConfig.spec.testRunner.imagePullPolicy | string | `"IfNotPresent"` | test runner image pull policy |
| deviceConfig.spec.testRunner.imageRegistrySecret | object | `{}` | test runner image pull secret |
| deviceConfig.spec.testRunner.logsLocation.hostPath | string | `"/var/log/amd-test-runner"` | host directory to save test run logs |
Expand Down
8 changes: 4 additions & 4 deletions helm-charts-k8s/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ deviceConfig:
initContainerImage: busybox:1.36
utilsContainer:
# -- gpu operator utility container image
image: docker.io/rocm/gpu-operator-utils:v1.4.0
image: docker.io/rocm/gpu-operator-utils:latest
# -- utility container image pull policy
imagePullPolicy: IfNotPresent
# -- utility container image pull secret, e.g. {"name": "mySecretName"}
Expand Down Expand Up @@ -129,7 +129,7 @@ deviceConfig:
# -- external port for pulling metrics from outside the cluster for NodePort service, in the range 30000-32767 (assigned automatically by default)
nodePort: 32500
# -- metrics exporter image
image: rocm/device-metrics-exporter:latest
image: docker.io/rocm/device-metrics-exporter:latest
# -- metrics exporter image pull policy
imagePullPolicy: "IfNotPresent"
# -- name of the metrics exporter config map, e.g. {"name": "metricConfigMapName"}
Expand Down Expand Up @@ -187,7 +187,7 @@ deviceConfig:
# -- enable / disable test runner
enable: false
# -- test runner image
image: rocm/test-runner:latest
image: docker.io/rocm/test-runner:latest
# -- test runner image pull policy
imagePullPolicy: "IfNotPresent"
# -- test runner config map, e.g. {"name": "myConfigMap"}
Expand All @@ -214,7 +214,7 @@ deviceConfig:
# -- enable/disable the config manager
enable: false
# -- config manager image
image: rocm/device-config-manager:latest
image: docker.io/rocm/device-config-manager:latest
# -- image pull policy for config manager image
imagePullPolicy: IfNotPresent
# -- image pull secret for config manager image, e.g. {"name": "myPullSecret"}
Expand Down
9 changes: 6 additions & 3 deletions helm-charts-openshift/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,11 @@ metricsExporter:
deviceConfig:
spec:
metricsExporter:
image: rocm/device-metrics-exporter:latest
image: docker.io/rocm/device-metrics-exporter:latest
configManager:
image: rocm/device-config-manager:latest
image: docker.io/rocm/device-config-manager:latest
testRunner:
image: rocm/test-runner:latest
image: docker.io/rocm/test-runner:latest
commonConfig:
utilsContainer:
image: docker.io/rocm/gpu-operator-utils:latest
Loading