OCP-on-NERC · DanNiESh · Aug 29, 2025 · Aug 29, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
     hooks:
       - id: remove-tabs
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v6.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-merge-conflict
@@ -18,7 +18,7 @@ repos:
       - id: detect-private-key
 
   - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.32.0
+    rev: v1.37.1
     hooks:
       - id: yamllint
         files: \.(yaml|yml)$

diff --git a/gpu-class/cleanup.sh b/gpu-class/cleanup.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#
+# Delete every Notebook and PersistentVolumeClaim in each project whose name
+# matches $pattern. Deletions run as system:admin and are best-effort: a failed
+# delete does not stop the loop.
+
+pattern="^bu-cs599-pmpp-cuda-"
+
+# The jsonpath template prints one project name per line; grep keeps the ones
+# matching the pattern.
+for proj in $(oc get projects -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' | grep "$pattern"); do
+    echo "deleting notebook + pvc in $proj"
+    # Notebooks are deleted first, then the PVCs; each delete waits for completion.
+    oc -n "$proj" delete notebook --as system:admin --all --ignore-not-found --wait=true || true
+    oc -n "$proj" delete pvc --as system:admin --all --ignore-not-found --wait=true || true
+done
diff --git a/gpu-class/cluster_queue_role.yaml b/gpu-class/cluster_queue_role.yaml
@@ -0,0 +1,14 @@
+# Read-only (get/list/watch) access to Kueue ClusterQueue objects.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kueue-clusterqueue-reader
+rules:
+  - apiGroups:
+      - kueue.x-k8s.io
+    resources:
+      - clusterqueues
+    verbs:
+      - get
+      - list
+      - watch
diff --git a/gpu-class/gpu-class-setup.sh b/gpu-class/gpu-class-setup.sh
@@ -0,0 +1,100 @@
+#!/bin/bash
+# For each "${CLASS_NAME}-*" namespace: create a workbench, apply rb.yaml and localqueue.yaml.
+CLASS_NAME="bu-cs599-pmpp-cuda"  # prefix of the namespaces handled by this script
+# Command that receives the rendered output of `oc process` on stdin.
+create_resource_command=(oc create -f -)
+openshift_url=https://rhods-dashboard-redhat-ods-applications.apps.edu.nerc.mghpcc.org/projects
+# Strip "/projects" and everything after it; the remainder is passed as HUB_HOST.
+host="${openshift_url%/projects*}"        # get everything before projects
+hub_host=$host
+run_name="gpu_class_test"  # passed to the notebook template as RUN_NAME
+image_name="csw-dev-f25"  # passed to the notebook template as IMAGE_NAME
+
+# create_wb NAMESPACE: render notebook_resource.yaml for the namespace's user and create the
+# objects as system:admin. Prints the notebook name on stdout; oc output is sent to stderr.
+create_wb() {
+    local namespace="$1" username user notebook_name params
+    # Subject names of the "edit" rolebinding minus the excluded accounts (assumes one is left).
+    # NOTE(review): the exclusions are spelled in encoded form here; confirm they match raw names.
+    username=$(oc -n "$namespace" get rolebinding edit -o json \
+    | jq -r '
+        (.subjects // [])
+        | map(.name)
+        | map(select(. != "jappavoo-40bu-2edu"))
+        | map(select(. != "sdanni-40redhat-2com"))
+        | map(select(. != "istaplet"))
+        | .[]
+    ')
+
+    # Same subjects with "name@domain.tld" rewritten first: "@" becomes "-40", each "." becomes "-2".
+    user=$(oc -n "$namespace" get rolebinding edit -o json \
+    | jq -r '
+        (.subjects // [])
+        | map(.name
+            | if test("@.*\\..*$")
+                then sub("@"; "-40") | gsub("\\.";"-2")
+                else .
+                end)
+        | map(select(. != "jappavoo-40bu-2edu"))
+        | map(select(. != "sdanni-40redhat-2com"))
+        | map(select(. != "istaplet"))
+        | .[]
+    ')
+
+    # give notebook within namespace a name
+    notebook_name=cs599-${user}-wb
+
+    params=(
+        -p NOTEBOOK_NAME="$notebook_name"
+        -p RUN_NAME="$run_name"
+        -p USERNAME="$username"
+        -p NAMESPACE="$namespace"
+        -p USER="$user"
+        -p IMAGE_NAME="$image_name"
+        -p OPENSHIFT_URL="$openshift_url"
+        -p HUB_HOST="$hub_host"
+    )
+    # Only the notebook name may reach stdout (the caller captures it), hence 1>&2.
+    oc process -f notebook_resource.yaml --local "${params[@]}" | "${create_resource_command[@]}" --as system:admin 1>&2
+    echo "$notebook_name"
+}
+
+# apply_localqueue NAMESPACE
+# Render the localqueue.yaml template for NAMESPACE and create it as system:admin.
+apply_localqueue() {
+    namespace=$1
+    local_params=(-p NAMESPACE="$namespace")
+    # Everything oc create prints on stdout is redirected to stderr.
+    oc process -f localqueue.yaml --local "${local_params[@]}" \
+        | "${create_resource_command[@]}" --as system:admin 1>&2
+}
+
+# apply_rolebinding NAMESPACE NOTEBOOK_NAME
+# Render the rb.yaml template and create it as system:admin; NOTEBOOK_NAME is
+# handed to the template as SERVICE_ACCOUNT_NB.
+apply_rolebinding() {
+    namespace=$1
+    notebook_name=$2
+    rb_params=(
+        -p NAMESPACE="$namespace" -p SERVICE_ACCOUNT_NB="$notebook_name"
+    )
+    oc process -f rb.yaml --local "${rb_params[@]}" \
+        | "${create_resource_command[@]}" --as system:admin
+}
+
+apply_clusterq() {
+    # Apply cluster_queue_role.yaml while impersonating system:admin.
+    oc apply --as system:admin -f cluster_queue_role.yaml
+}
+
+apply_clusterq
+
+oc get ns | grep "^${CLASS_NAME}-" | awk '{print $1}' | while IFS= read -r ns; do
+    # The notebook template sets no namespace, so its objects land in the current
+    # project: skip this namespace when switching to it fails.
+    oc project "$ns" || continue
+    #create a workbench and save the name of the notebook to apply rolebindings
+    nb_name="$(create_wb "$ns")"
+    apply_rolebinding "$ns" "$nb_name"
+    apply_localqueue "$ns"
+done
diff --git a/gpu-class/localqueue.yaml b/gpu-class/localqueue.yaml
@@ -0,0 +1,35 @@
+# Template that creates three Kueue LocalQueues (v100, a100, h100) in
+# ${NAMESPACE}; each one references the ClusterQueue with the same prefix.
+apiVersion: template.openshift.io/v1
+kind: Template
+metadata:
+  name: localqueue
+parameters:
+  # Namespace that receives the LocalQueues.
+  - name: NAMESPACE
+    required: true
+objects:
+  # v100
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: v100-localqueue
+      namespace: "${NAMESPACE}"
+    spec:
+      clusterQueue: v100-clusterqueue
+  # a100
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: a100-localqueue
+      namespace: "${NAMESPACE}"
+    spec:
+      clusterQueue: a100-clusterqueue
+  # h100
+  - apiVersion: kueue.x-k8s.io/v1beta1
+    kind: LocalQueue
+    metadata:
+      name: h100-localqueue
+      namespace: "${NAMESPACE}"
+    spec:
+      clusterQueue: h100-clusterqueue
diff --git a/gpu-class/notebook_resource.yaml b/gpu-class/notebook_resource.yaml
@@ -0,0 +1,210 @@
+apiVersion: template.openshift.io/v1
+kind: Template
+parameters:
+- name: NOTEBOOK_NAME  # names the Notebook, its main container, service account, and PVC
+  required: true
+- name: RUN_NAME  # value of the ope-run label
+  required: true
+- name: USERNAME  # not referenced by the objects in this template
+  required: true
+- name: IMAGE_NAME  # image name, appended to IMAGE_REPO
+  required: true
+- name: NAMESPACE  # used in the notebook URL paths and the logout URLs
+  required: true
+- name: OPENSHIFT_URL  # base of the logout URLs
+  required: true
+- name: USER  # value of the opendatahub.io/username annotation and opendatahub.io/user label
+  required: true
+- name: IMAGE_REPO  # registry path prefix of the notebook image
+  required: true
+  value: 'image-registry.openshift-image-registry.svc:5000/redhat-ods-applications'
+- name: HUB_HOST  # hub_host entry of the tornado settings
+  required: true
+- name: PVC_SIZE  # storage request of the PVC
+  required: true
+  value: '20Gi'
+- name: TOKEN  # value given to --ServerApp.token; optional
+  required: false
+objects:
+- apiVersion: kubeflow.org/v1beta1
+  kind: Notebook  # pod with two containers: the notebook server and oauth-proxy
+  metadata:
+    annotations:
+      notebooks.opendatahub.io/inject-oauth: 'true'
+      notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
+      notebooks.opendatahub.io/last-size-selection: Small
+      notebooks.opendatahub.io/oauth-logout-url: >-
+        ${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
+      opendatahub.io/username: ${USER}
+      openshift.io/description: ''
+      openshift.io/display-name: ${NOTEBOOK_NAME}
+      opendatahub.io/image-display-name: ${IMAGE_NAME}
+    name: ${NOTEBOOK_NAME}
+    labels:
+      ope-run: ${RUN_NAME}
+      app: ${NOTEBOOK_NAME}
+      opendatahub.io/dashboard: 'true'
+      opendatahub.io/odh-managed: 'true'
+      opendatahub.io/user: ${USER}
+  spec:
+    template:
+      spec:
+        affinity:  # pod-level field: soft preference, weight 1
+          nodeAffinity:
+            preferredDuringSchedulingIgnoredDuringExecution:
+              - preference:
+                  matchExpressions:
+                    - key: nvidia.com/gpu.present
+                      # set the value to 'true' to use nodes with GPUs
+                      operator: In
+                      values:
+                        - 'false'
+                weight: 1
+        containers:
+          - resources:
+              limits:
+                cpu: '2'
+                memory: 8Gi
+              requests:
+                cpu: '1'
+                memory: 8Gi
+            readinessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
+                port: notebook-port
+                scheme: HTTP
+              initialDelaySeconds: 10
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            name: ${NOTEBOOK_NAME}
+            livenessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
+                port: notebook-port
+                scheme: HTTP
+              initialDelaySeconds: 10
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            env:
+              - name: NOTEBOOK_ARGS  # literal block: the extra indentation of later lines stays in the value
+                value: |-
+                  --ServerApp.port=8888
+                                    --ServerApp.token=${TOKEN}
+                                    --ServerApp.password=''
+                                    --ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
+                                    --ServerApp.quit_button=False
+                                    --ServerApp.tornado_settings={"user":"${USER}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
+              - name: JUPYTER_IMAGE
+                value: >-
+                  ${IMAGE_REPO}/${IMAGE_NAME}
+            ports:
+              - containerPort: 8888
+                name: notebook-port
+                protocol: TCP
+            imagePullPolicy: Always
+            volumeMounts:
+              - mountPath: /opt/app-root/src
+                name: ${NOTEBOOK_NAME}
+              - mountPath: /dev/shm
+                name: shm
+            image: >-
+                ${IMAGE_REPO}/${IMAGE_NAME}
+            workingDir: /opt/app-root/src
+          - resources:
+              limits:
+                cpu: 100m
+                memory: 64Mi
+              requests:
+                cpu: 100m
+                memory: 64Mi
+            readinessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /oauth/healthz
+                port: oauth-proxy
+                scheme: HTTPS
+              initialDelaySeconds: 5
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            name: oauth-proxy
+            livenessProbe:
+              failureThreshold: 3
+              httpGet:
+                path: /oauth/healthz
+                port: oauth-proxy
+                scheme: HTTPS
+              initialDelaySeconds: 30
+              periodSeconds: 5
+              successThreshold: 1
+              timeoutSeconds: 1
+            env:
+              - name: NAMESPACE  # referenced as $(NAMESPACE) in the --openshift-sar argument
+                valueFrom:
+                  fieldRef:
+                    fieldPath: metadata.namespace
+            ports:
+              - containerPort: 8443
+                name: oauth-proxy
+                protocol: TCP
+            imagePullPolicy: Always
+            volumeMounts:
+              - mountPath: /etc/oauth/config
+                name: oauth-config
+              - mountPath: /etc/tls/private
+                name: tls-certificates
+            image: >-
+              registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
+            args:
+              - '--provider=openshift'
+              - '--https-address=:8443'
+              - '--http-address='
+              - '--openshift-service-account=${NOTEBOOK_NAME}'
+              - '--cookie-secret-file=/etc/oauth/config/cookie_secret'
+              - '--cookie-expire=24h0m0s'
+              - '--tls-cert=/etc/tls/private/tls.crt'
+              - '--tls-key=/etc/tls/private/tls.key'
+              - '--upstream=http://localhost:8888'
+              - '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
+              - '--email-domain=*'
+              - '--skip-provider-button'
+              - >-
+                --openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
+              - >-
+                --logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
+        enableServiceLinks: false
+        serviceAccountName: ${NOTEBOOK_NAME}
+        volumes:
+          - name: ${NOTEBOOK_NAME}
+            persistentVolumeClaim:
+              claimName: ${NOTEBOOK_NAME}
+          - emptyDir:
+              medium: Memory
+            name: shm
+          - name: oauth-config
+            secret:
+              defaultMode: 420  # decimal form of octal 0644
+              secretName: ${NOTEBOOK_NAME}-oauth-config
+          - name: tls-certificates
+            secret:
+              defaultMode: 420  # decimal form of octal 0644
+              secretName: ${NOTEBOOK_NAME}-tls
+- apiVersion: v1
+  kind: PersistentVolumeClaim  # claimed by the notebook volume mounted at /opt/app-root/src
+  metadata:
+    name: ${NOTEBOOK_NAME}
+    labels:
+      app: ${NOTEBOOK_NAME}
+      notebook-name: ${NOTEBOOK_NAME}
+      ope-run: ${RUN_NAME}
+      opendatahub.io/dashboard: "true"
+  spec:
+    accessModes:
+      - ReadWriteOnce
+    resources:
+      requests:
+        storage: '${PVC_SIZE}'  # defaults to 20Gi