Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.4
rev: v1.5.5
hooks:
- id: remove-tabs

- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-merge-conflict
Expand All @@ -18,7 +18,7 @@ repos:
- id: detect-private-key

- repo: https://github.com/adrienverge/yamllint.git
rev: v1.32.0
rev: v1.37.1
hooks:
- id: yamllint
files: \.(yaml|yml)$
Expand Down
7 changes: 7 additions & 0 deletions gpu-class/cleanup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Deletes every Notebook and PersistentVolumeClaim in each OpenShift project
# whose name matches the class prefix below.
# Requires an `oc` session that is allowed to impersonate system:admin.

# Anchored regex selecting the class projects to clean up.
pattern="^bu-cs599-pmpp-cuda-"

# Read the matching project names (one per line) into an array so the loop
# does not depend on unquoted word-splitting of a command substitution.
mapfile -t projects < <(
  oc get projects -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' \
    | grep -- "$pattern"
)

for proj in "${projects[@]}"; do
  # Name the project so the log shows what is being removed where.
  echo "deleting notebook + pvc in ${proj}"
  # Intentional best-effort: `|| true` forces a zero status so a failed delete
  # in one project is never treated as fatal for the remaining projects.
  oc -n "$proj" delete notebook --as system:admin --all --ignore-not-found --wait=true || true
  oc -n "$proj" delete pvc --as system:admin --all --ignore-not-found --wait=true || true
done
8 changes: 8 additions & 0 deletions gpu-class/cluster_queue_role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# ClusterRole that grants read-only access (get/list/watch) to Kueue
# ClusterQueue objects. It only defines the permissions; binding it to a
# subject is done elsewhere.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kueue-clusterqueue-reader
rules:
# Single rule: read-only verbs on "clusterqueues" in the kueue.x-k8s.io API group.
- apiGroups: ["kueue.x-k8s.io"]
resources: ["clusterqueues"]
verbs: ["get", "list", "watch"]
100 changes: 100 additions & 0 deletions gpu-class/gpu-class-setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/bin/bash

# ---- Class / run configuration -------------------------------------------

# Prefix shared by every namespace that belongs to the class.
CLASS_NAME="bu-cs599-pmpp-cuda"

# Values handed to the notebook template as RUN_NAME and IMAGE_NAME.
run_name="gpu_class_test"
image_name="csw-dev-f25"

# Command (kept as an array) that receives processed templates on stdin.
create_resource_command=(oc create -f -)

# Dashboard URL of the projects page. Everything before "/projects" is the
# dashboard host, which is also passed to the template as HUB_HOST.
openshift_url="https://rhods-dashboard-redhat-ods-applications.apps.edu.nerc.mghpcc.org/projects"
host="${openshift_url%/projects*}"
hub_host="$host"

#######################################
# Creates the workbench resources for the student who owns a namespace by
# processing notebook_resource.yaml and piping the result to the create
# command.
# Globals:
#   run_name, image_name, openshift_url, hub_host, create_resource_command (read)
# Arguments:
#   $1 - namespace whose "edit" rolebinding identifies the student
# Outputs:
#   stdout: the notebook name (cs599-<user>-wb), and nothing else, so callers
#           can capture it with $(...); output of the create command is
#           redirected to stderr for that reason.
# Returns:
#   non-zero if the rolebinding cannot be read or does not yield exactly one
#   remaining subject.
#######################################
create_wb() {
  local namespace=$1
  local rolebinding_json username user notebook_name
  local -a params

  # Query the namespace given as $1 (not a global leaked from the caller's
  # loop), and fetch the rolebinding once for both jq passes.
  rolebinding_json=$(oc -n "$namespace" get rolebinding edit -o json) || {
    echo "create_wb: cannot read rolebinding 'edit' in namespace '${namespace}'" >&2
    return 1
  }

  # Subject names as stored in the rolebinding, minus the excluded accounts.
  # NOTE(review): the exclusion list uses the encoded form ("-40"/"-2"), so it
  # only filters here if the rolebinding already stores names in that form.
  # Confirm against the cluster data.
  username=$(jq -r '
    (.subjects // [])
    | map(.name)
    | map(select(. != "jappavoo-40bu-2edu"))
    | map(select(. != "sdanni-40redhat-2com"))
    | map(select(. != "istaplet"))
    | .[]
  ' <<<"$rolebinding_json")

  # Same subjects with e-mail style names encoded ("@" -> "-40", "." -> "-2")
  # before the excluded accounts are dropped.
  user=$(jq -r '
    (.subjects // [])
    | map(.name
          | if test("@.*\\..*$")
            then sub("@"; "-40") | gsub("\\.";"-2")
            else .
            end)
    | map(select(. != "jappavoo-40bu-2edu"))
    | map(select(. != "sdanni-40redhat-2com"))
    | map(select(. != "istaplet"))
    | .[]
  ' <<<"$rolebinding_json")

  # Zero or several remaining subjects would produce an unusable notebook name.
  if [[ -z "$user" || "$user" == *$'\n'* ]]; then
    echo "create_wb: expected exactly one student subject in '${namespace}', got: '${user}'" >&2
    return 1
  fi

  # give notebook within namespace a name
  notebook_name="cs599-${user}-wb"

  params=(
    -p NOTEBOOK_NAME="$notebook_name"
    -p RUN_NAME="$run_name"
    -p USERNAME="$username"
    -p NAMESPACE="$namespace"
    -p USER="$user"
    -p IMAGE_NAME="$image_name"
    -p OPENSHIFT_URL="$openshift_url"
    -p HUB_HOST="$hub_host"
  )

  oc process -f notebook_resource.yaml --local "${params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2

  echo "$notebook_name"
}

#######################################
# Processes localqueue.yaml for one namespace and pipes the result to the
# create command.
# Globals:
#   create_resource_command (read)
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE
# Outputs:
#   output of the create command is redirected to stderr
#######################################
apply_localqueue() {
  # Locals keep this helper from overwriting same-named variables of its callers.
  local namespace=$1
  local -a local_params=(
    -p NAMESPACE="$namespace"
  )

  oc process -f localqueue.yaml --local "${local_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin 1>&2
}

#######################################
# Processes rb.yaml for one namespace/notebook pair and pipes the result to
# the create command.
# Globals:
#   create_resource_command (read)
# Arguments:
#   $1 - namespace passed to the template as NAMESPACE
#   $2 - notebook name passed to the template as SERVICE_ACCOUNT_NB
#######################################
apply_rolebinding() {
  # Locals keep this helper from overwriting same-named variables of its callers.
  local namespace=$1
  local notebook_name=$2
  local -a rb_params=(
    -p NAMESPACE="$namespace"
    -p SERVICE_ACCOUNT_NB="$notebook_name"
  )

  oc process -f rb.yaml --local "${rb_params[@]}" \
    | "${create_resource_command[@]}" --as system:admin
}

# Applies the manifest cluster_queue_role.yaml while impersonating
# system:admin. Takes no arguments; returns the status of `oc apply`.
apply_clusterq() {
  local manifest="cluster_queue_role.yaml"
  oc apply -f "$manifest" --as system:admin
}

# Apply the cluster-queue role manifest once, before touching any namespace.
apply_clusterq

# Walk every namespace whose name starts with "<CLASS_NAME>-".
# NOTE: the loop variable keeps the name `ns`; helper functions may read it
# as a global.
oc get ns | grep "^${CLASS_NAME}-" | awk '{print $1}' | while IFS= read -r ns; do
  # The create command is invoked without -n, so resources land in the current
  # project. If the switch fails, skip the namespace instead of creating
  # resources in whatever project was current before.
  if ! oc project "$ns"; then
    echo "skipping ${ns}: cannot switch to project" >&2
    continue
  fi

  # create a workbench and save the name of the notebook to apply rolebindings
  if nb_name="$(create_wb "$ns")" && [[ -n "$nb_name" ]]; then
    apply_rolebinding "$ns" "$nb_name"
  else
    echo "skipping rolebinding for ${ns}: create_wb did not return a notebook name" >&2
  fi

  apply_localqueue "$ns"
done
29 changes: 29 additions & 0 deletions gpu-class/localqueue.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# OpenShift Template that creates three Kueue LocalQueues (v100, a100, h100)
# in the namespace given by the NAMESPACE parameter. Each LocalQueue refers to
# the ClusterQueue with the matching GPU prefix.
apiVersion: template.openshift.io/v1
kind: Template
metadata:
name: localqueue
parameters:
# Namespace in which all three LocalQueues are created.
- name: NAMESPACE
required: true
objects:
# LocalQueue referring to v100-clusterqueue.
- apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
name: v100-localqueue
namespace: ${NAMESPACE}
spec:
clusterQueue: v100-clusterqueue
# LocalQueue referring to a100-clusterqueue.
- apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
name: a100-localqueue
namespace: ${NAMESPACE}
spec:
clusterQueue: a100-clusterqueue
# LocalQueue referring to h100-clusterqueue.
- apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
name: h100-localqueue
namespace: ${NAMESPACE}
spec:
clusterQueue: h100-clusterqueue
210 changes: 210 additions & 0 deletions gpu-class/notebook_resource.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# OpenShift Template for one workbench: a Kubeflow Notebook plus the
# PersistentVolumeClaim it mounts. Neither object sets metadata.namespace.
apiVersion: template.openshift.io/v1
kind: Template
parameters:
# Name shared by the Notebook, its main container, its service account and its PVC.
- name: NOTEBOOK_NAME
required: true
# Value of the "ope-run" label on the Notebook and the PVC.
- name: RUN_NAME
required: true
# Required, but not referenced by any object in this template.
- name: USERNAME
required: true
# Image name; appended to IMAGE_REPO to form the notebook container image.
- name: IMAGE_NAME
required: true
# Namespace used in the notebook URL paths and logout URLs.
- name: NAMESPACE
required: true
# Base URL used to build the logout URLs.
- name: OPENSHIFT_URL
required: true
# User recorded in the opendatahub.io username annotation/label and in the Jupyter tornado settings.
- name: USER
required: true
# Registry path prefix for the notebook image; has a default value.
- name: IMAGE_REPO
required: true
value: "image-registry.openshift-image-registry.svc:5000/redhat-ods-applications"
# Passed to Jupyter as "hub_host" in the tornado settings.
- name: HUB_HOST
required: true
# Storage requested by the PVC; defaults to 20Gi.
- name: PVC_SIZE
required: true
value: "20Gi"
# Optional value for --ServerApp.token; empty when not supplied.
- name: TOKEN
required: false
objects:
# Kubeflow Notebook for one workbench. The pod runs two containers: the
# Jupyter server (named ${NOTEBOOK_NAME}) and an oauth-proxy container that
# serves HTTPS on 8443 and forwards to the Jupyter server on localhost:8888.
- apiVersion: kubeflow.org/v1beta1
kind: Notebook
metadata:
annotations:
notebooks.opendatahub.io/inject-oauth: 'true'
notebooks.opendatahub.io/last-image-selection: ${IMAGE_NAME}
notebooks.opendatahub.io/last-size-selection: Small
notebooks.opendatahub.io/oauth-logout-url: >-
${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
opendatahub.io/username: ${USER}
openshift.io/description: ''
openshift.io/display-name: ${NOTEBOOK_NAME}
opendatahub.io/image-display-name: ${IMAGE_NAME}
name: ${NOTEBOOK_NAME}
labels:
ope-run: ${RUN_NAME}
app: ${NOTEBOOK_NAME}
opendatahub.io/dashboard: 'true'
opendatahub.io/odh-managed: 'true'
opendatahub.io/user: ${USER}
spec:
# Soft preference (weight 1) for nodes labelled nvidia.com/gpu.present=false.
affinity:
nodeAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- preference:
matchExpressions:
- key: nvidia.com/gpu.present
# set the value to 'true' to use nodes with GPUs
operator: In
values:
- 'false'
weight: 1
template:
spec:
containers:
# Jupyter server container.
- resources:
limits:
cpu: '2'
memory: 8Gi
requests:
cpu: '1'
memory: 8Gi
readinessProbe:
failureThreshold: 3
httpGet:
path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
port: notebook-port
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
name: ${NOTEBOOK_NAME}
livenessProbe:
failureThreshold: 3
httpGet:
path: /notebook/${NAMESPACE}/${NOTEBOOK_NAME}/api
port: notebook-port
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
env:
- name: NOTEBOOK_ARGS
value: |-
--ServerApp.port=8888
--ServerApp.token=${TOKEN}
--ServerApp.password=''
--ServerApp.base_url=/notebook/${NAMESPACE}/${NOTEBOOK_NAME}
--ServerApp.quit_button=False
--ServerApp.tornado_settings={"user":"${USER}","hub_host":"${HUB_HOST}","hub_prefix":"projects/${NAMESPACE}"}
- name: JUPYTER_IMAGE
value: >-
${IMAGE_REPO}/${IMAGE_NAME}
ports:
- containerPort: 8888
name: notebook-port
protocol: TCP
imagePullPolicy: Always
volumeMounts:
- mountPath: /opt/app-root/src
name: ${NOTEBOOK_NAME}
- mountPath: /dev/shm
name: shm
image: >-
${IMAGE_REPO}/${IMAGE_NAME}
workingDir: /opt/app-root/src
# oauth-proxy container.
- resources:
limits:
cpu: 100m
memory: 64Mi
requests:
cpu: 100m
memory: 64Mi
readinessProbe:
failureThreshold: 3
httpGet:
path: /oauth/healthz
port: oauth-proxy
scheme: HTTPS
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
name: oauth-proxy
livenessProbe:
failureThreshold: 3
httpGet:
path: /oauth/healthz
port: oauth-proxy
scheme: HTTPS
initialDelaySeconds: 30
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 1
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
ports:
- containerPort: 8443
name: oauth-proxy
protocol: TCP
imagePullPolicy: Always
volumeMounts:
- mountPath: /etc/oauth/config
name: oauth-config
- mountPath: /etc/tls/private
name: tls-certificates
image: >-
registry.redhat.io/openshift4/ose-oauth-proxy@sha256:4bef31eb993feb6f1096b51b4876c65a6fb1f4401fee97fa4f4542b6b7c9bc46
args:
- '--provider=openshift'
- '--https-address=:8443'
- '--http-address='
- '--openshift-service-account=${NOTEBOOK_NAME}'
- '--cookie-secret-file=/etc/oauth/config/cookie_secret'
- '--cookie-expire=24h0m0s'
- '--tls-cert=/etc/tls/private/tls.crt'
- '--tls-key=/etc/tls/private/tls.key'
- '--upstream=http://localhost:8888'
- '--upstream-ca=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
- '--email-domain=*'
- '--skip-provider-button'
# "$(NAMESPACE)" below refers to the container env var defined above, not to a template parameter.
- >-
--openshift-sar={"verb":"get","resource":"notebooks","resourceAPIGroup":"kubeflow.org","resourceName":"${NOTEBOOK_NAME}","namespace":"$(NAMESPACE)"}
# Logout URL uses the same "<OPENSHIFT_URL>/<NAMESPACE>" form as the oauth-logout-url annotation.
- >-
--logout-url=${OPENSHIFT_URL}/${NAMESPACE}?notebookLogout=${NOTEBOOK_NAME}
enableServiceLinks: false
serviceAccountName: ${NOTEBOOK_NAME}
volumes:
# Persistent home directory backed by the PVC of the same name.
- name: ${NOTEBOOK_NAME}
persistentVolumeClaim:
claimName: ${NOTEBOOK_NAME}
# Memory-backed emptyDir mounted at /dev/shm.
- emptyDir:
medium: Memory
name: shm
- name: oauth-config
secret:
defaultMode: 420
secretName: ${NOTEBOOK_NAME}-oauth-config
- name: tls-certificates
secret:
defaultMode: 420
secretName: ${NOTEBOOK_NAME}-tls
# PersistentVolumeClaim named after the notebook; the Notebook object mounts a
# claim of this name at /opt/app-root/src.
- apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ${NOTEBOOK_NAME}
labels:
app: ${NOTEBOOK_NAME}
notebook-name: ${NOTEBOOK_NAME}
ope-run: ${RUN_NAME}
opendatahub.io/dashboard: 'true'
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
# Size comes from the PVC_SIZE template parameter.
storage: "${PVC_SIZE}"
Loading