Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions .github/workflows/integration_gpu_cluster_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
group: nightly-test-cluster-group-gpu
cancel-in-progress: false
env:
GPU_CLUSTER_NAME: nightly-xpk-b200
GPU_CLUSTER_NAME: nightly-xpk-h100
WORKLOAD_NAME: xpktest-gpu-nightly-${{ github.run_attempt }}
steps:
- uses: actions/download-artifact@v4
Expand Down Expand Up @@ -55,21 +55,21 @@ jobs:

# 4. Set Env Var for the host (GitHub Runner)
echo "GOOGLE_APPLICATION_CREDENTIALS=$HOME/.config/gcloud/application_default_credentials.json" >> $GITHUB_ENV
- name: Create an XPK Cluster with 1 x b200 GPU
run: xpk cluster create --cluster $GPU_CLUSTER_NAME --device-type=b200-8 --zone=asia-northeast1-b --default-pool-cpu-machine-type=n1-standard-16 --spot
- name: Create an XPK Cluster with 1 x h100 GPU
run: xpk cluster create --cluster $GPU_CLUSTER_NAME --device-type=h100-mega-80gb-8 --zone=asia-southeast1-b --default-pool-cpu-machine-type=e2-standard-8 --spot
- name: Authenticate Docker
run: gcloud auth configure-docker --quiet
- name: Run a base-docker-image workload
run: xpk workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='nvidia/cuda:12.1.0-base-ubuntu22.04' --command "nvidia-smi" --zone=asia-northeast1-b --device-type=b200-8
run: xpk workload create --cluster $GPU_CLUSTER_NAME --workload $WORKLOAD_NAME --docker-image='nvidia/cuda:12.1.0-base-ubuntu22.04' --command "nvidia-smi" --zone=asia-southeast1-b --device-type=h100-mega-80gb-8
- name: List out the workloads on the cluster
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-southeast1-b
- name: Wait for workload completion and confirm it succeeded
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --wait-for-job-completion $WORKLOAD_NAME --timeout 600
run: xpk workload list --cluster $GPU_CLUSTER_NAME --zone=asia-southeast1-b --wait-for-job-completion $WORKLOAD_NAME --timeout 600
- name: Delete the workload on the cluster
run: xpk workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b
run: xpk workload delete --workload $WORKLOAD_NAME --cluster $GPU_CLUSTER_NAME --zone=asia-southeast1-b
- name: Delete the cluster created
if: always()
run: xpk cluster delete --cluster $GPU_CLUSTER_NAME --zone=asia-northeast1-b --force
run: xpk cluster delete --cluster $GPU_CLUSTER_NAME --zone=asia-southeast1-b --force
- name: Upload cluster nodepool creation log
if: always()
uses: actions/upload-artifact@v4
Expand Down
Loading