aws · gmarciani · May 20, 2026 · May 19, 2026 · May 20, 2026 · hehe7318
@@ -15,7 +15,7 @@
 from assertpy import assert_that, soft_assertions
 from constants import UNSUPPORTED_OSES_FOR_DCV
 from remote_command_executor import RemoteCommandExecutor
-from utils import check_status, is_dcv_supported, test_cluster_health_metric
+from utils import check_status, get_flexible_gpu_instance_types, is_dcv_supported, test_cluster_health_metric
 
 from tests.basic.disable_hyperthreading_utils import _test_disable_hyperthreading_settings
 from tests.basic.log_rotation_utils import _test_compute_log_rotation, _test_headnode_log_rotation
@@ -65,6 +65,7 @@ def test_essential_features(
         dcv_enabled=dcv_enabled,
         max_queue_size=max_queue_size,
         scaledown_idletime=scaledown_idletime,
+        flexible_gpu_instance_types=get_flexible_gpu_instance_types(instance, region),
     )
     cluster = clusters_factory(cluster_config)
 
@@ -91,6 +92,8 @@ def test_essential_features(
         cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
     )
 
+    _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir)
+
 
 def _test_mpi_job(
     scheduler, region, instance, cluster, test_datadir, scheduler_commands_factory, scaledown_idletime, max_queue_size
@@ -331,6 +334,29 @@ def _test_custom_bootstrap_scripts_args_quotes(cluster):
     )
 
 
+def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
+    """Build and run CUDA samples on the "gpu" partition, one job per sample.
+
+    Every sample is submitted via gpu_job.sh before any job is awaited; each job must then succeed.
+    """
+    slurm = scheduler_commands_factory(RemoteCommandExecutor(cluster))
+
+    def _submit(cuda_sample):
+        # gpu_job.sh takes the sample path (<category>/<sample>) as its only argument.
+        logging.info("Submitting CUDA sample job for %s", cuda_sample)
+        submission = slurm.submit_script(
+            str(test_datadir / "gpu_job.sh"), script_args=[cuda_sample], partition="gpu", nodes=1, slots=1
+        )
+        return slurm.assert_job_submitted(submission.stdout)
+
+    # Submit everything first so that the jobs are queued together, then check them in submission order.
+    submitted = [_submit(sample) for sample in ("1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS")]
+    for job in submitted:
+        # NOTE(review): the unit of ``timeout`` is defined by wait_job_completed -- presumably minutes, confirm.
+        slurm.wait_job_completed(job, timeout=20)
+        slurm.assert_job_succeeded(job)
+
+
 def _test_disable_hyperthreading(
     cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
 ):

@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH --job-name=cuda-gpu-validate
+#SBATCH --output=cuda-gpu-validate-%j.out
+
+# Compile and execute one CUDA sample taken from the pre-installed tree at
+# /usr/local/cuda-samples-13.0. The sample is selected by the only script
+# argument, <category>/<sample>, relative to that tree's Samples/ directory.
+# CUDA samples 13.x build with CMake only and /usr/local/... is not writable,
+# so the sources are copied to a scratch directory and built there.
+
+set -euo pipefail
+
+if (( $# != 1 )); then
+    echo "Usage: sbatch $0 <category>/<sample>" >&2
+    echo "  e.g. sbatch $0 1_Utilities/deviceQuery" >&2
+    exit 2
+fi
+sample_path=$1
+binary_name=${sample_path##*/}  # last path component; the executable run at the end has this name
+samples_root=/usr/local/cuda-samples-13.0
+export PATH="/usr/local/cuda/bin:${PATH}"
+
+# Log the execution context before building.
+echo "Node: $(hostname)"
+echo "Sample: $sample_path"
+echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
+echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
+nvidia-smi -L
+nvidia-smi
+nvcc --version
+
+sample_src="$samples_root/Samples/$sample_path"
+[[ -d "$sample_src" ]] || { echo "ERROR: sample not found: $sample_src" >&2; exit 2; }
+
+scratch=$(mktemp -d)
+trap 'rm -rf "$scratch"' EXIT  # delete the scratch copy when the script exits
+
+# Shared scaffolding required by every sample (Common/, top-level cmake/)
+cp -r "$samples_root/Common" "$samples_root/cmake" "$samples_root/CMakeLists.txt" "$scratch"/
+
+sample_dst="$scratch/Samples/$sample_path"  # same relative location as in the installed tree
+build_dir="$sample_dst/build"
+mkdir -p "$(dirname "$sample_dst")"
+cp -r "$sample_src" "$sample_dst"
+
+echo "===== Building $sample_path ====="
+cmake -S "$sample_dst" -B "$build_dir"
+cmake --build "$build_dir" -j"${SLURM_CPUS_PER_TASK:-2}"
+
+echo "===== Running $binary_name ====="
+"$build_dir/$binary_name"
@@ -121,6 +121,18 @@ Scheduling:
             - InstanceType: {{ instance }}
           MinCount: 1
           MaxCount: {{ max_queue_size }}
+    - Name: gpu  # Referenced by the GPU workload test, which submits its jobs with partition="gpu"
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+      ComputeResources:
+        - Name: cr1
+          Instances:  # One InstanceType entry is rendered per element of flexible_gpu_instance_types
+            {% for instance_type in flexible_gpu_instance_types %}
+            - InstanceType: {{ instance_type }}
+            {% endfor %}
+          MinCount: 0
+          MaxCount: 1
   SlurmSettings:
     ScaledownIdletime: {{ scaledown_idletime }} # Use shorter scaledown time to test logs in slurm_suspend
 Monitoring:

@@ -19,6 +19,7 @@
 import string
 import subprocess
 from datetime import datetime, timedelta
+from functools import cache
 from hashlib import sha1
 
 import boto3
@@ -1073,6 +1074,14 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items
     return similar_instances
 
 
+@cache  # the same list object is returned to every caller with equal arguments: treat it as read-only
+def get_flexible_gpu_instance_types(instance, region):
+    """Return a sorted list of NVIDIA GPU instance types compatible with ``instance``'s architecture."""
+    architecture = get_architecture_supported_by_instance_type(instance, region)
+    base_type = "g4dn.2xlarge" if architecture == "x86_64" else "g5g.2xlarge"  # seed type for the lookup
+    return sorted({base_type, *get_similar_instance_types(base_type, region, 5)})  # set order is unstable across runs
+
+
 def verify_cluster_node_config_version_in_ddb(region, cluster_name, instance_id, expected_version):
     """
     Verify that a cluster node has the correct config version in DynamoDB.