Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion tests/integration-tests/tests/basic/test_essential_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from assertpy import assert_that, soft_assertions
from constants import UNSUPPORTED_OSES_FOR_DCV
from remote_command_executor import RemoteCommandExecutor
from utils import check_status, is_dcv_supported, test_cluster_health_metric
from utils import check_status, get_flexible_gpu_instance_types, is_dcv_supported, test_cluster_health_metric

from tests.basic.disable_hyperthreading_utils import _test_disable_hyperthreading_settings
from tests.basic.log_rotation_utils import _test_compute_log_rotation, _test_headnode_log_rotation
Expand Down Expand Up @@ -65,6 +65,7 @@ def test_essential_features(
dcv_enabled=dcv_enabled,
max_queue_size=max_queue_size,
scaledown_idletime=scaledown_idletime,
flexible_gpu_instance_types=get_flexible_gpu_instance_types(instance, region),
)
cluster = clusters_factory(cluster_config)

Expand All @@ -91,6 +92,8 @@ def test_essential_features(
cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
)

_test_gpu_workload(cluster, scheduler_commands_factory, test_datadir)


def _test_mpi_job(
scheduler, region, instance, cluster, test_datadir, scheduler_commands_factory, scaledown_idletime, max_queue_size
Expand Down Expand Up @@ -331,6 +334,29 @@ def _test_custom_bootstrap_scripts_args_quotes(cluster):
)


def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir):
    """
    Build and run CUDA samples on a compute node of the ``gpu`` partition.

    One job per sample is submitted through ``gpu_job.sh`` (taken from ``test_datadir``). All jobs are
    submitted before any of them is waited on; each one must then complete and succeed.
    """
    commands = scheduler_commands_factory(RemoteCommandExecutor(cluster))
    job_script = str(test_datadir / "gpu_job.sh")

    def _submit(cuda_sample):
        # Submit a single-node, single-slot job for one sample and return the id of the accepted job.
        logging.info("Submitting CUDA sample job for %s", cuda_sample)
        submission = commands.submit_script(
            job_script,
            script_args=[cuda_sample],
            partition="gpu",
            nodes=1,
            slots=1,
        )
        return commands.assert_job_submitted(submission.stdout)

    submitted_job_ids = [
        _submit(cuda_sample) for cuda_sample in ("1_Utilities/deviceQuery", "4_CUDA_Libraries/matrixMulCUBLAS")
    ]

    for submitted_job_id in submitted_job_ids:
        commands.wait_job_completed(submitted_job_id, timeout=20)
        commands.assert_job_succeeded(submitted_job_id)


def _test_disable_hyperthreading(
cluster, region, instance, scheduler, default_threads_per_core, request, scheduler_commands_factory
):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
#SBATCH --job-name=cuda-gpu-validate
#SBATCH --output=cuda-gpu-validate-%j.out

# Build and run a single CUDA sample (passed as a script argument) from the
# pre-installed /usr/local/cuda-samples-13.0 tree. CUDA samples 13.x are
# CMake-only and /usr/local/... isn't writable, so the script copies the
# sample into a temp dir before building.
#
# Argument:
#   $1  sample path relative to the Samples/ directory, in the form
#       <category>/<sample> (e.g. 1_Utilities/deviceQuery)
#
# Exit status: 2 on wrong usage; otherwise any failing command aborts the
# script with that command's status (see `set -e` below).

# Abort on the first failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Exactly one argument is required: the sample to build and run.
if [[ $# -ne 1 ]]; then
echo "Usage: sbatch $0 <category>/<sample>" >&2
echo " e.g. sbatch $0 1_Utilities/deviceQuery" >&2
exit 2
fi
SAMPLE_REL=$1
# Strip everything up to and including the last '/', leaving the sample name.
SAMPLE_NAME=${SAMPLE_REL##*/}

# Put the CUDA toolchain directory first on PATH.
export PATH=/usr/local/cuda/bin:${PATH}

# Diagnostics written to the job output. The ":-unset" defaults keep `set -u`
# from aborting the script when Slurm did not export these variables.
echo "Node: $(hostname)"
echo "Sample: $SAMPLE_REL"
echo "SLURM_JOB_GPUS=${SLURM_JOB_GPUS:-unset}"
echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-unset}"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Minor] Do we need to explicitly set --gres=gpu to ensure the GPU is visible?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not in this case, because we are targeting a queue that only has compute resources (CRs) with GPUs.

# Print the GPU list, the full driver/GPU status and the CUDA compiler version.
# The script runs under `set -e` (enabled earlier), so a failure of any of
# these commands aborts the job before the build starts.
nvidia-smi -L
nvidia-smi
nvcc --version

# Root of the pre-installed CUDA samples tree; the requested sample must exist
# under its Samples/ directory, otherwise exit with status 2.
SAMPLES_SRC=/usr/local/cuda-samples-13.0
if [[ ! -d "$SAMPLES_SRC/Samples/$SAMPLE_REL" ]]; then
echo "ERROR: sample not found: $SAMPLES_SRC/Samples/$SAMPLE_REL" >&2
exit 2
fi

# Scratch build directory, removed when the script exits.
WORKDIR=$(mktemp -d)
trap 'rm -rf "$WORKDIR"' EXIT

# Shared scaffolding required by every sample (Common/, top-level cmake/ and
# the top-level CMakeLists.txt)
cp -r "$SAMPLES_SRC"/{Common,cmake,CMakeLists.txt} "$WORKDIR"/

# Recreate the sample's <category>/<sample> location under the scratch
# Samples/ directory, mirroring the layout of the source tree.
DST="$WORKDIR/Samples/$SAMPLE_REL"
mkdir -p "$(dirname "$DST")"
cp -r "$SAMPLES_SRC/Samples/$SAMPLE_REL" "$DST"

echo "===== Building $SAMPLE_REL ====="
# Configure and build inside the copied sample directory. The number of
# parallel build jobs follows SLURM_CPUS_PER_TASK, defaulting to 2 when that
# variable is unset or empty.
cmake -S "$DST" -B "$DST/build"
cmake --build "$DST/build" -j"${SLURM_CPUS_PER_TASK:-2}"

echo "===== Running $SAMPLE_NAME ====="
# NOTE(review): assumes the build places an executable named after the sample
# directly in build/ -- confirm against the samples' CMake files.
# As the last command, its exit status becomes the job's exit status.
"$DST/build/$SAMPLE_NAME"
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ Scheduling:
- InstanceType: {{ instance }}
MinCount: 1
MaxCount: {{ max_queue_size }}
- Name: gpu
Networking:
SubnetIds:
- {{ private_subnet_id }}
ComputeResources:
- Name: cr1
Instances:
{% for instance_type in flexible_gpu_instance_types %}
- InstanceType: {{ instance_type }}
{% endfor %}
MinCount: 0
MaxCount: 1
SlurmSettings:
ScaledownIdletime: {{ scaledown_idletime }} # Use shorter scaledown time to test logs in slurm_suspend
Monitoring:
Expand Down
9 changes: 9 additions & 0 deletions tests/integration-tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import string
import subprocess
from datetime import datetime, timedelta
from functools import cache
from hashlib import sha1

import boto3
Expand Down Expand Up @@ -1073,6 +1074,14 @@ def get_similar_instance_types(instance_type: str, region: str = None, max_items
return similar_instances


@cache
def get_flexible_gpu_instance_types(instance, region):
    """
    Return a list of NVIDIA GPU instance types compatible with ``instance``'s architecture.

    The preferred GPU instance type is ``g4dn.2xlarge`` when ``instance`` is ``x86_64`` and ``g5g.2xlarge``
    otherwise. It always comes first in the result, followed by the types returned by
    ``get_similar_instance_types`` for it (requested with ``max_items=5``), without duplicates and in the
    order that helper returned them.

    The result is memoized per ``(instance, region)``, so every call with the same arguments returns the
    same list object; callers must not mutate it.

    :param instance: instance type whose CPU architecture the GPU instance types must match
    :param region: AWS region used for the architecture lookup and the similar-instance search
    :return: de-duplicated list of GPU instance type names, preferred type first
    """
    architecture = get_architecture_supported_by_instance_type(instance, region)
    gpu_instance_type = "g4dn.2xlarge" if architecture == "x86_64" else "g5g.2xlarge"
    similar_instance_types = get_similar_instance_types(gpu_instance_type, region, 5)
    # De-duplicate with dict.fromkeys rather than a set: dict keys keep insertion order, whereas set
    # iteration order for strings can change between interpreter runs (hash randomization), which would
    # make the returned order differ from one test run to the next.
    return list(dict.fromkeys([gpu_instance_type, *similar_instance_types]))


def verify_cluster_node_config_version_in_ddb(region, cluster_name, instance_id, expected_version):
"""
Verify that a cluster node has the correct config version in DynamoDB.
Expand Down
Loading