Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/sagemaker/hyperpod/training/hyperpod_pytorch_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ def _process_replica_resources(cls, data):
elif limits.get(NEURON_RESOURCE_KEY):
accelerators_limit = int(limits.get(NEURON_RESOURCE_KEY))

if instance_type is None and (accelerators is not None or accelerators_limit is not None):
raise ValueError("--instance-type is required when specifying accelerator resources")

acc_req, acc_lim = _set_default_accelerators_val(instance_type, accelerators, accelerators_limit)
_validate_accelerators_inputs(instance_type, acc_req, acc_lim)

Expand Down
3 changes: 0 additions & 3 deletions src/sagemaker/hyperpod/training/quota_allocation_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,6 @@ def _get_limits(instance_type: str, vcpu_limit: Optional[float], memory_in_gib_l
if accelerators_limit is not None:
if type_of_accelerator is not None:
result[type_of_accelerator] = accelerators_limit
else:
# user specified accelerator limit but the instance type wasn't found, set limit to 0 as a precaution
result["nvidia.com/gpu"] = 0
if accelerator_partition_limit is not None:
result[f"nvidia.com/{accelerator_partition_type}"] = accelerator_partition_limit
if memory_in_gib_limit is not None:
Expand Down
12 changes: 6 additions & 6 deletions test/unit_tests/cli/test_quota_allocation_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,18 @@ def test_get_limits_trainium_instance(self):

def test_get_limits_cpu_only_instance(self):
result = _get_limits("ml.c5.large", 2.0, 8.0, 1, None, None)
# CPU-only instance should set accelerator limit to 0 as precaution
assert result == {"cpu": "2.0", "memory": "8.0Gi", "nvidia.com/gpu": 0}
# CPU-only instance has no accelerator type, so accelerator limit is dropped
assert result == {"cpu": "2.0", "memory": "8.0Gi"}

def test_get_limits_invalid_instance_type(self):
result = _get_limits("invalid-instance", 4.0, 16.0, 2, None, None)
# Invalid instance type should set accelerator limit to 0 as precaution
assert result == {"cpu": "4.0", "memory": "16.0Gi", "nvidia.com/gpu": 0}
# Invalid instance type has no accelerator type, so accelerator limit is dropped
assert result == {"cpu": "4.0", "memory": "16.0Gi"}

def test_get_limits_cpu_instance_r7i(self):
result = _get_limits("ml.r7i.48xlarge", 16.0, 64.0, 2, None, None)
# CPU-only instance (ml.r7i.48xlarge) should set accelerator limit to 0 as precaution
assert result == {"cpu": "16.0", "memory": "64.0Gi", "nvidia.com/gpu": 0}
# CPU-only instance has no accelerator type, so accelerator limit is dropped
assert result == {"cpu": "16.0", "memory": "64.0Gi"}

def test_is_valid_no_instance_type_with_resources(self):
valid, message = _is_valid(4.0, 16.0, None, None, None, None)
Expand Down
Loading