Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel):
default=None,
description="Limit for the amount of memory in GiB",
)
efa_interfaces: Optional[int] = Field(
efa: Optional[int] = Field(
default=None,
description="Number of EFA interfaces for the instance",
description="Number of EFA interfaces",
ge=0
)
efa_interfaces_limit: Optional[int] = Field(
efa_limit: Optional[int] = Field(
default=None,
description="Limit for the number of EFA interfaces",
ge=0
Expand Down Expand Up @@ -464,26 +464,26 @@ def build_dict(**kwargs):
**{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
vcpu=str(self.vcpu) if self.vcpu else None,
memory=str(self.memory) if self.memory else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
)
limits_value = build_dict(
**{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
memory=str(self.memory_limit) if self.memory_limit else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
)
else:
requests_value = build_dict(
accelerators=str(self.accelerators) if self.accelerators else None,
vcpu=str(self.vcpu) if self.vcpu else None,
memory=str(self.memory) if self.memory else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
**{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
)
limits_value = build_dict(
accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
memory=str(self.memory_limit) if self.memory_limit else None,
**{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
**{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
)

# Build container
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,12 @@
"minimum": 0,
"description": "Limit for the amount of memory in GiB"
},
"efa_interfaces": {
"efa": {
"type": "integer",
"minimum": 0,
"description": "Number of EFA interfaces for the instance"
"description": "Number of EFA interfaces"
},
"efa_interfaces_limit": {
"efa_limit": {
"type": "integer",
"minimum": 0,
"description": "Limit for the number of EFA interfaces"
Expand Down
7 changes: 1 addition & 6 deletions src/sagemaker/hyperpod/training/quota_allocation_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,15 +282,10 @@ def _is_valid(vcpu: Optional[float], memory_in_gib: Optional[float], accelerator
if (instance_type is None and has_gpu_quota_allocation) or (instance_type is None and accelerator_partition_type):
return False, "Instance-type must be specified when accelerators, accelerator_partition_type, vcpu, or memory-in-gib specified"

node_specified = node_count is not None and node_count > 0

# Check if instance_type is valid only when it's provided
if instance_type is not None and (INSTANCE_RESOURCES.get(instance_type) is None):
return False, f"Invalid instance-type {instance_type}. Please re-check the instance type and contact AWS for support."
if instance_type is not None:
#both resources and node count specified
if (has_gpu_quota_allocation and node_specified):
return False, f"Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type {instance_type}"

return True, ""


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,37 +208,6 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
assert result.returncode == 0
logger.info(f"Successfully deleted job: {test_job_name}")

def test_invalid_node_count_accelerators_parameter(self, test_job_name):
"""Test that invalid case where both node-count and accelerators are provided"""

# Test with both node-count and accelerators parameters
create_cmd = [
"hyp", "create", "hyp-pytorch-job",
"--version", "1.1",
"--job-name", test_job_name,
"--image", "pytorch:latest",
"--pull-policy", "IfNotPresent",
"--tasks-per-node", "1",
"--accelerators", "1",
"--instance-type", "ml.g5.8xlarge",
"--vcpu", "3",
"--memory", "1",
"--accelerators-limit", "1",
"--vcpu-limit", "4",
"--memory-limit", "2",
"--node-count", "1",
"--queue-name", QUEUE,
"--namespace", NAMESPACE
]
result = subprocess.run(
create_cmd,
capture_output=True,
text=True
)
assert result.returncode != 0
assert "Either node-count OR a combination of accelerators, vcpu, " in result.stdout
assert "memory-in-gib must be specified for instance-type ml.g5.8xlarge" in result.stdout

def test_invalid_no_node_count_or_quota_parameter(self, test_job_name):
"""Test that case where both node-count and any of the quota parameters are provided"""
# Test with no node-count, no accelerators/vcpu/memory parameters
Expand Down
4 changes: 2 additions & 2 deletions test/unit_tests/cli/test_quota_allocation_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,8 @@ def test_is_valid_invalid_instance_type(self):

def test_is_valid_both_node_count_and_resources(self):
valid, message = _is_valid(4.0, None, None, None, 2, "ml.g5.xlarge")
assert not valid
assert message == "Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge"
assert valid
assert message == ""

def test_is_valid_both_node_count_and_limits(self):
valid, message = _is_valid(None, None, None, None, 2, "ml.g5.xlarge")
Expand Down
81 changes: 80 additions & 1 deletion test/unit_tests/training/test_pytorch_job_template_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest
from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig
from hyperpod_pytorch_job_template.v1_0.model import PyTorchJobConfig as PyTorchJobConfigV1_0
from hyperpod_pytorch_job_template.v1_1.template import TEMPLATE_CONTENT
from jinja2 import Template


class TestPyTorchJobConfigEFA(unittest.TestCase):
Expand Down Expand Up @@ -98,7 +100,7 @@ def test_user_specified_efa_overrides_default(self):
job_name="test-custom-efa",
image="pytorch:latest",
accelerators=4,
efa_interfaces=2,
efa=2,
instance_type="ml.p4d.24xlarge"
)

Expand Down Expand Up @@ -199,3 +201,80 @@ def test_v1_0_model_to_domain_deep_health_check_label(self):

if __name__ == '__main__':
unittest.main()


class TestJinjaTemplateRendering(unittest.TestCase):
"""Test that jinja template variables match schema field names."""

def test_all_resource_fields_render_in_template(self):
"""Verify all schema resource fields are correctly rendered by the jinja template."""
template = Template(TEMPLATE_CONTENT)
rendered = template.render(
job_name="test-resources",
namespace="default",
image="pytorch:latest",
pull_policy="Always",
node_count=2,
accelerators=8,
vcpu=40,
memory=800,
efa=4,
accelerators_limit=8,
vcpu_limit=48,
memory_limit=900,
efa_limit=4,
instance_type="ml.p4d.24xlarge",
queue_name="test-queue",
priority="high",
preferred_topology="topology.kubernetes.io/zone",
required_topology="topology.kubernetes.io/zone",
tasks_per_node=8,
deep_health_check_passed_nodes_only=True,
service_account_name="training-sa",
scheduler_type="custom-scheduler",
max_retry=3,
)
# Requests
self.assertIn("nvidia.com/gpu: 8", rendered)
self.assertIn("cpu: 40", rendered)
self.assertIn("memory: 800Gi", rendered)
self.assertIn("vpc.amazonaws.com/efa: 4", rendered)
# Limits
self.assertIn("cpu: 48", rendered)
self.assertIn("memory: 900Gi", rendered)
self.assertEqual(rendered.count("nvidia.com/gpu: 8"), 2)
self.assertEqual(rendered.count("vpc.amazonaws.com/efa: 4"), 2)
# Replicas
self.assertIn("replicas: 2", rendered)
# Node selector
self.assertIn("node.kubernetes.io/instance-type: ml.p4d.24xlarge", rendered)
self.assertIn('sagemaker.amazonaws.com/deep-health-check-status: "Passed"', rendered)
# Kueue labels
self.assertIn("kueue.x-k8s.io/queue-name: test-queue", rendered)
self.assertIn("kueue.x-k8s.io/priority-class: high", rendered)
# Topology
self.assertIn("kueue.x-k8s.io/podset-preferred-topology: topology.kubernetes.io/zone", rendered)
self.assertIn("kueue.x-k8s.io/podset-required-topology: topology.kubernetes.io/zone", rendered)
# Container config
self.assertIn("imagePullPolicy: Always", rendered)
self.assertIn('nprocPerNode: "8"', rendered)
self.assertIn("serviceAccountName: training-sa", rendered)
self.assertIn("schedulerName: custom-scheduler", rendered)
# Run policy
self.assertIn("jobMaxRetryCount: 3", rendered)

def test_accelerator_partition_fields_render_in_template(self):
"""Verify accelerator partition fields render correctly (mutually exclusive with accelerators)."""
template = Template(TEMPLATE_CONTENT)
rendered = template.render(
job_name="test-mig",
namespace="default",
image="pytorch:latest",
accelerator_partition_type="mig-1g.5gb",
accelerator_partition_count=2,
accelerator_partition_limit=2,
instance_type="ml.p4d.24xlarge",
)
self.assertIn("nvidia.com/mig-1g.5gb: 2", rendered)
self.assertEqual(rendered.count("nvidia.com/mig-1g.5gb: 2"), 2)
self.assertIn('nvidia.com/mig.config.state: "success"', rendered)
Loading