From 21382e95fa8183f48800379a3dfe5f29407486e3 Mon Sep 17 00:00:00 2001 From: Farhan Tejani <8650465+FarhanTejani@users.noreply.github.com> Date: Tue, 17 Mar 2026 21:25:06 -0700 Subject: [PATCH] Fix node_count resource fields mutual exclusivity and EFA field naming (#383, #306) --- .../v1_1/model.py | 14 ++-- .../v1_1/schema.json | 6 +- .../training/quota_allocation_util.py | 7 +- .../training/cli/test_gpu_quota_allocation.py | 31 ------- .../cli/test_quota_allocation_util.py | 4 +- .../test_pytorch_job_template_model.py | 81 ++++++++++++++++++- 6 files changed, 93 insertions(+), 50 deletions(-) diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py index 3c6c083d..12a73e11 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py @@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel): default=None, description="Limit for the amount of memory in GiB", ) - efa_interfaces: Optional[int] = Field( + efa: Optional[int] = Field( default=None, - description="Number of EFA interfaces for the instance", + description="Number of EFA interfaces", ge=0 ) - efa_interfaces_limit: Optional[int] = Field( + efa_limit: Optional[int] = Field( default=None, description="Limit for the number of EFA interfaces", ge=0 @@ -464,26 +464,26 @@ def build_dict(**kwargs): **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {}, vcpu=str(self.vcpu) if self.vcpu else None, memory=str(self.memory) if self.memory else None, - **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {}, + **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}, ) limits_value = build_dict( **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {}, vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, memory=str(self.memory_limit) if self.memory_limit else None, - **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {}, + **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}, ) else: requests_value = build_dict( accelerators=str(self.accelerators) if self.accelerators else None, vcpu=str(self.vcpu) if self.vcpu else None, memory=str(self.memory) if self.memory else None, - **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {}, + **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {}, ) limits_value = build_dict( accelerators=str(self.accelerators_limit) if self.accelerators_limit else None, vcpu=str(self.vcpu_limit) if self.vcpu_limit else None, memory=str(self.memory_limit) if self.memory_limit else None, - **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {}, + **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {}, ) # Build container diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json index d19ec4de..370a0a21 100644 --- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json +++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json @@ -305,12 +305,12 @@ "minimum": 0, "description": "Limit for the amount of memory in GiB" }, - "efa_interfaces": { + "efa": { "type": "integer", "minimum": 0, - "description": "Number of EFA interfaces for the instance" + "description": "Number of EFA interfaces" }, - "efa_interfaces_limit": { + "efa_limit": { "type": "integer", "minimum": 0, "description": "Limit for the number of EFA interfaces" diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py index 93c3258a..ddb15260 100644 --- a/src/sagemaker/hyperpod/training/quota_allocation_util.py +++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py @@ -282,15 +282,10 @@ def _is_valid(vcpu: Optional[float], memory_in_gib: Optional[float], accelerator if (instance_type is None and has_gpu_quota_allocation) or (instance_type is None and accelerator_partition_type): return False, "Instance-type must be specified when accelerators, accelerator_partition_type, vcpu, or memory-in-gib specified" - node_specified = node_count is not None and node_count > 0 - # Check if instance_type is valid only when it's provided if instance_type is not None and (INSTANCE_RESOURCES.get(instance_type) is None): return False, f"Invalid instance-type {instance_type}. Please re-check the instance type and contact AWS for support." - if instance_type is not None: - #both resources and node count specified - if (has_gpu_quota_allocation and node_specified): - return False, f"Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type {instance_type}" + return True, "" diff --git a/test/integration_tests/training/cli/test_gpu_quota_allocation.py b/test/integration_tests/training/cli/test_gpu_quota_allocation.py index c4ea1480..72f382c3 100644 --- a/test/integration_tests/training/cli/test_gpu_quota_allocation.py +++ b/test/integration_tests/training/cli/test_gpu_quota_allocation.py @@ -208,37 +208,6 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name): assert result.returncode == 0 logger.info(f"Successfully deleted job: {test_job_name}") - def test_invalid_node_count_accelerators_parameter(self, test_job_name): - """Test that invalid case where both node-count and accelerators are provided""" - - # Test with both node-count and accelerators parameters - create_cmd = [ - "hyp", "create", "hyp-pytorch-job", - "--version", "1.1", - "--job-name", test_job_name, - "--image", "pytorch:latest", - "--pull-policy", "IfNotPresent", - "--tasks-per-node", "1", - "--accelerators", "1", - "--instance-type", "ml.g5.8xlarge", - "--vcpu", "3", - "--memory", "1", - "--accelerators-limit", "1", - "--vcpu-limit", "4", - "--memory-limit", "2", - "--node-count", "1", - "--queue-name", QUEUE, - "--namespace", NAMESPACE - ] - result = subprocess.run( - create_cmd, - capture_output=True, - text=True - ) - assert result.returncode != 0 - assert "Either node-count OR a combination of accelerators, vcpu, " in result.stdout - assert "memory-in-gib must be specified for instance-type ml.g5.8xlarge" in result.stdout - def test_invalid_no_node_count_or_quota_parameter(self, test_job_name): """Test that case where both node-count and any of the quota parameters are provided""" # Test with no node-count, no accelerators/vcpu/memory parameters diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py index 4a2cb79b..7ab8d2e1 100644 --- a/test/unit_tests/cli/test_quota_allocation_util.py +++ b/test/unit_tests/cli/test_quota_allocation_util.py @@ -218,8 +218,8 @@ def test_is_valid_invalid_instance_type(self): def test_is_valid_both_node_count_and_resources(self): valid, message = _is_valid(4.0, None, None, None, 2, "ml.g5.xlarge") - assert not valid - assert message == "Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge" + assert valid + assert message == "" def test_is_valid_both_node_count_and_limits(self): valid, message = _is_valid(None, None, None, None, 2, "ml.g5.xlarge") diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py index 90af9efc..d95b8f44 100644 --- a/test/unit_tests/training/test_pytorch_job_template_model.py +++ b/test/unit_tests/training/test_pytorch_job_template_model.py @@ -1,6 +1,8 @@ import unittest from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig from hyperpod_pytorch_job_template.v1_0.model import PyTorchJobConfig as PyTorchJobConfigV1_0 +from hyperpod_pytorch_job_template.v1_1.template import TEMPLATE_CONTENT +from jinja2 import Template class TestPyTorchJobConfigEFA(unittest.TestCase): @@ -98,7 +100,7 @@ def test_user_specified_efa_overrides_default(self): job_name="test-custom-efa", image="pytorch:latest", accelerators=4, - efa_interfaces=2, + efa=2, instance_type="ml.p4d.24xlarge" ) @@ -199,3 +201,80 @@ def test_v1_0_model_to_domain_deep_health_check_label(self): if __name__ == '__main__': unittest.main() + + +class TestJinjaTemplateRendering(unittest.TestCase): + """Test that jinja template variables match schema field names.""" + + def test_all_resource_fields_render_in_template(self): + """Verify all schema resource fields are correctly rendered by the jinja template.""" + template = Template(TEMPLATE_CONTENT) + rendered = template.render( + job_name="test-resources", + namespace="default", + image="pytorch:latest", + pull_policy="Always", + node_count=2, + accelerators=8, + vcpu=40, + memory=800, + efa=4, + accelerators_limit=8, + vcpu_limit=48, + memory_limit=900, + efa_limit=4, + instance_type="ml.p4d.24xlarge", + queue_name="test-queue", + priority="high", + preferred_topology="topology.kubernetes.io/zone", + required_topology="topology.kubernetes.io/zone", + tasks_per_node=8, + deep_health_check_passed_nodes_only=True, + service_account_name="training-sa", + scheduler_type="custom-scheduler", + max_retry=3, + ) + # Requests + self.assertIn("nvidia.com/gpu: 8", rendered) + self.assertIn("cpu: 40", rendered) + self.assertIn("memory: 800Gi", rendered) + self.assertIn("vpc.amazonaws.com/efa: 4", rendered) + # Limits + self.assertIn("cpu: 48", rendered) + self.assertIn("memory: 900Gi", rendered) + self.assertEqual(rendered.count("nvidia.com/gpu: 8"), 2) + self.assertEqual(rendered.count("vpc.amazonaws.com/efa: 4"), 2) + # Replicas + self.assertIn("replicas: 2", rendered) + # Node selector + self.assertIn("node.kubernetes.io/instance-type: ml.p4d.24xlarge", rendered) + self.assertIn('sagemaker.amazonaws.com/deep-health-check-status: "Passed"', rendered) + # Kueue labels + self.assertIn("kueue.x-k8s.io/queue-name: test-queue", rendered) + self.assertIn("kueue.x-k8s.io/priority-class: high", rendered) + # Topology + self.assertIn("kueue.x-k8s.io/podset-preferred-topology: topology.kubernetes.io/zone", rendered) + self.assertIn("kueue.x-k8s.io/podset-required-topology: topology.kubernetes.io/zone", rendered) + # Container config + self.assertIn("imagePullPolicy: Always", rendered) + self.assertIn('nprocPerNode: "8"', rendered) + self.assertIn("serviceAccountName: training-sa", rendered) + self.assertIn("schedulerName: custom-scheduler", rendered) + # Run policy + self.assertIn("jobMaxRetryCount: 3", rendered) + + def test_accelerator_partition_fields_render_in_template(self): + """Verify accelerator partition fields render correctly (mutually exclusive with accelerators).""" + template = Template(TEMPLATE_CONTENT) + rendered = template.render( + job_name="test-mig", + namespace="default", + image="pytorch:latest", + accelerator_partition_type="mig-1g.5gb", + accelerator_partition_count=2, + accelerator_partition_limit=2, + instance_type="ml.p4d.24xlarge", + ) + self.assertIn("nvidia.com/mig-1g.5gb: 2", rendered) + self.assertEqual(rendered.count("nvidia.com/mig-1g.5gb: 2"), 2) + self.assertIn('nvidia.com/mig.config.state: "success"', rendered)