From 21382e95fa8183f48800379a3dfe5f29407486e3 Mon Sep 17 00:00:00 2001
From: Farhan Tejani <8650465+FarhanTejani@users.noreply.github.com>
Date: Tue, 17 Mar 2026 21:25:06 -0700
Subject: [PATCH] Fix node_count resource fields mutual exclusivity and EFA
 field naming (#383, #306)

---
 .../v1_1/model.py                             | 14 ++--
 .../v1_1/schema.json                          |  6 +-
 .../training/quota_allocation_util.py         |  7 +-
 .../training/cli/test_gpu_quota_allocation.py | 31 -------
 .../cli/test_quota_allocation_util.py         |  4 +-
 .../test_pytorch_job_template_model.py        | 81 ++++++++++++++++++-
 6 files changed, 93 insertions(+), 50 deletions(-)

diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
index 3c6c083d..12a73e11 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/model.py
@@ -195,12 +195,12 @@ class PyTorchJobConfig(BaseModel):
         default=None,
         description="Limit for the amount of memory in GiB",
     )
-    efa_interfaces: Optional[int] = Field(
+    efa: Optional[int] = Field(
         default=None,
-        description="Number of EFA interfaces for the instance",
+        description="Number of EFA interfaces",
         ge=0
     )
-    efa_interfaces_limit: Optional[int] = Field(
+    efa_limit: Optional[int] = Field(
         default=None,
         description="Limit for the number of EFA interfaces",
         ge=0
@@ -464,26 +464,26 @@ def build_dict(**kwargs):
                     **{partition_resource_key: str(self.accelerator_partition_count)} if self.accelerator_partition_count else {},
                     vcpu=str(self.vcpu) if self.vcpu else None,
                     memory=str(self.memory) if self.memory else None,
-                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
+                    **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
                 )
                 limits_value = build_dict(
                     **{partition_resource_key: str(self.accelerator_partition_limit)} if self.accelerator_partition_limit else {},
                     vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                     memory=str(self.memory_limit) if self.memory_limit else None,
-                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
+                    **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
                 )
             else:
                 requests_value = build_dict(
                     accelerators=str(self.accelerators) if self.accelerators else None,
                     vcpu=str(self.vcpu) if self.vcpu else None,
                     memory=str(self.memory) if self.memory else None,
-                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces)} if self.efa_interfaces else {},
+                    **{"vpc.amazonaws.com/efa": str(self.efa)} if self.efa else {},
                 )
                 limits_value = build_dict(
                     accelerators=str(self.accelerators_limit) if self.accelerators_limit else None,
                     vcpu=str(self.vcpu_limit) if self.vcpu_limit else None,
                     memory=str(self.memory_limit) if self.memory_limit else None,
-                    **{"vpc.amazonaws.com/efa": str(self.efa_interfaces_limit)} if self.efa_interfaces_limit else {},
+                    **{"vpc.amazonaws.com/efa": str(self.efa_limit)} if self.efa_limit else {},
                 )
 
         # Build container
diff --git a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
index d19ec4de..370a0a21 100644
--- a/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
+++ b/hyperpod-pytorch-job-template/hyperpod_pytorch_job_template/v1_1/schema.json
@@ -305,12 +305,12 @@
       "minimum": 0,
       "description": "Limit for the amount of memory in GiB"
     },
-    "efa_interfaces": {
+    "efa": {
       "type": "integer",
       "minimum": 0,
-      "description": "Number of EFA interfaces for the instance"
+      "description": "Number of EFA interfaces"
     },
-    "efa_interfaces_limit": {
+    "efa_limit": {
       "type": "integer",
       "minimum": 0,
       "description": "Limit for the number of EFA interfaces"
diff --git a/src/sagemaker/hyperpod/training/quota_allocation_util.py b/src/sagemaker/hyperpod/training/quota_allocation_util.py
index 93c3258a..ddb15260 100644
--- a/src/sagemaker/hyperpod/training/quota_allocation_util.py
+++ b/src/sagemaker/hyperpod/training/quota_allocation_util.py
@@ -282,15 +282,10 @@ def _is_valid(vcpu: Optional[float], memory_in_gib: Optional[float], accelerator
     if (instance_type is None and has_gpu_quota_allocation) or (instance_type is None and accelerator_partition_type):
         return False, "Instance-type must be specified when accelerators, accelerator_partition_type, vcpu, or memory-in-gib specified"
 
-    node_specified = node_count is not None and node_count > 0
-
     # Check if instance_type is valid only when it's provided
     if instance_type is not None and (INSTANCE_RESOURCES.get(instance_type) is None):
         return False, f"Invalid instance-type {instance_type}. Please re-check the instance type and contact AWS for support."
-    if instance_type is not None:
-        #both resources and node count specified
-        if (has_gpu_quota_allocation and node_specified):
-            return False, f"Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type {instance_type}"
+
     return True, ""
 
 
diff --git a/test/integration_tests/training/cli/test_gpu_quota_allocation.py b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
index c4ea1480..72f382c3 100644
--- a/test/integration_tests/training/cli/test_gpu_quota_allocation.py
+++ b/test/integration_tests/training/cli/test_gpu_quota_allocation.py
@@ -208,37 +208,6 @@ def test_create_job_with_accelerators_memory_parameters(self, test_job_name):
         assert result.returncode == 0
         logger.info(f"Successfully deleted job: {test_job_name}")
 
-    def test_invalid_node_count_accelerators_parameter(self, test_job_name):
-        """Test that invalid case where both node-count and accelerators are provided"""
-
-        # Test with both node-count and accelerators parameters
-        create_cmd = [
-            "hyp", "create", "hyp-pytorch-job",
-            "--version", "1.1",
-            "--job-name", test_job_name,
-            "--image", "pytorch:latest",
-            "--pull-policy", "IfNotPresent",
-            "--tasks-per-node", "1",
-            "--accelerators", "1",
-            "--instance-type", "ml.g5.8xlarge",
-            "--vcpu", "3",
-            "--memory", "1",
-            "--accelerators-limit", "1",
-            "--vcpu-limit", "4",
-            "--memory-limit", "2",
-            "--node-count", "1",
-            "--queue-name", QUEUE,
-            "--namespace", NAMESPACE
-        ]
-        result = subprocess.run(
-                    create_cmd,
-                    capture_output=True,
-                    text=True
-                )
-        assert result.returncode != 0
-        assert "Either node-count OR a combination of accelerators, vcpu, " in result.stdout
-        assert "memory-in-gib must be specified for instance-type ml.g5.8xlarge" in result.stdout
-
     def test_invalid_no_node_count_or_quota_parameter(self, test_job_name):
         """Test that case where both node-count and any of the quota parameters are provided"""
         # Test with no node-count, no accelerators/vcpu/memory parameters
diff --git a/test/unit_tests/cli/test_quota_allocation_util.py b/test/unit_tests/cli/test_quota_allocation_util.py
index 4a2cb79b..7ab8d2e1 100644
--- a/test/unit_tests/cli/test_quota_allocation_util.py
+++ b/test/unit_tests/cli/test_quota_allocation_util.py
@@ -218,8 +218,8 @@ def test_is_valid_invalid_instance_type(self):
 
     def test_is_valid_both_node_count_and_resources(self):
         valid, message = _is_valid(4.0, None, None, None, 2, "ml.g5.xlarge")
-        assert not valid
-        assert message == "Either node-count OR a combination of accelerators, vcpu, memory-in-gib must be specified for instance-type ml.g5.xlarge"
+        assert valid
+        assert message == ""
 
     def test_is_valid_both_node_count_and_limits(self):
         valid, message = _is_valid(None, None, None, None, 2, "ml.g5.xlarge")
diff --git a/test/unit_tests/training/test_pytorch_job_template_model.py b/test/unit_tests/training/test_pytorch_job_template_model.py
index 90af9efc..d95b8f44 100644
--- a/test/unit_tests/training/test_pytorch_job_template_model.py
+++ b/test/unit_tests/training/test_pytorch_job_template_model.py
@@ -1,6 +1,8 @@
 import unittest
 from hyperpod_pytorch_job_template.v1_1.model import PyTorchJobConfig
 from hyperpod_pytorch_job_template.v1_0.model import PyTorchJobConfig as PyTorchJobConfigV1_0
+from hyperpod_pytorch_job_template.v1_1.template import TEMPLATE_CONTENT
+from jinja2 import Template
 
 
 class TestPyTorchJobConfigEFA(unittest.TestCase):
@@ -98,7 +100,7 @@ def test_user_specified_efa_overrides_default(self):
             job_name="test-custom-efa",
             image="pytorch:latest",
             accelerators=4,
-            efa_interfaces=2,
+            efa=2,
             instance_type="ml.p4d.24xlarge"
         )
 
@@ -199,3 +201,80 @@ def test_v1_0_model_to_domain_deep_health_check_label(self):
 
 if __name__ == '__main__':
     unittest.main()
+
+
+class TestJinjaTemplateRendering(unittest.TestCase):
+    """Test that jinja template variables match schema field names."""
+
+    def test_all_resource_fields_render_in_template(self):
+        """Verify all schema resource fields are correctly rendered by the jinja template."""
+        template = Template(TEMPLATE_CONTENT)
+        rendered = template.render(
+            job_name="test-resources",
+            namespace="default",
+            image="pytorch:latest",
+            pull_policy="Always",
+            node_count=2,
+            accelerators=8,
+            vcpu=40,
+            memory=800,
+            efa=4,
+            accelerators_limit=8,
+            vcpu_limit=48,
+            memory_limit=900,
+            efa_limit=4,
+            instance_type="ml.p4d.24xlarge",
+            queue_name="test-queue",
+            priority="high",
+            preferred_topology="topology.kubernetes.io/zone",
+            required_topology="topology.kubernetes.io/zone",
+            tasks_per_node=8,
+            deep_health_check_passed_nodes_only=True,
+            service_account_name="training-sa",
+            scheduler_type="custom-scheduler",
+            max_retry=3,
+        )
+        # Requests
+        self.assertIn("nvidia.com/gpu: 8", rendered)
+        self.assertIn("cpu: 40", rendered)
+        self.assertIn("memory: 800Gi", rendered)
+        self.assertIn("vpc.amazonaws.com/efa: 4", rendered)
+        # Limits
+        self.assertIn("cpu: 48", rendered)
+        self.assertIn("memory: 900Gi", rendered)
+        self.assertEqual(rendered.count("nvidia.com/gpu: 8"), 2)
+        self.assertEqual(rendered.count("vpc.amazonaws.com/efa: 4"), 2)
+        # Replicas
+        self.assertIn("replicas: 2", rendered)
+        # Node selector
+        self.assertIn("node.kubernetes.io/instance-type: ml.p4d.24xlarge", rendered)
+        self.assertIn('sagemaker.amazonaws.com/deep-health-check-status: "Passed"', rendered)
+        # Kueue labels
+        self.assertIn("kueue.x-k8s.io/queue-name: test-queue", rendered)
+        self.assertIn("kueue.x-k8s.io/priority-class: high", rendered)
+        # Topology
+        self.assertIn("kueue.x-k8s.io/podset-preferred-topology: topology.kubernetes.io/zone", rendered)
+        self.assertIn("kueue.x-k8s.io/podset-required-topology: topology.kubernetes.io/zone", rendered)
+        # Container config
+        self.assertIn("imagePullPolicy: Always", rendered)
+        self.assertIn('nprocPerNode: "8"', rendered)
+        self.assertIn("serviceAccountName: training-sa", rendered)
+        self.assertIn("schedulerName: custom-scheduler", rendered)
+        # Run policy
+        self.assertIn("jobMaxRetryCount: 3", rendered)
+
+    def test_accelerator_partition_fields_render_in_template(self):
+        """Verify accelerator partition fields render correctly (mutually exclusive with accelerators)."""
+        template = Template(TEMPLATE_CONTENT)
+        rendered = template.render(
+            job_name="test-mig",
+            namespace="default",
+            image="pytorch:latest",
+            accelerator_partition_type="mig-1g.5gb",
+            accelerator_partition_count=2,
+            accelerator_partition_limit=2,
+            instance_type="ml.p4d.24xlarge",
+        )
+        self.assertIn("nvidia.com/mig-1g.5gb: 2", rendered)
+        self.assertEqual(rendered.count("nvidia.com/mig-1g.5gb: 2"), 2)
+        self.assertIn('nvidia.com/mig.config.state: "success"', rendered)