ROCm · coketaste · May 29, 2026
@@ -2,6 +2,72 @@
 
 This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
 
+## Behavioral Guidelines
+
+Behavioral guidelines to reduce common LLM coding mistakes. Merge with project-specific instructions as needed.
+
+**Tradeoff:** These guidelines bias toward caution over speed. For trivial tasks, use judgment.
+
+## 1. Think Before Coding
+
+**Don't assume. Don't hide confusion. Surface tradeoffs.**
+
+Before implementing:
+- State your assumptions explicitly. If uncertain, ask.
+- If multiple interpretations exist, present them - don't pick silently.
+- If a simpler approach exists, say so. Push back when warranted.
+- If something is unclear, stop. Name what's confusing. Ask.
+
+## 2. Simplicity First
+
+**Minimum code that solves the problem. Nothing speculative.**
+
+- No features beyond what was asked.
+- No abstractions for single-use code.
+- No "flexibility" or "configurability" that wasn't requested.
+- No error handling for impossible scenarios.
+- If you write 200 lines and it could be 50, rewrite it.
+
+Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.
+
+## 3. Surgical Changes
+
+**Touch only what you must. Clean up only your own mess.**
+
+When editing existing code:
+- Don't "improve" adjacent code, comments, or formatting.
+- Don't refactor things that aren't broken.
+- Match existing style, even if you'd do it differently.
+- If you notice unrelated dead code, mention it - don't delete it.
+
+When your changes create orphans:
+- Remove imports/variables/functions that YOUR changes made unused.
+- Don't remove pre-existing dead code unless asked.
+
+The test: Every changed line should trace directly to the user's request.
+
+## 4. Goal-Driven Execution
+
+**Define success criteria. Loop until verified.**
+
+Transform tasks into verifiable goals:
+- "Add validation" → "Write tests for invalid inputs, then make them pass"
+- "Fix the bug" → "Write a test that reproduces it, then make it pass"
+- "Refactor X" → "Ensure tests pass before and after"
+
+For multi-step tasks, state a brief plan:
+```
+1. [Step] → verify: [check]
+2. [Step] → verify: [check]
+3. [Step] → verify: [check]
+```
+
+Strong success criteria let you loop independently. Weak criteria ("make it work") require constant clarification.
+
+---
+
+**These guidelines are working if:** fewer unnecessary changes in diffs, fewer rewrites due to overcomplication, and clarifying questions come before implementation rather than after mistakes.
+
 ## Development Setup
 
 ```bash

@@ -24,6 +24,23 @@
     "sglang-disagg"
 ]
 
+# Alternate spellings for distributed launcher values → canonical form.
+# Add new aliases here only; do not branch on alternate spellings at dispatch sites.
+_LAUNCHER_ALIASES: Dict[str, str] = {
+    "sglang_disagg": "sglang-disagg",
+}
+
+
+def canonicalize_distributed_launcher(launcher: Optional[str]) -> Optional[str]:
+    """Normalize alternate launcher spellings to their canonical form.
+
+    Resolves aliases (e.g. ``sglang_disagg`` → ``sglang-disagg``). Unknown or
+    empty values are returned unchanged; callers do their own validation.
+    """
+    if not launcher:
+        return launcher
+    return _LAUNCHER_ALIASES.get(launcher, launcher)
+
 # Tool names that use rocprof / rocprofv3 wrapping and need MPI-aware rocprofv3 on multi-node.
 _ROCPROF_FAMILY_TOOL_NAMES = frozenset(
     {

@@ -13,7 +13,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
-from .common import configure_multi_node_profiling
+from .common import canonicalize_distributed_launcher, configure_multi_node_profiling
 from .k8s_names import sanitize_k8s_container_name, sanitize_k8s_label_value
 from .k8s_secrets import (
     CONFIGMAP_MAX_BYTES,
@@ -339,7 +339,7 @@ def _prepare_template_context(
                 model_args=model_info.get("args", ""),
             )
 
-        elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg":
+        elif canonicalize_distributed_launcher(launcher_type) == "sglang-disagg":
             if nnodes < 3:
                 raise ValueError(
                     f"SGLang Disaggregated requires minimum 3 nodes "

@@ -19,7 +19,11 @@
 
 from .base import BaseDeployment, DeploymentConfig, DeploymentResult, DeploymentStatus, create_jinja_env
 from .primus_backend import infer_primus_backend_from_model_name, merged_primus_config
-from .common import configure_multi_node_profiling, normalize_launcher
+from .common import (
+    canonicalize_distributed_launcher,
+    configure_multi_node_profiling,
+    normalize_launcher,
+)
 from .config_loader import ConfigLoader, apply_deployment_config
 from .slurm_node_selector import SlurmNodeSelector
 from madengine.utils.gpu_config import resolve_runtime_gpus
@@ -347,7 +351,7 @@ def _generate_launcher_command(
             return self._generate_vllm_command(nnodes, nproc_per_node, master_port)
         elif launcher_type == "sglang":
             return self._generate_sglang_command(nnodes, nproc_per_node, master_port)
-        elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg":
+        elif canonicalize_distributed_launcher(launcher_type) == "sglang-disagg":
             return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port)
         elif launcher_type == "deepspeed":
             return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port)

@@ -31,6 +31,7 @@
 )
 from madengine.reporting.update_perf_super import update_perf_super_json, update_perf_super_csv
 from madengine.utils.gpu_config import resolve_runtime_gpus
+from madengine.deployment.common import canonicalize_distributed_launcher
 from madengine.utils.config_parser import ConfigParser
 from madengine.utils.path_utils import scripts_base_dir_from
 from madengine.utils.run_details import get_build_number, get_pipeline
@@ -661,7 +662,7 @@ def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: i
         """Generate distributed process launcher command for Docker local deployment.
 
         Docker local is always single-node. This parallels
-        SlurmDeployer._generate_launcher_command() and
+        SlurmDeployment._generate_launcher_command() and
         KubernetesLauncherMixin._generate_torchrun_command() for the
         Docker local path.
 
@@ -682,6 +683,46 @@ def _generate_local_launcher_command(self, launcher_type: str, nproc_per_node: i
         else:
             return f"torchrun --standalone --nproc_per_node={nproc_per_node}"
 
+    def _resolve_local_multi_node_runner_env(
+        self, model_info: typing.Dict, resolved_gpu_count: int
+    ) -> None:
+        """Set ``docker_env_vars["MAD_MULTI_NODE_RUNNER"]`` for local Docker runs.
+
+        No-op if the env var is already set. Resolves launcher from
+        ``additional_context.distributed.launcher``, then ``model_info.distributed.launcher``,
+        then ``MAD_LAUNCHER``; falls back to ``torchrun`` for unknown values.
+        Self-managing launchers (vllm/sglang/sglang-disagg/primus) set the var
+        to an empty string so downstream scripts under ``set -u`` don't fail.
+        """
+        if "MAD_MULTI_NODE_RUNNER" in self.context.ctx["docker_env_vars"]:
+            return
+        launcher = ""
+        if self.additional_context:
+            launcher = self.additional_context.get("distributed", {}).get("launcher", "")
+        if not launcher and model_info.get("distributed"):
+            launcher = model_info["distributed"].get("launcher", "")
+        if not launcher:
+            launcher = os.environ.get("MAD_LAUNCHER", "")
+        canonical_launcher = canonicalize_distributed_launcher(launcher)
+        valid_local_launchers = (
+            "torchrun", "megatron", "megatron-lm", "torchtitan",
+            "deepspeed", "vllm", "sglang", "sglang-disagg", "primus",
+        )
+        if canonical_launcher in valid_local_launchers:
+            dist_launcher = canonical_launcher
+        else:
+            if launcher:
+                print(f"⚠️  Unrecognized launcher '{launcher}'; "
+                      f"defaulting to torchrun for local deployment")
+            dist_launcher = "torchrun"
+        runtime_ngpus = int(self.context.ctx["docker_env_vars"].get(
+            "MAD_RUNTIME_NGPUS", str(resolved_gpu_count)))
+        launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus)
+        self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd
+        if launcher_cmd:
+            print(f"ℹ️  Set MAD_MULTI_NODE_RUNNER for local deployment "
+                  f"(launcher={dist_launcher}): {launcher_cmd}")
+
     _ENV_KEY_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
 
     def get_env_arg(self, run_env: typing.Dict) -> str:
@@ -1119,29 +1160,7 @@ def run_container(
         # SLURM and K8s generate this in their deployment layers (slurm.py,
         # kubernetes_launcher_mixin.py). Docker local has no such layer, so
         # we generate it here after GPU resolution provides MAD_RUNTIME_NGPUS.
-        # Defaults to torchrun when launcher is a deployment-level value
-        # ("docker", "native") rather than a distributed launcher.
-        # For models that hardcode their own launcher (e.g. HuggingFace scripts
-        # calling torchrun directly), this env var is simply unused.
-        if "MAD_MULTI_NODE_RUNNER" not in self.context.ctx["docker_env_vars"]:
-            launcher = ""
-            if self.additional_context:
-                launcher = self.additional_context.get("distributed", {}).get("launcher", "")
-            if not launcher and model_info.get("distributed"):
-                launcher = model_info["distributed"].get("launcher", "")
-            if not launcher:
-                launcher = os.environ.get("MAD_LAUNCHER", "")
-            dist_launcher = launcher if launcher in (
-                "torchrun", "megatron", "megatron-lm", "torchtitan",
-                "deepspeed", "vllm", "sglang", "sglang-disagg", "primus",
-            ) else "torchrun"
-            runtime_ngpus = int(self.context.ctx["docker_env_vars"].get(
-                "MAD_RUNTIME_NGPUS", str(resolved_gpu_count)))
-            launcher_cmd = self._generate_local_launcher_command(dist_launcher, runtime_ngpus)
-            if launcher_cmd:
-                self.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] = launcher_cmd
-                print(f"ℹ️  Set MAD_MULTI_NODE_RUNNER for local deployment "
-                      f"(launcher={dist_launcher}): {launcher_cmd}")
+        self._resolve_local_multi_node_runner_env(model_info, resolved_gpu_count)
 
         # Filter out MIOPEN_USER_DB_PATH from run_env if it exists
         # It should be passed via docker_env_vars in context instead

@@ -291,3 +291,113 @@ def test_invalid_mode_falls_back_to_lite(self):
         runner.gather_system_env_details(pep, "my_model")
         args = pep["pre_scripts"][0]["args"]
         assert args == "my_model_env lite UBUNTU"
+
+
+class TestGenerateLocalLauncherCommand:
+    """_generate_local_launcher_command: launcher_type → MAD_MULTI_NODE_RUNNER command."""
+
+    def _runner(self):
+        return ContainerRunner.__new__(ContainerRunner)
+
+    @pytest.mark.parametrize("launcher", ["torchrun", "megatron", "megatron-lm", "torchtitan"])
+    def test_torchrun_family_emits_torchrun_standalone(self, launcher):
+        cmd = self._runner()._generate_local_launcher_command(launcher, nproc_per_node=8)
+        assert cmd == "torchrun --standalone --nproc_per_node=8"
+
+    def test_deepspeed_emits_deepspeed_command(self):
+        cmd = self._runner()._generate_local_launcher_command("deepspeed", nproc_per_node=4)
+        assert cmd == "deepspeed --num_gpus=4"
+
+    @pytest.mark.parametrize("launcher", ["vllm", "sglang", "sglang-disagg", "primus"])
+    def test_self_managed_launchers_emit_empty(self, launcher):
+        assert self._runner()._generate_local_launcher_command(launcher, nproc_per_node=8) == ""
+
+    def test_unknown_launcher_falls_back_to_torchrun(self):
+        cmd = self._runner()._generate_local_launcher_command("bogus", nproc_per_node=2)
+        assert cmd == "torchrun --standalone --nproc_per_node=2"
+
+
+class TestResolveLocalMultiNodeRunnerEnv:
+    """_resolve_local_multi_node_runner_env: wires launcher → docker_env_vars."""
+
+    def _runner(self, docker_env_vars=None, additional_context=None):
+        runner = ContainerRunner.__new__(ContainerRunner)
+        runner.context = MagicMock()
+        runner.context.ctx = {"docker_env_vars": dict(docker_env_vars or {})}
+        runner.additional_context = additional_context
+        return runner
+
+    @pytest.mark.parametrize(
+        "launcher,expected",
+        [
+            ("torchrun", "torchrun --standalone --nproc_per_node=8"),
+            ("megatron", "torchrun --standalone --nproc_per_node=8"),
+            ("megatron-lm", "torchrun --standalone --nproc_per_node=8"),
+            ("torchtitan", "torchrun --standalone --nproc_per_node=8"),
+            ("deepspeed", "deepspeed --num_gpus=8"),
+        ],
+    )
+    def test_sets_expected_command_from_additional_context(self, launcher, expected):
+        runner = self._runner(
+            additional_context={"distributed": {"launcher": launcher}},
+        )
+        runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8)
+        assert runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] == expected
+
+    def test_does_not_override_user_provided_value(self):
+        runner = self._runner(
+            docker_env_vars={"MAD_MULTI_NODE_RUNNER": "custom-runner --foo"},
+            additional_context={"distributed": {"launcher": "torchrun"}},
+        )
+        runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8)
+        assert (
+            runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"]
+            == "custom-runner --foo"
+        )
+
+    @pytest.mark.parametrize(
+        "launcher",
+        ["vllm", "sglang", "sglang-disagg", "sglang_disagg", "primus"],
+    )
+    def test_self_managed_launchers_set_empty_string(self, launcher):
+        """Self-managing launchers set the var to "" (defined but empty),
+        so downstream scripts under set -u don't fail referencing it.
+        Covers the ``sglang_disagg`` underscore alias to lock in the
+        canonicalize_distributed_launcher() routing."""
+        runner = self._runner(
+            additional_context={"distributed": {"launcher": launcher}},
+        )
+        runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=4)
+        assert "MAD_MULTI_NODE_RUNNER" in runner.context.ctx["docker_env_vars"]
+        assert runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"] == ""
+
+    def test_falls_back_to_model_info_launcher(self):
+        runner = self._runner(additional_context=None)
+        runner._resolve_local_multi_node_runner_env(
+            {"distributed": {"launcher": "deepspeed"}}, resolved_gpu_count=2
+        )
+        assert (
+            runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"]
+            == "deepspeed --num_gpus=2"
+        )
+
+    def test_unknown_launcher_defaults_to_torchrun(self):
+        runner = self._runner(
+            additional_context={"distributed": {"launcher": "docker"}},
+        )
+        runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=1)
+        assert (
+            runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"]
+            == "torchrun --standalone --nproc_per_node=1"
+        )
+
+    def test_uses_mad_runtime_ngpus_when_set(self):
+        runner = self._runner(
+            docker_env_vars={"MAD_RUNTIME_NGPUS": "4"},
+            additional_context={"distributed": {"launcher": "torchrun"}},
+        )
+        runner._resolve_local_multi_node_runner_env({}, resolved_gpu_count=8)
+        assert (
+            runner.context.ctx["docker_env_vars"]["MAD_MULTI_NODE_RUNNER"]
+            == "torchrun --standalone --nproc_per_node=4"
+        )
@@ -9,6 +9,7 @@
 from madengine.deployment.base import BaseDeployment, DeploymentConfig, create_jinja_env
 from madengine.deployment.common import (
     VALID_LAUNCHERS,
+    canonicalize_distributed_launcher,
     configure_multi_node_profiling,
     is_rocprofv3_available,
     normalize_launcher,
@@ -70,6 +71,24 @@ def test_invalid_or_missing_launcher_non_k8s_returns_docker(self, deployment):
         assert normalize_launcher(None, deployment) == "docker"
 
 
+class TestCanonicalizeDistributedLauncher:
+    """canonicalize_distributed_launcher resolves alternate spellings."""
+
+    def test_underscore_form_maps_to_canonical_hyphen_form(self):
+        assert canonicalize_distributed_launcher("sglang_disagg") == "sglang-disagg"
+
+    def test_canonical_form_passthrough(self):
+        assert canonicalize_distributed_launcher("sglang-disagg") == "sglang-disagg"
+        assert canonicalize_distributed_launcher("torchrun") == "torchrun"
+
+    @pytest.mark.parametrize("value", [None, ""])
+    def test_empty_returned_unchanged(self, value):
+        assert canonicalize_distributed_launcher(value) == value
+
+    def test_unknown_value_returned_unchanged(self):
+        assert canonicalize_distributed_launcher("bogus_launcher") == "bogus_launcher"
+
+
 class TestToolsIncludeRocprofFamily:
     """tools_include_rocprof_family."""