ROCm · ipanfilo · May 26, 2026 · May 26, 2026 · May 27, 2026 · May 29, 2026
@@ -44,6 +44,8 @@
 LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
 if tex.ubuf_built_with_mpi():
     LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "--oversubscribe", "--quiet", "python3"]
+if IS_HIP_EXTENSION:  # HIP/ROCm builds: `timeout` sends TERM after 180 s, KILL 60 s later
+    LAUNCH_CMD = ["timeout", "-k60", "-v", "180"] + LAUNCH_CMD
 
 # Fall back on CUDA IPC if the platform does not support CUDA multicast
 if not tex.device_supports_multicast():
@@ -94,6 +96,8 @@ def _run_gemm_with_overlap(comm_type, bulk, p2p, atomic, aggregate, quantization
         or "NUMERICAL CHECK FAILED" in result.stderr.decode()
         or "NUMERICAL CHECK PASSED" not in result.stdout.decode()
     ):
+        if IS_HIP_EXTENSION and result.returncode in (124, -9):  # -9 = -k SIGKILL escalation
+            pytest.fail("Test timed out", pytrace=False)
         raise AssertionError(result.stderr.decode())
 
 
@@ -155,6 +159,8 @@ def _run_layer_with_overlap(
         or "NUMERICAL CHECK FAILED" in result.stderr.decode()
         or "NUMERICAL CHECK PASSED" not in result.stdout.decode()
     ):
+        if IS_HIP_EXTENSION and result.returncode in (124, -9):  # -9 = -k SIGKILL escalation
+            pytest.fail("Test timed out", pytrace=False)
         raise AssertionError(result.stderr.decode())
 
 

@@ -1,3 +1,5 @@
+# This file was modified for portability to AMDGPU
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
@@ -9,6 +11,7 @@
 import transformer_engine.pytorch as te
 
 import torch
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
 
 fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True)
@@ -19,6 +22,8 @@
 def _run_test(fp_init, sharding_dims, recipe, layer_type):
     test_path = Path(__file__).parent.resolve() / "run_fsdp2_model.py"
     test_cmd = ["torchrun", f"--nproc_per_node={NUM_PROCS}", str(test_path)]
+    if IS_HIP_EXTENSION:  # HIP/ROCm builds: `timeout` sends TERM after 180 s, KILL 60 s later
+        test_cmd = ["timeout", "-k60", "-v", "180"] + test_cmd
 
     if fp_init:
         test_cmd += ["--fp8-init"]

@@ -43,14 +43,17 @@ def _run_test(fp_init, recipe):
     test_dir = Path(__file__).parent.resolve()
     fsdp_script = test_dir / "run_fsdp2_fp8_model.py"
 
-    test_cmd = ["torchrun", f"--nproc_per_node={NUM_PROCS}", "--master-port=29501", str(fsdp_script)]
+    test_cmd = ["timeout", "-k60", "-v", "180", "torchrun", f"--nproc_per_node={NUM_PROCS}",
+                "--master-port=29501", str(fsdp_script)]
 
     if fp_init:
         test_cmd += ["--fp8-init"]
     test_cmd += ["--recipe", recipe]
 
-    subprocess.run(test_cmd + ['--use-fsdp2','--gradients-save-file', 'all_iters_fsdp2.pt'], env=os.environ, check=True)
-    subprocess.run(test_cmd + ['--gradients-save-file', 'all_iters_dp.pt'], env=os.environ, check=True)
+    subprocess.run(test_cmd + ['--use-fsdp2','--gradients-save-file', 'all_iters_fsdp2.pt'],
+                   env=os.environ, check=True)
+    subprocess.run(test_cmd + ['--gradients-save-file', 'all_iters_dp.pt'], env=os.environ,
+                   check=True)
 
     # Load outputs
     output_fsdp = torch.load("all_iters_fsdp2.pt", map_location="cpu")