pytorch · vmoens · May 22, 2026 · May 24, 2026 · May 24, 2026 · May 24, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -60,6 +60,48 @@ Strongly encouraged (not mandatory):
   `pytest.main(...)`, so the file can be executed directly.
 - New algorithms: also tested in the sota-implementations CI.
 
+### 7a. The `gpu` marker (load-bearing!)
+
+The unified Linux CI (`.github/workflows/test-linux.yml`) collects tests with
+**two mutually-exclusive marker filters**:
+
+- `tests-cpu*` jobs run with `-m 'not gpu'`.
+- `tests-gpu*` and `tests-stable-gpu*` jobs run with `-m gpu`.
+
+Any test gated by `@pytest.mark.skipif(not torch.cuda.is_available(), ...)`,
+`@pytest.mark.skipif(not torch.cuda.device_count(), ...)`, the project's
+`_has_triton` / `_has_cuda` flags, or any other CUDA-only requirement
+**must** also carry `@pytest.mark.gpu`. Otherwise:
+
+- On CPU runners the `skipif` skips it.
+- On GPU runners the `-m gpu` filter deselects it before collection.
+- Net result: the test never runs in CI and silently rots — the exact
+  failure mode that let the triton RNN recurrent-matmul bug ship.
+
+The convention is:
+
+```python
+@pytest.mark.gpu
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="needs CUDA")
+def test_something_cuda_specific():
+    ...
+```
+
+Apply the marker at whatever scope is appropriate (function, class, or
+module via `pytestmark = pytest.mark.gpu`). Order doesn't matter; both
+decorators are required.
+
+**Exceptions**: dedicated GPU-runner workflows that pin a specific test
+file or `-k` filter (e.g. `unittests-torch_geometric`,
+`unittests-isaaclab`) do not use the `-m gpu` filter and therefore do not
+strictly need the marker. Adding it anyway is harmless and recommended
+for consistency.
+
+**Do not add `@pytest.mark.gpu`** to tests that meaningfully exercise both
+CPU and GPU paths via parametrization (e.g.
+`device = "cpu" if torch.cuda.device_count() == 0 else "cuda"`); those
+must continue to run on the CPU side.
+
 ## 8. Documentation
 
 - Every new public class / function referenced in `docs/source/reference/*.rst`.

diff --git a/examples/collectors/isaaclab_rnn_ppo_memory.py b/examples/collectors/isaaclab_rnn_ppo_memory.py
@@ -277,6 +277,26 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--compile-update", action=argparse.BooleanOptionalAction, default=False
     )
+    parser.add_argument(
+        "--compile-gae",
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help="Wrap the GAE module with torch.compile.",
+    )
+    parser.add_argument(
+        "--env-compile-warmup",
+        type=int,
+        default=0,
+        help=(
+            "Compile the worker env's step_and_maybe_reset via the "
+            "compile={'warmup': N} env constructor kwarg. The first N calls "
+            "run eagerly to populate Transform.parent / done_keys / obs_keys "
+            "caches before tracing, which avoids dynamo wrapping a "
+            "half-initialised TransformedEnv built by Compose._rebuild_up_to "
+            "during the compiled trace. 0 disables env compile entirely. Use "
+            ">=1 to enable; 2 is a safe default."
+        ),
+    )
     parser.add_argument(
         "--compile-mode",
         choices=["default", "reduce-overhead", "max-autotune"],
@@ -310,7 +330,12 @@ def main() -> None:
     logging.basicConfig(level=getattr(logging, args.log_level))
     if args.num_envs % args.num_collectors:
         raise ValueError("--num-envs must be divisible by --num-collectors.")
-    if args.compile_update or args.cudagraph_update:
+    if (
+        args.compile_update
+        or args.compile_gae
+        or args.env_compile_warmup > 0
+        or args.cudagraph_update
+    ):
         torch._dynamo.config.capture_scalar_outputs = True
     gae_shifted = args.gae_shifted
 
@@ -354,6 +379,8 @@ def main() -> None:
         deactivate_vmap=args.deactivate_vmap,
         device=train_device,
     )
+    if args.compile_gae:
+        adv_module = torch.compile(adv_module, mode=args.compile_mode)
     optim = group_optimizers(
         torch.optim.Adam(
             actor.parameters(), lr=args.lr, eps=1e-5, capturable=args.cudagraph_update
@@ -387,12 +414,18 @@ def update(batch):
         )
 
     # ---- Collector (Isaac lives in workers; main process never imports it) ----
+    compile_env: bool | dict
+    if args.env_compile_warmup > 0:
+        compile_env = {"warmup": args.env_compile_warmup}
+    else:
+        compile_env = False
     make_env_fn = partial(
         make_env,
         task=args.task,
         num_envs=per_collector_envs,
         max_episode_steps=args.max_episode_steps,
         device=str(collector_device),
+        compile_env=compile_env,
     )
     # The worker's actor uses the same backend; during collection the LSTM
     # runs with set_recurrent_mode=False and auto-dispatches to cuDNN.

diff --git a/examples/collectors/isaaclab_rnn_ppo_memory_utils.py b/examples/collectors/isaaclab_rnn_ppo_memory_utils.py
@@ -46,8 +46,21 @@ def _init_isaac_app(device: str | None = None) -> None:
     AppLauncher(args_cli)
 
 
-def make_env(task: str, num_envs: int, max_episode_steps: int, device: str):
-    """Build an Isaac Lab env. Imports ``isaaclab`` lazily (worker-only)."""
+def make_env(
+    task: str,
+    num_envs: int,
+    max_episode_steps: int,
+    device: str,
+    *,
+    compile_env: bool | dict | None = False,
+):
+    """Build an Isaac Lab env. Imports ``isaaclab`` lazily (worker-only).
+
+    The ``compile_env`` argument forwards to the ``compile=...`` constructor
+    kwarg on :class:`~torchrl.envs.TransformedEnv`, which compiles the env's
+    ``step_and_maybe_reset`` path with :func:`torch.compile`. Pass ``True``
+    for default options or a ``dict`` of :func:`torch.compile` kwargs.
+    """
     import gymnasium as gym
     import isaaclab_tasks  # noqa: F401
     from isaaclab_tasks.manager_based.classic.ant.ant_env_cfg import AntEnvCfg
@@ -70,9 +83,13 @@ def make_env(task: str, num_envs: int, max_episode_steps: int, device: str):
         env = gym.make(task, cfg=cfg)
     else:
         env = gym.make(task)
+    transformed_env_kwargs = {}
+    if compile_env:
+        transformed_env_kwargs["compile"] = compile_env
     return TransformedEnv(
         IsaacLabWrapper(env, device=torch.device(device)),
         RewardSum(),
+        **transformed_env_kwargs,
     )
 
 

diff --git a/test/collectors/test_evaluator.py b/test/collectors/test_evaluator.py
@@ -320,6 +320,7 @@ def test_shutdown_with_inflight_eval(self):
         assert elapsed < 10.0, f"Shutdown took {elapsed:.1f}s, expected < 10s"
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(
     not torch.cuda.is_available() or torch.cuda.device_count() < 2,
     reason="Requires 2+ CUDA devices",

diff --git a/test/envs/test_auto_reset.py b/test/envs/test_auto_reset.py
@@ -263,6 +263,22 @@ def test_env_constructor_compile_kwarg(self):
         with pytest.raises(TypeError, match="compile must be"):
             CountingEnv(3, compile="please")
 
+    def test_transformed_env_getattr_no_recursion_on_module_internals(self):
+        """TransformedEnv.__getattr__ must short-circuit private/dunder names.
+
+        ``nn.Module.__dir__`` reads ``_parameters``, ``_buffers``, ``_modules``
+        via attribute access. Routing those through the
+        ``"base_env" in self.__dir__()`` branch re-enters ``__getattr__`` and
+        infinite-recurses, which previously crashed
+        ``torch.compile(step_and_maybe_reset)`` while tracing
+        ``TransformedEnv``.
+        """
+        env = TransformedEnv(CountingEnv(3), StepCounter())
+        for name in ("_parameters", "_buffers", "_modules", "__weakref__"):
+            with pytest.raises(AttributeError):
+                env.__getattr__(name)
+        assert "base_env" in dir(env)
+
     def test_native_auto_reset_wrapped_vecnorm_step_and_maybe_reset(self):
         class CountingVecNormV2(VecNormV2):
             def __init__(self, *args, **kwargs):

diff --git a/test/envs/test_parallel.py b/test/envs/test_parallel.py
@@ -93,6 +93,7 @@ def make_env():
                 4, make_env, create_env_kwargs=[{"seed": 0}, {"seed": 1}]
             )
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(
         not torch.cuda.device_count(), reason="No cuda device detected."
     )