Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,48 @@ Strongly encouraged (not mandatory):
`pytest.main(...)`, so the file can be executed directly.
- New algorithms: also tested in the sota-implementations CI.

### 7a. The `gpu` marker (load-bearing!)

The unified Linux CI (`.github/workflows/test-linux.yml`) collects tests with
**two mutually-exclusive marker filters**:

- `tests-cpu*` jobs run with `-m 'not gpu'`.
- `tests-gpu*` and `tests-stable-gpu*` jobs run with `-m gpu`.

Any test gated by `@pytest.mark.skipif(not torch.cuda.is_available(), ...)`,
`@pytest.mark.skipif(not torch.cuda.device_count(), ...)`, the project's
`_has_triton` / `_has_cuda` flags, or any other CUDA-only requirement
**must** also carry `@pytest.mark.gpu`. Otherwise:

- On CPU runners the `skipif` skips it.
- On GPU runners the `-m gpu` filter deselects it before collection.
- Net result: the test never runs in CI and silently rots — the exact
failure mode that let the triton RNN recurrent-matmul bug ship.

The convention is:

```python
@pytest.mark.gpu
@pytest.mark.skipif(not torch.cuda.is_available(), reason="needs CUDA")
def test_something_cuda_specific():
...
```

Apply the marker at whatever scope is appropriate (function, class, or
module via `pytestmark = pytest.mark.gpu`). Order doesn't matter; both
decorators are required.

**Exceptions**: dedicated GPU-runner workflows that pin a specific test
file or `-k` filter (e.g. `unittests-torch_geometric`,
`unittests-isaaclab`) do not use the `-m gpu` filter and therefore do not
strictly need the marker. Adding it anyway is harmless and recommended
for consistency.

**Do not add `@pytest.mark.gpu`** to tests that meaningfully exercise both
CPU and GPU paths via parametrization (e.g.
`device = "cpu" if torch.cuda.device_count() == 0 else "cuda"`); those
must continue to run on the CPU side.

## 8. Documentation

- Every new public class / function referenced in `docs/source/reference/*.rst`.
Expand Down
35 changes: 34 additions & 1 deletion examples/collectors/isaaclab_rnn_ppo_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,26 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--compile-update", action=argparse.BooleanOptionalAction, default=False
)
parser.add_argument(
"--compile-gae",
action=argparse.BooleanOptionalAction,
default=False,
help="Wrap the GAE module with torch.compile.",
)
parser.add_argument(
"--env-compile-warmup",
type=int,
default=0,
help=(
"Compile the worker env's step_and_maybe_reset via the "
"compile={'warmup': N} env constructor kwarg. The first N calls "
"run eagerly to populate Transform.parent / done_keys / obs_keys "
"caches before tracing, which avoids dynamo wrapping a "
"half-initialised TransformedEnv built by Compose._rebuild_up_to "
"during the compiled trace. 0 disables env compile entirely. Use "
">=1 to enable; 2 is a safe default."
),
)
parser.add_argument(
"--compile-mode",
choices=["default", "reduce-overhead", "max-autotune"],
Expand Down Expand Up @@ -310,7 +330,12 @@ def main() -> None:
logging.basicConfig(level=getattr(logging, args.log_level))
if args.num_envs % args.num_collectors:
raise ValueError("--num-envs must be divisible by --num-collectors.")
if args.compile_update or args.cudagraph_update:
if (
args.compile_update
or args.compile_gae
or args.env_compile_warmup > 0
or args.cudagraph_update
):
torch._dynamo.config.capture_scalar_outputs = True
gae_shifted = args.gae_shifted

Expand Down Expand Up @@ -354,6 +379,8 @@ def main() -> None:
deactivate_vmap=args.deactivate_vmap,
device=train_device,
)
if args.compile_gae:
adv_module = torch.compile(adv_module, mode=args.compile_mode)
optim = group_optimizers(
torch.optim.Adam(
actor.parameters(), lr=args.lr, eps=1e-5, capturable=args.cudagraph_update
Expand Down Expand Up @@ -387,12 +414,18 @@ def update(batch):
)

# ---- Collector (Isaac lives in workers; main process never imports it) ----
compile_env: bool | dict
if args.env_compile_warmup > 0:
compile_env = {"warmup": args.env_compile_warmup}
else:
compile_env = False
make_env_fn = partial(
make_env,
task=args.task,
num_envs=per_collector_envs,
max_episode_steps=args.max_episode_steps,
device=str(collector_device),
compile_env=compile_env,
)
# The worker's actor uses the same backend; during collection the LSTM
# runs with set_recurrent_mode=False and auto-dispatches to cuDNN.
Expand Down
21 changes: 19 additions & 2 deletions examples/collectors/isaaclab_rnn_ppo_memory_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,21 @@ def _init_isaac_app(device: str | None = None) -> None:
AppLauncher(args_cli)


def make_env(task: str, num_envs: int, max_episode_steps: int, device: str):
"""Build an Isaac Lab env. Imports ``isaaclab`` lazily (worker-only)."""
def make_env(
task: str,
num_envs: int,
max_episode_steps: int,
device: str,
*,
compile_env: bool | dict | None = False,
):
"""Build an Isaac Lab env. Imports ``isaaclab`` lazily (worker-only).

The ``compile_env`` argument forwards to the ``compile=...`` constructor
kwarg on :class:`~torchrl.envs.TransformedEnv`, which compiles the env's
``step_and_maybe_reset`` path with :func:`torch.compile`. Pass ``True``
for default options or a ``dict`` of :func:`torch.compile` kwargs.
"""
import gymnasium as gym
import isaaclab_tasks # noqa: F401
from isaaclab_tasks.manager_based.classic.ant.ant_env_cfg import AntEnvCfg
Expand All @@ -70,9 +83,13 @@ def make_env(task: str, num_envs: int, max_episode_steps: int, device: str):
env = gym.make(task, cfg=cfg)
else:
env = gym.make(task)
transformed_env_kwargs = {}
if compile_env:
transformed_env_kwargs["compile"] = compile_env
return TransformedEnv(
IsaacLabWrapper(env, device=torch.device(device)),
RewardSum(),
**transformed_env_kwargs,
)


Expand Down
1 change: 1 addition & 0 deletions test/collectors/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def test_shutdown_with_inflight_eval(self):
assert elapsed < 10.0, f"Shutdown took {elapsed:.1f}s, expected < 10s"


@pytest.mark.gpu
@pytest.mark.skipif(
not torch.cuda.is_available() or torch.cuda.device_count() < 2,
reason="Requires 2+ CUDA devices",
Expand Down
16 changes: 16 additions & 0 deletions test/envs/test_auto_reset.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,22 @@ def test_env_constructor_compile_kwarg(self):
with pytest.raises(TypeError, match="compile must be"):
CountingEnv(3, compile="please")

def test_transformed_env_getattr_no_recursion_on_module_internals(self):
"""TransformedEnv.__getattr__ must short-circuit private/dunder names.

``nn.Module.__dir__`` reads ``_parameters``, ``_buffers``, ``_modules``
via attribute access. Routing those through the
``"base_env" in self.__dir__()`` branch re-enters ``__getattr__`` and
infinite-recurses, which previously crashed
``torch.compile(step_and_maybe_reset)`` while tracing
``TransformedEnv``.
"""
env = TransformedEnv(CountingEnv(3), StepCounter())
for name in ("_parameters", "_buffers", "_modules", "__weakref__"):
with pytest.raises(AttributeError):
env.__getattr__(name)
assert "base_env" in dir(env)

def test_native_auto_reset_wrapped_vecnorm_step_and_maybe_reset(self):
class CountingVecNormV2(VecNormV2):
def __init__(self, *args, **kwargs):
Expand Down
1 change: 1 addition & 0 deletions test/envs/test_parallel.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ def make_env():
4, make_env, create_env_kwargs=[{"seed": 0}, {"seed": 1}]
)

@pytest.mark.gpu
@pytest.mark.skipif(
not torch.cuda.device_count(), reason="No cuda device detected."
)
Expand Down
Loading
Loading