Emerge-Lab · eugenevinitsky · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -211,8 +211,8 @@ pufferlib/resources/drive/output*.gif
 # External local clones
 external/
 
-# Generated docs
-docs/
+# Generated docs (sphinx build output only; docs/*.md is tracked)
+docs/_build/
 
 # Claude config
 .claude/

diff --git a/README.md b/README.md
@@ -90,28 +90,28 @@ torchrun --standalone --nnodes=1 --nproc-per-node=6 -m pufferlib.pufferl train p
 
 ## Eval
 
+All evaluation runs through the unified `Evaluator`/`EvalManager` pipeline.
+`[eval.<name>]` sections in `drive.ini` define each evaluator; the same ones
+run inline during training and standalone here.
+
 ```bash
-# Multi-scenario eval (replay mode)
-puffer eval_multi_scenarios puffer_drive \
-  --load-model-path experiments/puffer_drive_177193887946/models/model_puffer_drive_000001.pt \
-  --num_scenarios 250 --eval_simulation replay
-
-# Multi-scenario eval (gigaflow mode)
-puffer eval_multi_scenarios puffer_drive \
-  --load-model-path experiments/puffer_drive_177193887946/models/model_puffer_drive_000001.pt \
-  --num_scenarios 10 --eval_simulation gigaflow
-
-# Multi-scenario eval with rendering
-puffer eval_multi_scenarios_render puffer_drive \
-  --load-model-path experiments/puffer_drive_177193887946/models/model_puffer_drive_000001.pt \
-  --num_scenarios 10 --eval_simulation gigaflow --render 1 --render_obs 0
-
-# Save eval as GIF
-puffer eval_multi_scenarios_render puffer_drive \
-  --load-model-path experiments/puffer_drive_177193887946/models/model_puffer_drive_000001.pt \
-  --num_scenarios 5 --eval_simulation gigaflow --save-frames 1 --gif-path eval.gif --fps 15
+# Run a named evaluator on a checkpoint (config from [eval.<name>])
+puffer eval puffer_drive --evaluator validation_gigaflow \
+  --load-model-path experiments/puffer_drive_xxxx/models/model_puffer_drive_000500.pt
+
+# Ad-hoc: pick by simulation + override scale from the CLI
+puffer eval puffer_drive --eval_simulation replay \
+  --load-model-path experiments/puffer_drive_xxxx/models/model_puffer_drive_000500.pt \
+  --num_scenarios 250 --render 1
+
+# Render the agent's observations (interactive HTML)
+puffer eval puffer_drive --eval_simulation gigaflow \
+  --load-model-path experiments/puffer_drive_xxxx/models/model_puffer_drive_000500.pt \
+  --num_scenarios 10 --render 1 --render-backend obs_html
 ```
 
+**For the full guide see [`docs/evaluation.md`](docs/evaluation.md).**
+
 ## Failure mining
 
 Roll a trained policy out against a scenario suite, capture per-episode compact replays for episodes whose `episode_return` falls below a threshold, render each one as an interactive HTML page, and produce a sortable cross-episode index. Useful for triaging what a policy fails at after a long training run.

diff --git a/docs/evaluation.md b/docs/evaluation.md
@@ -0,0 +1,178 @@
+# Evaluation — operational guide
+
+How evaluation works in PufferDrive and how to run it. All evaluation goes
+through one system: the `Evaluator` classes in
+`pufferlib/ocean/benchmark/evaluators/` orchestrated by the `EvalManager` in
+`pufferlib/ocean/benchmark/manager.py`. The same evaluators run inline during
+training and standalone from the CLI — there is no second eval path.
+
+## Concepts
+
+- **Evaluator** — one evaluation, defined by an `[eval.<name>]` section in
+  `pufferlib/config/ocean/drive.ini`. It owns an env config, a rollout, a set
+  of metrics, and optional rendering.
+- **EvalManager** — discovers every `[eval.<name>]` section, instantiates the
+  evaluators, and runs the ones whose `interval` is due (inline) or that you
+  name (standalone). One evaluator failing doesn't stop the rest.
+- **Evaluator types** (the `type` field): `multi_scenario` (sweep a scenario
+  set in one batched rollout), `behavior_class` (a labelled nuPlan scene
+  bucket), `human_replay`, `wosac`.
+
+## Config schema
+
+```ini
+[eval.<name>]
+type          = "multi_scenario"   ; registered evaluator class (omit for a template)
+enabled       = true               ; skip when false
+interval      = 250                ; run every N epochs inline (0 disables inline)
+mode          = "inline"           ; "inline" (block training) | "subprocess"
+inherits      = "<other_section>"  ; optional: pull defaults from another section
+clean         = true               ; zero perturbations/dropout + enforce red lights
+render        = true               ; capture renders during the rollout
+render_views  = ["sim_state", "bev"] ; camera views for the egl backend
+render_backend = "egl"             ; egl | triage_html | obs_html (see Render backends)
+env.<key>     = <value>            ; any [env] override (dotted)
+eval.<key>    = <value>            ; evaluator-specific knob (see below)
+vec.<key>     = <value>            ; any [vec] override
+```
+
+A section **without** a `type` is a *template*: it is never instantiated, only
+pulled in via `inherits`. `validation_defaults` and `behaviors_defaults` are
+templates.
+
+`eval.*` knobs read by `multi_scenario`:
+
+| Key | Meaning |
+|---|---|
+| `eval.num_scenarios` | how many episodes to evaluate (loop target) |
+| `eval.export_episode_csv` | write one CSV row per finished episode |
+| `eval.verify_coverage` | report expected-vs-evaluated counts + duplicate maps |
+| `eval.render_num_scenarios` | how many scenarios to render (caps render cost) |
+| `eval.render_max_steps` | steps per rendered clip |
+
+The `clean` macro zeros `lane_segment_dropout`, `boundary_segment_dropout`,
+`partner_blindness_prob`, `phantom_braking_prob`,
+`phantom_braking_trigger_prob` and sets `traffic_light_behavior=1`. A value set
+explicitly in the section wins over the macro (e.g.
+`env.traffic_light_behavior = 0` keeps red lights ignored even with `clean = true`).
+
+## Running evaluation
+
+### Inline during training
+
+Any `enabled` evaluator with `interval > 0` runs automatically every `interval`
+epochs (and once at shutdown). Nothing extra to do — the metrics land in
+wandb/TensorBoard under `<name>/<metric>` and renders under `<name>/render`.
+
+### Standalone, by name
+
+```bash
+puffer eval puffer_drive --evaluator validation_gigaflow \
+    --load-model-path experiments/puffer_drive_xxxx/models/model_puffer_drive_000500.pt
+```
+
+Runs that one evaluator with its `[eval.validation_gigaflow]` config. The
+checkpoint's network architecture is read from the sibling `config.yaml` (next
+to `models/`), so a checkpoint loads even if its policy/rnn dims differ from
+`drive.ini`. With no `--load-model-path`, a fresh (random) policy is used —
+useful for smoke-testing the eval path itself.
+
+### Standalone, ad-hoc
+
+Same as by-name, except instead of naming an evaluator you select one of the two
+built-in `validation_*` evaluators by simulation and override its config from the
+CLI — no `drive.ini` edit needed:
+
+- `--eval_simulation gigaflow` → runs the `validation_gigaflow` section
+- `--eval_simulation replay` → runs the `validation_replay` section
+
+The flags below override that evaluator's config for this run, and each applies
+**only when passed** — omit one and the evaluator's own `[eval.*]` value stands:
+
+```bash
+puffer eval puffer_drive --eval_simulation gigaflow \
+    --load-model-path <ckpt> \
+    --num_scenarios 50 --render 1 --render-backend obs_html --num_maps 4
+```
+
+| Flag | Effect |
+|---|---|
+| `--eval_simulation gigaflow\|replay` | selects `validation_<sim>` when `--evaluator` is absent |
+| `--num_scenarios N` | override the evaluator's `eval.num_scenarios` |
+| `--render 0\|1` | toggle rendering on/off |
+| `--render-backend egl\|triage_html\|obs_html` | choose the renderer (see Render backends) |
+| `--num_maps N` | override `env.num_maps` (CARLA maps for gigaflow, bin count for replay) |
+
+Any other section value can be overridden with the generic dotted form, e.g.
+`--eval.validation_replay.env.scenario-length 91`.
+
+### Subprocess mode
+
+`mode = "subprocess"` runs the evaluator in a fresh
+`python -m pufferlib.pufferl eval … --out <json>` process that loads the latest
+checkpoint from disk; the parent reads metrics back from the JSON. Use it to
+isolate a heavy/leaky eval from the training process.
+
+## Outputs
+
+- **Aggregate metrics** — a weighted per-agent mean of the env's `vec_log`
+  emissions, logged to wandb/TensorBoard. Always produced.
+- **Per-episode CSV** (`eval.export_episode_csv = true`) — one row per finished
+  episode in `episode_metrics/<name>_epoch{E}_step{N}.csv`, including
+  `map_name`/`scenario_id` and the per-episode metrics. Drains the env's
+  `completed_episode` summaries, which the manager enables automatically for
+  evaluators that opt in.
+- **Coverage** (`eval.verify_coverage = true`) — folds `coverage_expected`,
+  `coverage_found`, `coverage_unique_maps`, `coverage_complete` into the
+  metrics and logs any maps evaluated more than once. For a unique-scenario
+  sweep (replay) duplicates flag a problem; for cycling maps (gigaflow) they
+  are expected.
+- **Renders** — selected by `render_backend` (see below).
+  `eval.render_num_scenarios` caps how many scenarios are rendered, so render
+  cost stays bounded regardless of `eval.num_scenarios`.
+
+### Render backends
+
+`render_backend` picks one renderer (it is not a stack — exactly one runs):
+
+| `render_backend` | Output | Shows | Built from | Use it to |
+|---|---|---|---|---|
+| `egl` (default) | mp4 per (scenario, view) | top-down sim camera | GPU EGL → ffmpeg | get a shareable video clip |
+| `triage_html` | one HTML per episode → `gif/<name>/` | scene playback **+ per-episode metrics** | the captured compact-replay bundle (no re-sim) | triage *which* episodes failed |
+| `obs_html` | one HTML per scenario (+ gallery `index.html`) → `obs/<name>/` | interactive scene **+ each agent's NN observation** | a CPU re-roll capturing state + obs | inspect *what the policy sees* |
+
+Both HTML backends are CPU-only (no EGL/ffmpeg). `triage_html` is lighter (it
+reuses data already captured during the rollout); `obs_html` re-simulates to
+record the observation, so it's heavier but shows the policy's actual inputs.
+
+## The built-in evaluators
+
+| Section | Type | What it runs |
+|---|---|---|
+| `validation_replay` | multi_scenario | replay sweep over the nuPlan bins, `control_sdc_only` |
+| `validation_gigaflow` | multi_scenario | gigaflow sweep over the CARLA maps |
+| `behaviors_*` | behavior_class | one labelled nuPlan scene bucket each (hard_stop, merge, …) |
+| `wosac` | wosac | Waymo Open Sim Agents Challenge (WOSAC) metrics |
+
+`validation_replay` and `validation_gigaflow` inherit shared eval reward
+weights and clean-eval knobs from the `validation_defaults` template; the
+`behaviors_*` sections inherit from `behaviors_defaults`.
+
+## Adding an evaluator
+
+1. Subclass `Evaluator` (or an existing type) in
+   `pufferlib/ocean/benchmark/evaluators/`, set `type_name`, and register it in
+   `evaluators/__init__.py`. Most subclasses only override `env_overrides`,
+   `_should_stop`, and optionally `_render_env_overrides`; the base `rollout`
+   handles the step loop, metric aggregation, CSV, and coverage.
+2. Add an `[eval.<name>]` section with `type = "<your_type_name>"`.
+
+## Scripts
+
+`scripts/eval/` drives the unified pipeline over many checkpoints:
+
+- `run_all_eval.sh` — eval the latest checkpoint in every `experiments/*/`.
+- `run_all_latest_eval.py` — eval the latest checkpoint in every `runs/*/`, with rendering.
+- `run_failure_scenarios.py` — re-eval from a failure CSV.
+
+All call `puffer eval puffer_drive` with the flags above.
diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
@@ -224,62 +224,77 @@ num_episodes = 100
 score_threshold = -inf
 render = true
 
-[eval]
-; Set to True to enable periodic multi-scenario evaluation during training
-multi_scenario_eval = False
-; Frequency of evaluation during training (in epochs)
-eval_interval = 25
-num_agents = 512
-; Batch size for eval_multi_scenarios (number of scenarios per batch)
-; Path to dataset used for evaluation
-map_dir = "pufferlib/resources/drive/binaries/eval"
-; Simulation mode for evaluation: "gigaflow" or "replay"
-multi_scenario_simulation_mode = "replay"
-; Total number of scenarios to evaluate
-multi_scenario_num_scenarios = 250
-backend = PufferEnv
-
+; ---------------------------------------------------------------------------
+; Multi-scenario validation eval. validation_defaults holds the shared
+; clean-eval env + fixed eval reward weights; the replay / gigaflow sections
+; add their sim-specific map set and agent layout. Each sweeps 250 scenarios,
+; ignores traffic lights, and disables reward randomization so metrics are
+; comparable across checkpoints. Both write a per-episode CSV
+; (episode_metrics/) and fold a coverage report into their metrics.
+; ---------------------------------------------------------------------------
 
-[eval.validation_replay]
-type = "multi_scenario"
+[eval.validation_defaults]
 enabled = true
 interval = 250
 mode = "inline"
 clean = true
+env.eval_mode = 1
+env.collision_behavior = 1
+env.offroad_behavior = 1
+; Explicit 0 wins over the clean macro's red-light enforcement.
+env.traffic_light_behavior = 0
+env.reward_randomization = False
+env.termination_mode = 0
+env.num_agents = 512
+env.target_type = "static"
+env.goal_speed = 3.0
+env.reward_vehicle_collision = 3.0
+env.reward_offroad_collision = 3.0
+env.reward_stop_line = 1.0
+env.reward_goal = 1.0
+env.reward_overspeed = 0.05
+env.reward_comfort = 0.05
+env.reward_velocity = 0.0025
+env.reward_lane_align = 0.025
+env.reward_lane_center = 0.0038
+env.reward_timestep = 0.000025
+env.reward_reverse = 0.005
+env.reward_ade = 0.0
+env.lane_segment_dropout = 0.0
+env.boundary_segment_dropout = 0.0
+env.max_lane_segment_observations = 80
+env.max_boundary_segment_observations = 80
+eval.num_scenarios = 250
+eval.export_episode_csv = true
+eval.verify_coverage = true
+
+[eval.validation_replay]
+inherits = "validation_defaults"
+type = "multi_scenario"
 render = true
-render_backend = "html"
+render_backend = "triage_html"
 env.simulation_mode = "replay"
 env.control_mode = "control_sdc_only"
-env.map_dir = "/home/ricky/nuBenchmark/.data/bins/deduped_set/training"
-env.num_maps = 10
-env.num_agents = 10
-env.min_agents_per_env = 1
-env.max_agents_per_env = 1
-env.scenario_length = 91
-env.resample_frequency = 91
-eval.num_scenarios = 10
+env.map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins"
+env.num_maps = 250
+env.max_agents_per_env = 64
+env.scenario_length = 200
+env.resample_frequency = 200
 eval.render_num_scenarios = 5
-eval.render_max_steps = 91
+eval.render_max_steps = 200
 
 [eval.validation_gigaflow]
+inherits = "validation_defaults"
 type = "multi_scenario"
-enabled = true
-interval = 250
-mode = "inline"
-clean = true
 render = true
 render_views = ["sim_state", "bev"]
 env.simulation_mode = "gigaflow"
-env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
+env.map_dir = "pufferlib/resources/drive/binaries/carla"
 env.num_maps = 8
-env.num_agents = 400
-env.min_agents_per_env = 50
-env.max_agents_per_env = 50
-env.scenario_length = 3000
-env.resample_frequency = 3000
-; One rollout per carla map. Render every map so the wandb panel covers
-; all 8 (rather than a 5-sample subset).
-eval.num_scenarios = 8
+env.min_agents_per_env = 40
+env.max_agents_per_env = 40
+env.scenario_length = 500
+env.resample_frequency = 500
 eval.render_num_scenarios = 8
 eval.render_max_steps = 300