jramos · jramos · May 18, 2026 · May 17, 2026 · May 17, 2026 · May 18, 2026
diff --git a/docs/workflows.md b/docs/workflows.md
@@ -488,7 +488,19 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256
 
 Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable.
 
-Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`.
+Reference suites:
+- `evolution/validation/suites/systematic_debugging.jsonl` — 5 textbook bugs; good for verifying the wiring works.
+- `evolution/validation/suites/systematic_debugging_advanced.jsonl` — 5 harder bugs (generator exhaustion, shared mutable return, float-precision equality, leftmost-insert boundary, class-vs-instance attribute) designed to discriminate skill-text variants on capable agent models that saturate the basic suite at 5/5.
+
+When your daily-driver Hermes model is capable enough to solve every textbook bug regardless of skill text, the planted-bug verdict adds no signal. Three knobs to recover discrimination:
+
+- `--closed-loop-during-evolution .../systematic_debugging_advanced.jsonl` — use the harder bugs (different cognitive failure modes).
+- `--closed-loop-agent-model MODEL` — run the validator's agent against a different model than your `~/.hermes/config.yaml` default. Hermes sends `include: ['reasoning.encrypted_content']` so the model must be a reasoning model (o1-family, o3-family, o4-mini, gpt-5.x-family); non-reasoning models reject the request.
+- `--closed-loop-task-timeout-seconds N` — bump the per-task wall-clock budget. The default is 120s; most reasoning models other than the smallest take 200–300s per debugging task and would otherwise abstain (timeout) without recording a verdict.
+
+**Empirical caveat from validation.** Both suites saturate at 5/5 against capable reasoning models (`gpt-5.4-mini` saturated both; `o3-mini` was slow enough to abstain on most tasks at the default timeout). For a setup where the user's default model handles textbook Python debugging easily, the closed-loop signal on this domain may be uninformative regardless of skill text — the agent's raw capability dominates. Real headroom likely needs evaluation surfaces where methodology matters more than recognition: multi-file refactoring, ambiguous specs with edge cases the agent must enumerate, and tasks requiring iterative hypothesis-testing across multiple test runs.
+
+Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py` (supports `--suite {basic,advanced}`, `--agent-model MODEL`, `--task-timeout-seconds N`).
 
 ## Failure-mode summary
 

diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
@@ -471,6 +471,8 @@ def _maybe_build_closed_loop_cache_skill(
     min_iters: int,
     window_size: int,
     gate_mode: str = "sampled",
+    agent_model: Optional[str] = None,
+    agent_timeout_seconds: Optional[int] = None,
 ):
     """Build a ClosedLoopFeedbackCache for the skill path; return None when disabled.
 
@@ -506,7 +508,10 @@ def _maybe_build_closed_loop_cache_skill(
         skill_name=skill_name,
         workdir=workdir,
     )
-    runner = HermesAgentRunner()
+    runner_kwargs: dict = {"model": agent_model}
+    if agent_timeout_seconds is not None:
+        runner_kwargs["timeout_seconds"] = agent_timeout_seconds
+    runner = HermesAgentRunner(**runner_kwargs)
     validator = ClosedLoopValidator(installer=installer, runner=runner)
     suite = TaskSuite.from_jsonl(suite_path)
     return ClosedLoopFeedbackCache(
@@ -599,6 +604,8 @@ def evolve(
     closed_loop_window_size: int = 8,
     closed_loop_mode: str = "feedback",
     closed_loop_in_valset: bool = False,
+    closed_loop_agent_model: Optional[str] = None,
+    closed_loop_task_timeout_seconds: Optional[int] = None,
 ):
     """Main evolution function — orchestrates the full optimization loop."""
 
@@ -835,6 +842,8 @@ def evolve(
                 min_iters=closed_loop_min_iters,
                 window_size=closed_loop_window_size,
                 gate_mode=_cache_gate_mode,
+                agent_model=closed_loop_agent_model,
+                agent_timeout_seconds=closed_loop_task_timeout_seconds,
             )
 
             # Build the metric once: DSPy's LM cache lines up across GEPA's
@@ -1547,6 +1556,31 @@ def evolve(
          "scoring). Costs more — each accepted candidate triggers another full "
          "eval pass over the behavioral examples. Default off.",
 )
+@click.option(
+    "--closed-loop-agent-model",
+    "closed_loop_agent_model",
+    default=None,
+    type=str,
+    help="Override the agent model the closed-loop validator runs `hermes -z` "
+         "with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
+         "uses whatever's in your ~/.hermes/config.yaml. Useful when your "
+         "daily-driver Hermes model is so capable it saturates the planted-bug "
+         "suite at 100%, hiding the behavioral signal closed-loop is supposed "
+         "to surface — run validation against a weaker model without touching "
+         "your config.",
+)
+@click.option(
+    "--closed-loop-task-timeout-seconds",
+    "closed_loop_task_timeout_seconds",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
+         "subprocess (default 120s). Bump when --closed-loop-agent-model selects "
+         "a slow reasoning model that doesn't finish within the default — most "
+         "OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
+         "debugging task. Hitting the timeout abstains the task verdict rather "
+         "than failing it, so over-tight values silently produce no-signal runs.",
+)
 def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model,
          eval_model, skill_source_dir, dry_run, seed, budget, no_fallback,
          quality_gate, growth_free_threshold,
@@ -1563,7 +1597,9 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
          closed_loop_min_iters,
          closed_loop_window_size,
          closed_loop_mode,
-         closed_loop_in_valset):
+         closed_loop_in_valset,
+         closed_loop_agent_model,
+         closed_loop_task_timeout_seconds):
     """Evolve an agent skill using DSPy + GEPA optimization."""
     try:
         evolve(
@@ -1607,6 +1643,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
             closed_loop_window_size=closed_loop_window_size,
             closed_loop_mode=closed_loop_mode,
             closed_loop_in_valset=closed_loop_in_valset,
+            closed_loop_agent_model=closed_loop_agent_model,
+            closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
         )
     except HermesProviderError as exc:
         # Render a clean error panel instead of dumping a Python traceback

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
@@ -260,6 +260,8 @@ def _maybe_build_closed_loop_cache(
     min_iters: int,
     window_size: int,
     gate_mode: str = "sampled",
+    agent_model: Optional[str] = None,
+    agent_timeout_seconds: Optional[int] = None,
 ):
     """Build a ClosedLoopFeedbackCache when the user opted in, else None.
 
@@ -286,7 +288,10 @@ def _maybe_build_closed_loop_cache(
     installer = HermesToolDescriptionInstaller(
         hermes_repo=hermes_repo, tool_name=tool_name
     )
-    runner = HermesAgentRunner()
+    runner_kwargs: dict = {"model": agent_model}
+    if agent_timeout_seconds is not None:
+        runner_kwargs["timeout_seconds"] = agent_timeout_seconds
+    runner = HermesAgentRunner(**runner_kwargs)
     validator = ClosedLoopValidator(installer=installer, runner=runner)
     suite = TaskSuite.from_jsonl(suite_path)
     return ClosedLoopFeedbackCache(
@@ -359,6 +364,8 @@ def evolve(
     closed_loop_window_size: int = 8,
     closed_loop_mode: str = "feedback",
     closed_loop_in_valset: bool = False,
+    closed_loop_agent_model: Optional[str] = None,
+    closed_loop_task_timeout_seconds: Optional[int] = None,
     skip_preflight: bool = False,
     skip_cost_suggest: bool = False,
 ) -> dict[str, Any]:
@@ -592,6 +599,8 @@ def evolve(
                 min_iters=closed_loop_min_iters,
                 window_size=closed_loop_window_size,
                 gate_mode=cache_gate_mode,
+                agent_model=closed_loop_agent_model,
+                agent_timeout_seconds=closed_loop_task_timeout_seconds,
             )
             metric = make_tool_fitness_metric(
                 judge=judge,
@@ -1187,6 +1196,30 @@ def evolve(
          "scoring). Costs more — each accepted candidate triggers another full "
          "eval pass over the behavioral examples. Default off.",
 )
+@click.option(
+    "--closed-loop-agent-model",
+    "closed_loop_agent_model",
+    default=None,
+    type=str,
+    help="Override the agent model the closed-loop validator runs `hermes -z` "
+         "with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
+         "uses whatever's in your ~/.hermes/config.yaml. Useful when your "
+         "daily-driver Hermes model saturates the planted-bug suite at 100%, "
+         "hiding the behavioral signal — run validation against a weaker model "
+         "without touching your config.",
+)
+@click.option(
+    "--closed-loop-task-timeout-seconds",
+    "closed_loop_task_timeout_seconds",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
+         "subprocess (default 120s). Bump when --closed-loop-agent-model selects "
+         "a slow reasoning model that doesn't finish within the default — most "
+         "OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
+         "debugging task. Hitting the timeout abstains the task verdict rather "
+         "than failing it, so over-tight values silently produce no-signal runs.",
+)
 def main(
     tool_name: str,
     manifest_path: Path,
@@ -1212,6 +1245,8 @@ def main(
     closed_loop_window_size: int,
     closed_loop_mode: str,
     closed_loop_in_valset: bool,
+    closed_loop_agent_model: Optional[str],
+    closed_loop_task_timeout_seconds: Optional[int],
 ) -> None:
     """Evolve one tool description in an MCP manifest using DSPy + GEPA."""
     if apply_flag and patch_flag:
@@ -1249,6 +1284,8 @@ def main(
             closed_loop_window_size=closed_loop_window_size,
             closed_loop_mode=closed_loop_mode,
             closed_loop_in_valset=closed_loop_in_valset,
+            closed_loop_agent_model=closed_loop_agent_model,
+            closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
             skip_preflight=skip_preflight,
             skip_cost_suggest=skip_cost_suggest,
         )

diff --git a/evolution/validation/hermes_runner.py b/evolution/validation/hermes_runner.py
@@ -46,6 +46,7 @@ def __init__(
         hermes_command: str = "hermes",
         timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
         user_config_path: Optional[Path] = None,
+        model: Optional[str] = None,
     ) -> None:
         self.hermes_command = hermes_command
         self.timeout_seconds = timeout_seconds
@@ -56,6 +57,13 @@ def __init__(
             if user_config_path is not None
             else Path.home() / ".hermes" / "config.yaml"
         )
+        # Optional per-invocation model override (passed as ``hermes -m
+        # <model> -z ...``). When unset, Hermes uses whatever is configured in
+        # the sandboxed ``config.yaml``. Useful for closed-loop validation
+        # against a deliberately weaker agent model than the user's
+        # daily-driver default — saturation on capable models hides
+        # behavioral signal that a weaker model would expose.
+        self.model = model
 
     def run(self, ctx: TaskRunContext) -> AgentRunResult:
         message = ctx.user_message
@@ -68,10 +76,15 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
                 "HOME": str(sandbox),
                 **ctx.extra_env,
             }
+            argv = [self.hermes_command, "-z", message]
+            if self.model is not None:
+                # Insert before the -z so hermes parses it as a global flag,
+                # not as part of the -z prompt value.
+                argv = [self.hermes_command, "-m", self.model, "-z", message]
             start = time.time()
             try:
                 subprocess.run(
-                    [self.hermes_command, "-z", message],
+                    argv,
                     env=env,
                     cwd=str(ctx.fixture_dir),
                     capture_output=True,