Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion docs/workflows.md
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,19 @@ Per-task scores are deterministic over candidate text (cache is keyed by `sha256

Default `--closed-loop-mode` is `feedback` (not `trainset`) on the skill side. Skill bodies mutate heavily, so the `gate_mode="always"` that trainset needs would fire the validator on every novel candidate — N tasks × 2 phases per fire. Opt into `trainset` / `both` explicitly when the cost is acceptable.

Reference suite: `evolution/validation/suites/systematic_debugging.jsonl` (5 planted-bug tasks). Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py`.
Reference suites:
- `evolution/validation/suites/systematic_debugging.jsonl` — 5 textbook bugs; good for verifying the wiring works.
- `evolution/validation/suites/systematic_debugging_advanced.jsonl` — 5 harder bugs (generator exhaustion, shared mutable return, float-precision equality, leftmost-insert boundary, class-vs-instance attribute) designed to discriminate skill-text variants on capable agent models that saturate the basic suite at 5/5.

When your daily-driver Hermes model is capable enough to solve every textbook bug regardless of skill text, the planted-bug verdict adds no signal. Three knobs to recover discrimination:

- `--closed-loop-during-evolution .../systematic_debugging_advanced.jsonl` — use the harder bugs (different cognitive failure modes).
- `--closed-loop-agent-model MODEL` — run the validator's agent against a different model than your `~/.hermes/config.yaml` default. Hermes sends `include: ['reasoning.encrypted_content']` so the model must be a reasoning model (o1-family, o3-family, o4-mini, gpt-5.x-family); non-reasoning models reject the request.
- `--closed-loop-task-timeout-seconds N` — bump the per-task wall-clock budget. The default is 120s; most reasoning models other than the smallest take 200–300s per debugging task and would otherwise abstain (timeout) without recording a verdict.

**Empirical caveat from validation.** Both suites saturate at 5/5 against capable reasoning models (`gpt-5.4-mini` saturated both; `o3-mini` was slow enough to abstain on most tasks at the default timeout). For a setup where the user's default model handles textbook Python debugging easily, the closed-loop signal on this domain may be uninformative regardless of skill text — the agent's raw capability dominates. Real headroom likely needs evaluation surfaces where methodology matters more than recognition: multi-file refactoring, ambiguous specs with edge cases the agent must enumerate, tasks requiring iterative hypothesis-testing across multiple test runs.

Manual smoke harness: `tests/manual/skill_closed_loop_smoke.py` (supports `--suite {basic,advanced}`, `--agent-model MODEL`, `--task-timeout-seconds N`).

## Failure-mode summary

Expand Down
42 changes: 40 additions & 2 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,8 @@ def _maybe_build_closed_loop_cache_skill(
min_iters: int,
window_size: int,
gate_mode: str = "sampled",
agent_model: Optional[str] = None,
agent_timeout_seconds: Optional[int] = None,
):
"""Build a ClosedLoopFeedbackCache for the skill path; return None when disabled.

Expand Down Expand Up @@ -506,7 +508,10 @@ def _maybe_build_closed_loop_cache_skill(
skill_name=skill_name,
workdir=workdir,
)
runner = HermesAgentRunner()
runner_kwargs: dict = {"model": agent_model}
if agent_timeout_seconds is not None:
runner_kwargs["timeout_seconds"] = agent_timeout_seconds
runner = HermesAgentRunner(**runner_kwargs)
validator = ClosedLoopValidator(installer=installer, runner=runner)
suite = TaskSuite.from_jsonl(suite_path)
return ClosedLoopFeedbackCache(
Expand Down Expand Up @@ -599,6 +604,8 @@ def evolve(
closed_loop_window_size: int = 8,
closed_loop_mode: str = "feedback",
closed_loop_in_valset: bool = False,
closed_loop_agent_model: Optional[str] = None,
closed_loop_task_timeout_seconds: Optional[int] = None,
):
"""Main evolution function — orchestrates the full optimization loop."""

Expand Down Expand Up @@ -835,6 +842,8 @@ def evolve(
min_iters=closed_loop_min_iters,
window_size=closed_loop_window_size,
gate_mode=_cache_gate_mode,
agent_model=closed_loop_agent_model,
agent_timeout_seconds=closed_loop_task_timeout_seconds,
)

# Build the metric once: DSPy's LM cache lines up across GEPA's
Expand Down Expand Up @@ -1547,6 +1556,31 @@ def evolve(
"scoring). Costs more — each accepted candidate triggers another full "
"eval pass over the behavioral examples. Default off.",
)
@click.option(
"--closed-loop-agent-model",
"closed_loop_agent_model",
default=None,
type=str,
help="Override the agent model the closed-loop validator runs `hermes -z` "
"with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
"uses whatever's in your ~/.hermes/config.yaml. Useful when your "
"daily-driver Hermes model is so capable it saturates the planted-bug "
"suite at 100%, hiding the behavioral signal closed-loop is supposed "
"to surface — run validation against a weaker model without touching "
"your config.",
)
@click.option(
"--closed-loop-task-timeout-seconds",
"closed_loop_task_timeout_seconds",
default=None,
type=click.IntRange(min=1),
help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
"subprocess (default 120s). Bump when --closed-loop-agent-model selects "
"a slow reasoning model that doesn't finish within the default — most "
"OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
"debugging task. Hitting the timeout abstains the task verdict rather "
"than failing it, so over-tight values silently produce no-signal runs.",
)
def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflection_model,
eval_model, skill_source_dir, dry_run, seed, budget, no_fallback,
quality_gate, growth_free_threshold,
Expand All @@ -1563,7 +1597,9 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
closed_loop_min_iters,
closed_loop_window_size,
closed_loop_mode,
closed_loop_in_valset):
closed_loop_in_valset,
closed_loop_agent_model,
closed_loop_task_timeout_seconds):
"""Evolve an agent skill using DSPy + GEPA optimization."""
try:
evolve(
Expand Down Expand Up @@ -1607,6 +1643,8 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
closed_loop_window_size=closed_loop_window_size,
closed_loop_mode=closed_loop_mode,
closed_loop_in_valset=closed_loop_in_valset,
closed_loop_agent_model=closed_loop_agent_model,
closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
)
except HermesProviderError as exc:
# Render a clean error panel instead of dumping a Python traceback
Expand Down
39 changes: 38 additions & 1 deletion evolution/tools/evolve_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,8 @@ def _maybe_build_closed_loop_cache(
min_iters: int,
window_size: int,
gate_mode: str = "sampled",
agent_model: Optional[str] = None,
agent_timeout_seconds: Optional[int] = None,
):
"""Build a ClosedLoopFeedbackCache when the user opted in, else None.

Expand All @@ -286,7 +288,10 @@ def _maybe_build_closed_loop_cache(
installer = HermesToolDescriptionInstaller(
hermes_repo=hermes_repo, tool_name=tool_name
)
runner = HermesAgentRunner()
runner_kwargs: dict = {"model": agent_model}
if agent_timeout_seconds is not None:
runner_kwargs["timeout_seconds"] = agent_timeout_seconds
runner = HermesAgentRunner(**runner_kwargs)
validator = ClosedLoopValidator(installer=installer, runner=runner)
suite = TaskSuite.from_jsonl(suite_path)
return ClosedLoopFeedbackCache(
Expand Down Expand Up @@ -359,6 +364,8 @@ def evolve(
closed_loop_window_size: int = 8,
closed_loop_mode: str = "feedback",
closed_loop_in_valset: bool = False,
closed_loop_agent_model: Optional[str] = None,
closed_loop_task_timeout_seconds: Optional[int] = None,
skip_preflight: bool = False,
skip_cost_suggest: bool = False,
) -> dict[str, Any]:
Expand Down Expand Up @@ -592,6 +599,8 @@ def evolve(
min_iters=closed_loop_min_iters,
window_size=closed_loop_window_size,
gate_mode=cache_gate_mode,
agent_model=closed_loop_agent_model,
agent_timeout_seconds=closed_loop_task_timeout_seconds,
)
metric = make_tool_fitness_metric(
judge=judge,
Expand Down Expand Up @@ -1187,6 +1196,30 @@ def evolve(
"scoring). Costs more — each accepted candidate triggers another full "
"eval pass over the behavioral examples. Default off.",
)
@click.option(
"--closed-loop-agent-model",
"closed_loop_agent_model",
default=None,
type=str,
help="Override the agent model the closed-loop validator runs `hermes -z` "
"with (passed as `hermes -m MODEL -z ...`). When unset, the validator "
"uses whatever's in your ~/.hermes/config.yaml. Useful when your "
"daily-driver Hermes model saturates the planted-bug suite at 100%, "
"hiding the behavioral signal — run validation against a weaker model "
"without touching your config.",
)
@click.option(
"--closed-loop-task-timeout-seconds",
"closed_loop_task_timeout_seconds",
default=None,
type=click.IntRange(min=1),
help="Per-task wall-clock budget for the closed-loop validator's `hermes -z` "
"subprocess (default 120s). Bump when --closed-loop-agent-model selects "
"a slow reasoning model that doesn't finish within the default — most "
"OpenAI reasoning models (o1-family, o3-family) take 60-180s per "
"debugging task. Hitting the timeout abstains the task verdict rather "
"than failing it, so over-tight values silently produce no-signal runs.",
)
def main(
tool_name: str,
manifest_path: Path,
Expand All @@ -1212,6 +1245,8 @@ def main(
closed_loop_window_size: int,
closed_loop_mode: str,
closed_loop_in_valset: bool,
closed_loop_agent_model: Optional[str],
closed_loop_task_timeout_seconds: Optional[int],
) -> None:
"""Evolve one tool description in an MCP manifest using DSPy + GEPA."""
if apply_flag and patch_flag:
Expand Down Expand Up @@ -1249,6 +1284,8 @@ def main(
closed_loop_window_size=closed_loop_window_size,
closed_loop_mode=closed_loop_mode,
closed_loop_in_valset=closed_loop_in_valset,
closed_loop_agent_model=closed_loop_agent_model,
closed_loop_task_timeout_seconds=closed_loop_task_timeout_seconds,
skip_preflight=skip_preflight,
skip_cost_suggest=skip_cost_suggest,
)
Expand Down
15 changes: 14 additions & 1 deletion evolution/validation/hermes_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def __init__(
hermes_command: str = "hermes",
timeout_seconds: int = DEFAULT_TASK_TIMEOUT_SECONDS,
user_config_path: Optional[Path] = None,
model: Optional[str] = None,
) -> None:
self.hermes_command = hermes_command
self.timeout_seconds = timeout_seconds
Expand All @@ -56,6 +57,13 @@ def __init__(
if user_config_path is not None
else Path.home() / ".hermes" / "config.yaml"
)
# Optional per-invocation model override (passed as ``hermes -m
# <model> -z ...``). When unset, Hermes uses whatever is configured in
# the sandboxed ``config.yaml``. Useful for closed-loop validation
# against a deliberately weaker agent model than the user's
# daily-driver default — saturation on capable models hides
# behavioral signal that a weaker model would expose.
self.model = model

def run(self, ctx: TaskRunContext) -> AgentRunResult:
message = ctx.user_message
Expand All @@ -68,10 +76,15 @@ def run(self, ctx: TaskRunContext) -> AgentRunResult:
"HOME": str(sandbox),
**ctx.extra_env,
}
argv = [self.hermes_command, "-z", message]
if self.model is not None:
# Insert before the -z so hermes parses it as a global flag,
# not as part of the -z prompt value.
argv = [self.hermes_command, "-m", self.model, "-z", message]
start = time.time()
try:
subprocess.run(
[self.hermes_command, "-z", message],
argv,
env=env,
cwd=str(ctx.fixture_dir),
capture_output=True,
Expand Down
Loading
Loading