Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions evolution/core/saturation_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def _classify_band(

if closed_loop_score is not None and closed_loop_score <= uniform_cl:
return "uniform_failure", [
"Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.",
"Try a stronger --closed-loop-agent-model.",
"Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.",
"Baseline scored 0 on every behavioral task — GEPA has nothing to optimize for.",
"First check the validator actually ran: look in run.log for a 'Stripped LiteLLM provider prefix' line confirming --closed-loop-agent-model routed correctly, and for a non-zero number of subprocess LM calls.",
"If the validator did run: try a stronger --closed-loop-agent-model, or harden the suite tasks so failure modes are interesting rather than 'model can't execute the task.'",
]

synthetic_saturated = holdout_score >= no_head_syn
Expand Down Expand Up @@ -200,7 +200,7 @@ def saturation_preflight(
"healthy": "Saturation check passed",
"no_headroom": "No measurable headroom",
"weak_signal": "Weak signal — expect a hard run",
"uniform_failure": "Uniform failure — validator too weak",
"uniform_failure": "Uniform failure — closed-loop scored zero on every task",
}

_BAND_STYLES: dict[SaturationBand, str] = {
Expand Down
9 changes: 9 additions & 0 deletions tests/core/test_saturation_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,15 @@ def test_uniform_failure_when_closed_loop_below_threshold(self):
)
assert band == "uniform_failure"
assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions)
# The "first check the validator actually ran" hint guards against
# the historical silent-failure: hermes -m treated litellm-formatted
# model strings as openrouter routing, broke auth, returned 0-turn
# sessions, and the framework reported it as "validator too weak."
# The hint points users at the run.log line that confirms routing.
assert any(
"stripped litellm" in s.lower() or "run.log" in s.lower() or "routed correctly" in s.lower()
for s in suggestions
)

def test_boundary_exactly_at_no_headroom_synthetic_triggers(self):
"""0.99 exactly should trigger no_headroom (>= comparison)."""
Expand Down
Loading