jramos · jramos · May 23, 2026 · May 23, 2026
diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
@@ -60,9 +60,9 @@ def _classify_band(
 
     if closed_loop_score is not None and closed_loop_score <= uniform_cl:
         return "uniform_failure", [
-            "Validator agent appears too weak to use the tool/skill — all behavioral tasks fail uniformly.",
-            "Try a stronger --closed-loop-agent-model.",
-            "Or harden the suite tasks so failure modes are interesting, not 'model can't execute'.",
+            "Baseline scored 0 on every behavioral task — GEPA has nothing to optimize for.",
+            "First check the validator actually ran: look in run.log for a 'Stripped LiteLLM provider prefix' line confirming --closed-loop-agent-model routed correctly, and for a non-zero number of subprocess LM calls.",
+            "If the validator did run: try a stronger --closed-loop-agent-model, or harden the suite tasks so failure modes are interesting rather than 'model can't execute the task.'",
         ]
 
     synthetic_saturated = holdout_score >= no_head_syn
@@ -200,7 +200,7 @@ def saturation_preflight(
     "healthy": "Saturation check passed",
     "no_headroom": "No measurable headroom",
     "weak_signal": "Weak signal — expect a hard run",
-    "uniform_failure": "Uniform failure — validator too weak",
+    "uniform_failure": "Uniform failure — closed-loop scored zero on every task",
 }
 
 _BAND_STYLES: dict[SaturationBand, str] = {

diff --git a/tests/core/test_saturation_check.py b/tests/core/test_saturation_check.py
@@ -50,6 +50,15 @@ def test_uniform_failure_when_closed_loop_below_threshold(self):
         )
         assert band == "uniform_failure"
         assert any("validator" in s.lower() or "stronger" in s.lower() for s in suggestions)
+        # The "first check the validator actually ran" hint guards against a
+        # historical silent failure: `hermes -m` treated LiteLLM-formatted
+        # model strings as OpenRouter routes, auth broke, sessions returned
+        # with 0 turns, and the framework reported it as "validator too weak."
+        # The hint points users at the run.log line that confirms routing.
+        assert any(
+            "stripped litellm" in s.lower() or "run.log" in s.lower() or "routed correctly" in s.lower()
+            for s in suggestions
+        )
 
     def test_boundary_exactly_at_no_headroom_synthetic_triggers(self):
         """0.99 exactly should trigger no_headroom (>= comparison)."""