Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions evolution/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ class EvolutionConfig:
# for the actionable hint surfaced to users.
reflection_minibatch_size: int = 3

# GEPA acceptance criterion. "improvement_or_equal" (default) also accepts
# plateau-equal (tied-score) candidates: under a noisy LM judge, strict
# acceptance would reject "true zero-difference" mutations ~50% of the
# time. "strict_improvement" preserves the implicit gepa<0.1.2 behavior.
# The value is forwarded verbatim as the acceptance_criterion kwarg of
# gepa.optimize via dspy.GEPA's gepa_kwargs passthrough (valid gepa
# values: "strict_improvement", "improvement_or_equal").
gepa_acceptance: str = "improvement_or_equal"

# Per-role model overrides. When set, treated as explicit LiteLLM model
# strings that bypass Hermes resolution. When None, get_lm() falls back
# to resolve_default_lm() against ~/.hermes/config.yaml + auth.json +
Expand Down
2 changes: 2 additions & 0 deletions evolution/core/run_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def build_run_inputs(
optimizer_model: str,
quality_gate_preset: str,
eval_source: str,
gepa_acceptance: str,
fitness_profile: Optional[str] = None,
enable_confusable_bucket: Optional[bool] = None,
) -> dict[str, Any]:
Expand All @@ -37,6 +38,7 @@ def build_run_inputs(
"holdout_ratio": config.holdout_ratio,
"quality_gate_preset": quality_gate_preset,
"eval_source": eval_source,
"gepa_acceptance": gepa_acceptance,
}
if fitness_profile is not None:
run_inputs["fitness_profile"] = fitness_profile
Expand Down
24 changes: 24 additions & 0 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ def _default_gepa_runner(
instruction_proposer=None,
reflection_model: Optional[str] = None,
reflection_minibatch_size: int = 3,
gepa_acceptance: str = "improvement_or_equal",
):
# max_tokens=32000 satisfies DSPy's reasoning-model floor of 16000
# (DSPy raises ValueError below that).
Expand Down Expand Up @@ -327,6 +328,7 @@ def _default_gepa_runner(
track_stats=True,
instruction_proposer=instruction_proposer,
reflection_minibatch_size=reflection_minibatch_size,
gepa_kwargs={"acceptance_criterion": gepa_acceptance},
)
return optimizer.compile(baseline_module, trainset=trainset, valset=valset)

Expand Down Expand Up @@ -385,6 +387,7 @@ def _build_optimizer_and_compile(
instruction_proposer=None,
reflection_model: Optional[str] = None,
reflection_minibatch_size: int = 3,
gepa_acceptance: str = "improvement_or_equal",
_gepa_runner=_default_gepa_runner,
_mipro_runner=_default_mipro_runner,
):
Expand All @@ -407,6 +410,7 @@ def _build_optimizer_and_compile(
instruction_proposer=instruction_proposer,
reflection_model=reflection_model,
reflection_minibatch_size=reflection_minibatch_size,
gepa_acceptance=gepa_acceptance,
)
return optimized, "GEPA"
except CostCeilingExceeded:
Expand Down Expand Up @@ -638,6 +642,7 @@ def evolve(
skip_saturation_check: bool = False,
force_saturation_check: bool = False,
gepa_minibatch_size: int = 3,
gepa_acceptance: str = "improvement-or-equal",
closed_loop_suite_path: Optional[Path] = None,
closed_loop_saturation_threshold: float = 0.95,
closed_loop_min_iters: int = 3,
Expand Down Expand Up @@ -689,6 +694,7 @@ def evolve(
if holdout_ratio is not None:
config_kwargs["holdout_ratio"] = holdout_ratio
config_kwargs["reflection_minibatch_size"] = gepa_minibatch_size
config_kwargs["gepa_acceptance"] = gepa_acceptance.replace("-", "_")
config = EvolutionConfig(**config_kwargs)
explicit_dirs = [Path(d) for d in (skill_source_dirs or [])]
if explicit_dirs:
Expand Down Expand Up @@ -1010,6 +1016,7 @@ def evolve(
instruction_proposer=proposer,
reflection_model=config.reflection_model,
reflection_minibatch_size=config.reflection_minibatch_size,
gepa_acceptance=config.gepa_acceptance,
)

elapsed = time.time() - start_time
Expand Down Expand Up @@ -1101,6 +1108,7 @@ def evolve(
optimizer_model=optimizer_model,
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
),
})
console.print(f" Saved failed variant to {failed_path}")
Expand Down Expand Up @@ -1141,6 +1149,7 @@ def evolve(
optimizer_model=optimizer_model,
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
)

use_cl_primary = (
Expand Down Expand Up @@ -1531,6 +1540,7 @@ def evolve(
optimizer_model=optimizer_model,
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
),
schema_version="5",
)
Expand Down Expand Up @@ -1826,6 +1836,18 @@ def evolve(
"to preserve the proposal count. Aborts at startup if the "
"value exceeds the trainset size.",
)
@click.option(
"--gepa-acceptance",
"gepa_acceptance",
default="improvement-or-equal",
type=click.Choice(["strict-improvement", "improvement-or-equal"]),
help="GEPA acceptance criterion. 'strict-improvement': only accept "
"candidates with strictly better minibatch score (legacy gepa<0.1.2 "
"default). 'improvement-or-equal' (default): allow plateau-equal "
"candidates for more lateral exploration — the literature-recommended "
"fix for noisy LM-judge fitness where strict acceptance rejects "
"~50% of true-equal mutations.",
)
@click.option(
"--closed-loop-during-evolution",
"closed_loop_suite_path",
Expand Down Expand Up @@ -1920,6 +1942,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
skip_saturation_check,
force_saturation_check,
gepa_minibatch_size,
gepa_acceptance,
closed_loop_suite_path,
closed_loop_saturation_threshold,
closed_loop_min_iters,
Expand Down Expand Up @@ -1968,6 +1991,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
skip_saturation_check=skip_saturation_check,
force_saturation_check=force_saturation_check,
gepa_minibatch_size=gepa_minibatch_size,
gepa_acceptance=gepa_acceptance,
closed_loop_suite_path=closed_loop_suite_path,
closed_loop_saturation_threshold=closed_loop_saturation_threshold,
closed_loop_min_iters=closed_loop_min_iters,
Expand Down
19 changes: 19 additions & 0 deletions evolution/tools/evolve_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,7 @@ def evolve(
skip_saturation_check: bool = False,
force_saturation_check: bool = False,
gepa_minibatch_size: int = 3,
gepa_acceptance: str = "improvement-or-equal",
) -> dict[str, Any]:
"""Evolve one tool description inside a manifest.

Expand Down Expand Up @@ -429,6 +430,7 @@ def evolve(
holdout_ratio=holdout_ratio,
enable_confusable_bucket=enable_confusable_bucket,
reflection_minibatch_size=gepa_minibatch_size,
gepa_acceptance=gepa_acceptance.replace("-", "_"),
)

console.print(
Expand Down Expand Up @@ -745,6 +747,7 @@ def evolve(
track_stats=True,
instruction_proposer=proposer,
reflection_minibatch_size=config.reflection_minibatch_size,
gepa_kwargs={"acceptance_criterion": config.gepa_acceptance},
)
optimized_module = optimizer.compile(
baseline_module, trainset=trainset, valset=valset,
Expand Down Expand Up @@ -800,6 +803,7 @@ def evolve(
optimizer_model=optimizer_model,
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
fitness_profile=fitness_profile,
enable_confusable_bucket=config.enable_confusable_bucket,
)
Expand Down Expand Up @@ -1239,6 +1243,7 @@ def evolve(
optimizer_model=optimizer_model,
quality_gate_preset=quality_gate,
eval_source=eval_source,
gepa_acceptance=config.gepa_acceptance,
fitness_profile=fitness_profile,
enable_confusable_bucket=config.enable_confusable_bucket,
)
Expand Down Expand Up @@ -1476,6 +1481,18 @@ def evolve(
"~10 to preserve the proposal count. Aborts at startup if the "
"value exceeds the trainset size.",
)
@click.option(
"--gepa-acceptance",
"gepa_acceptance",
default="improvement-or-equal",
type=click.Choice(["strict-improvement", "improvement-or-equal"]),
help="GEPA acceptance criterion. 'strict-improvement': only accept "
"candidates with strictly better minibatch score (legacy gepa<0.1.2 "
"default). 'improvement-or-equal' (default): allow plateau-equal "
"candidates for more lateral exploration — the literature-recommended "
"fix for noisy LM-judge fitness where strict acceptance rejects "
"~50% of true-equal mutations.",
)
@click.option(
"--closed-loop-in-valset/--no-closed-loop-in-valset",
"closed_loop_in_valset",
Expand Down Expand Up @@ -1530,6 +1547,7 @@ def main(
skip_saturation_check: bool,
force_saturation_check: bool,
gepa_minibatch_size: int,
gepa_acceptance: str,
closed_loop_suite_path: Optional[Path],
closed_loop_hermes_repo: Optional[Path],
closed_loop_saturation_threshold: float,
Expand Down Expand Up @@ -1583,6 +1601,7 @@ def main(
skip_saturation_check=skip_saturation_check,
force_saturation_check=force_saturation_check,
gepa_minibatch_size=gepa_minibatch_size,
gepa_acceptance=gepa_acceptance,
)
except HermesProviderError as exc:
# Render a clean error panel instead of dumping a Python traceback —
Expand Down
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ keywords = ["llm", "optimization", "evolution", "dspy", "gepa", "agent", "skill"

dependencies = [
"dspy>=3.2.0,<3.3",
# Pinned to PR-304 merge SHA for the acceptance_criterion API; swap to
# a PyPI version when 0.1.2 ships (latest released 0.1.1 predates the merge).
"gepa @ git+https://github.com/gepa-ai/gepa.git@5e24ee5c8e1857a62a1ba19731de9da45ffb6f1b",
# Pinned because lm_timing_callback.py uses litellm.failure_callback
# (a module-level list mutation API) and dspy.LM forwards
# request_timeout/num_retries to litellm. Both are stable at 1.82
Expand Down Expand Up @@ -47,6 +50,13 @@ dev = [
Homepage = "https://github.com/jramos/agent-self-evolution"
Repository = "https://github.com/jramos/agent-self-evolution"

[tool.uv]
# DSPy 3.2.0 hard-pins gepa[dspy]==0.0.27; override to take PR-304's
# acceptance_criterion API before 0.1.2 ships on PyPI.
override-dependencies = [
"gepa @ git+https://github.com/gepa-ai/gepa.git@5e24ee5c8e1857a62a1ba19731de9da45ffb6f1b",
]

[tool.setuptools.packages.find]
include = ["evolution*"]

Expand Down
22 changes: 22 additions & 0 deletions reports/calibration_findings.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,25 @@ The `--max-absolute-chars 12000` override applied to Stage 7 validation runs (so
## Audit trail

Full campaign artifacts — runbook, analysis scripts, `study_*_results.json`, and per-run `output/<skill>/<ts>/{gate_decision,band_holdout,metrics}.json` — live on the `archive/2026-deploy-gate-calibration` branch. No-op merge to main; the report and the one-line preset change are the only durable changes from this campaign.

## Path D — GEPA acceptance criterion (improvement-or-equal default)

The framework now defaults `acceptance_criterion` to `improvement_or_equal`
for the underlying `gepa.optimize` call, replacing the implicit strict
behavior baked into gepa <0.1.2. Flag: `--gepa-acceptance
{strict-improvement,improvement-or-equal}` on `evolve_skill` and `evolve_tool`.

Why: strict-elitist acceptance under noisy LM-judge fitness rejects
"true zero-difference" candidates roughly half the time, narrowing
search and reducing Pareto-frontier diversity for no benefit. GEPA's
Algorithm 1 only says "if σ′ improved" — the strict tiebreak was an
implementation artifact, not a paper claim. Literature on this is
unambiguous (Beyer 2000 on noisy elitism; Aizawa & Wah 1994 and
Rakshit et al. 2017 on threshold/relaxed acceptance for noisy fitness).

Upstream: gepa-ai/gepa PR #304 (merged 2026-04-06) introduced the
configurable `acceptance_criterion` API with `"strict_improvement"` and
`"improvement_or_equal"` shortcuts. We pin gepa to the merge SHA via a
git dependency until 0.1.2 ships on PyPI (0.1.1 was uploaded
2026-03-16, three weeks before the merge); migration path is a one-line
swap to `"gepa>=0.1.2"` once it's published.
8 changes: 8 additions & 0 deletions tests/core/test_run_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def test_skill_side_shape(self):
optimizer_model="openai/gpt-4.1",
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
)
assert set(result.keys()) == {
"seed",
Expand All @@ -39,7 +40,9 @@ def test_skill_side_shape(self):
"holdout_ratio",
"quality_gate_preset",
"eval_source",
"gepa_acceptance",
}
assert result["gepa_acceptance"] == "improvement_or_equal"

def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
config = _fake_config()
Expand All @@ -50,6 +53,7 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
optimizer_model="openai/gpt-4.1",
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="strict_improvement",
fitness_profile="balanced",
enable_confusable_bucket=True,
)
Expand All @@ -64,9 +68,11 @@ def test_tool_side_adds_fitness_profile_and_confusable_bucket(self):
"holdout_ratio",
"quality_gate_preset",
"eval_source",
"gepa_acceptance",
"fitness_profile",
"enable_confusable_bucket",
}
assert result["gepa_acceptance"] == "strict_improvement"
assert result["fitness_profile"] == "balanced"
assert result["enable_confusable_bucket"] is True

Expand All @@ -78,6 +84,7 @@ def test_resolved_lms_matches_helper_output(self):
optimizer_model="openai/gpt-4.1",
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
)
expected = resolved_lms_dump(
optimizer="openai/gpt-4.1",
Expand All @@ -98,6 +105,7 @@ def test_enable_confusable_bucket_round_trips_when_passed(self):
optimizer_model="openai/gpt-4.1",
quality_gate_preset="default",
eval_source="synthetic",
gepa_acceptance="improvement_or_equal",
fitness_profile="balanced",
enable_confusable_bucket=config.enable_confusable_bucket,
)
Expand Down
Loading
Loading