Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions evolution/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,17 @@ class EvolutionConfig:
iterations: int = 10
population_size: int = 5

# GEPA's reflective minibatch size — the number of training examples
# sampled per reflective step for the sum() acceptance gate at
# gepa/core/engine.py:491-493. Default 3 matches GEPA's own default
# (no behavior change). Users hitting the weak_signal saturation
# band can bump this to ~8 to widen the sampling window so
# discriminating examples appear more often per minibatch — see
# reports/pareto_frontier_feasibility.md spike #2 for the
# motivating case and saturation_check.py's weak_signal suggestions
# for the actionable hint surfaced to users.
reflection_minibatch_size: int = 3

# Per-role model overrides. When set, treated as explicit LiteLLM model
# strings that bypass Hermes resolution. When None, get_lm() falls back
# to resolve_default_lm() against ~/.hermes/config.yaml + auth.json +
Expand Down
4 changes: 2 additions & 2 deletions evolution/core/saturation_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def _classify_band(
):
return "weak_signal", [
"Judge saturating but closed-loop has signal; GEPA's small-minibatch acceptance will struggle.",
"Expect many proposals rejected — bump --iterations above 5.",
"Larger minibatch (Path E follow-up) would help once landed.",
"Try --gepa-minibatch-size 8 (default 3) — widens the sampling window so discriminating examples appear in ~68% of minibatches vs ~34% at default.",
"Larger minibatch means fewer proposals per budget: on evolve_tool bump --iterations to ~10, on evolve_skill use --budget heavy.",
]

return "healthy", []
Expand Down
38 changes: 38 additions & 0 deletions evolution/skills/evolve_skill.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ def _default_gepa_runner(
seed: int,
instruction_proposer=None,
reflection_model: Optional[str] = None,
reflection_minibatch_size: int = 3,
):
# max_tokens=32000 satisfies DSPy's reasoning-model floor of 16000
# (DSPy raises ValueError below that).
Expand Down Expand Up @@ -298,6 +299,7 @@ def _default_gepa_runner(
# (.candidates, .val_aggregate_scores) on the returned module.
track_stats=True,
instruction_proposer=instruction_proposer,
reflection_minibatch_size=reflection_minibatch_size,
)
return optimizer.compile(baseline_module, trainset=trainset, valset=valset)

Expand Down Expand Up @@ -355,6 +357,7 @@ def _build_optimizer_and_compile(
failure_log_path: Optional[Path] = None,
instruction_proposer=None,
reflection_model: Optional[str] = None,
reflection_minibatch_size: int = 3,
_gepa_runner=_default_gepa_runner,
_mipro_runner=_default_mipro_runner,
):
Expand All @@ -376,6 +379,7 @@ def _build_optimizer_and_compile(
seed=seed,
instruction_proposer=instruction_proposer,
reflection_model=reflection_model,
reflection_minibatch_size=reflection_minibatch_size,
)
return optimized, "GEPA"
except CostCeilingExceeded:
Expand Down Expand Up @@ -606,6 +610,7 @@ def evolve(
skip_cost_suggest: bool = False,
skip_saturation_check: bool = False,
force_saturation_check: bool = False,
gepa_minibatch_size: int = 3,
closed_loop_suite_path: Optional[Path] = None,
closed_loop_saturation_threshold: float = 0.95,
closed_loop_min_iters: int = 3,
Expand Down Expand Up @@ -656,6 +661,7 @@ def evolve(
config_kwargs["eval_dataset_size"] = eval_dataset_size
if holdout_ratio is not None:
config_kwargs["holdout_ratio"] = holdout_ratio
config_kwargs["reflection_minibatch_size"] = gepa_minibatch_size
config = EvolutionConfig(**config_kwargs)
explicit_dirs = [Path(d) for d in (skill_source_dirs or [])]
if explicit_dirs:
Expand Down Expand Up @@ -793,6 +799,18 @@ def evolve(
)
sys.exit(1)

# Guard: GEPA's reflective batch sampler asserts
# len(trainset) >= reflection_minibatch_size mid-optimization
# (gepa/strategies/batch_sampler.py). Catch the misconfiguration
# at startup with an actionable message instead.
if config.reflection_minibatch_size > len(dataset.train):
console.print(
f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} "
f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ "
f"{len(dataset.train)} or increase --eval-dataset-size.[/red]"
)
sys.exit(1)

# Static checks only — the growth-with-quality gate runs later on
# the evolved artifact once there's a holdout improvement signal.
console.print(f"\n[bold]Validating baseline constraints[/bold]")
Expand Down Expand Up @@ -953,6 +971,7 @@ def evolve(
failure_log_path=failure_log_path,
instruction_proposer=proposer,
reflection_model=config.reflection_model,
reflection_minibatch_size=config.reflection_minibatch_size,
)

elapsed = time.time() - start_time
Expand Down Expand Up @@ -1566,6 +1585,23 @@ def evolve(
"regardless of band. Required to override a non-healthy verdict "
"in non-interactive contexts (no TTY).",
)
@click.option(
"--gepa-minibatch-size",
"gepa_minibatch_size",
default=3,
type=click.IntRange(min=1),
help="GEPA's reflective minibatch size — number of training examples "
"sampled per reflective step for the sum() acceptance gate. "
"Default 3 matches GEPA's own default. Bump to ~8 when the "
"saturation pre-flight flags the weak_signal band: the wider "
"sampling window makes discriminating examples appear in "
"~68% of minibatches vs ~34% at default. Trade-off: larger "
"minibatch means each accepted proposal consumes more of the "
"metric-call budget. The skill pipeline uses --budget (not "
"--iterations) for its budget knob, so consider --budget heavy "
"to preserve the proposal count. Aborts at startup if the "
"value exceeds the trainset size.",
)
@click.option(
"--closed-loop-during-evolution",
"closed_loop_suite_path",
Expand Down Expand Up @@ -1659,6 +1695,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
skip_cost_suggest,
skip_saturation_check,
force_saturation_check,
gepa_minibatch_size,
closed_loop_suite_path,
closed_loop_saturation_threshold,
closed_loop_min_iters,
Expand Down Expand Up @@ -1706,6 +1743,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
skip_cost_suggest=skip_cost_suggest,
skip_saturation_check=skip_saturation_check,
force_saturation_check=force_saturation_check,
gepa_minibatch_size=gepa_minibatch_size,
closed_loop_suite_path=closed_loop_suite_path,
closed_loop_saturation_threshold=closed_loop_saturation_threshold,
closed_loop_min_iters=closed_loop_min_iters,
Expand Down
33 changes: 33 additions & 0 deletions evolution/tools/evolve_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,7 @@ def evolve(
skip_cost_suggest: bool = False,
skip_saturation_check: bool = False,
force_saturation_check: bool = False,
gepa_minibatch_size: int = 3,
) -> dict[str, Any]:
"""Evolve one tool description inside a manifest.

Expand Down Expand Up @@ -421,6 +422,7 @@ def evolve(
eval_dataset_size=eval_dataset_size,
holdout_ratio=holdout_ratio,
enable_confusable_bucket=enable_confusable_bucket,
reflection_minibatch_size=gepa_minibatch_size,
)

console.print(
Expand Down Expand Up @@ -573,6 +575,18 @@ def evolve(
)
sys.exit(1)

# Guard: GEPA's reflective batch sampler asserts
# len(trainset) >= reflection_minibatch_size mid-optimization
# (gepa/strategies/batch_sampler.py). Catch the misconfiguration
# at startup with an actionable message instead.
if config.reflection_minibatch_size > len(dataset.train):
console.print(
f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} "
f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ "
f"{len(dataset.train)} or increase --eval-dataset-size.[/red]"
)
sys.exit(1)

console.print(f"\n[bold]Validating baseline description[/bold]")
validator = ConstraintValidator(config)
baseline_constraints = validator.validate_static(baseline_description, "tool_description")
Expand Down Expand Up @@ -712,6 +726,7 @@ def evolve(
seed=config.seed,
track_stats=True,
instruction_proposer=proposer,
reflection_minibatch_size=config.reflection_minibatch_size,
)
optimized_module = optimizer.compile(
baseline_module, trainset=trainset, valset=valset,
Expand Down Expand Up @@ -1255,6 +1270,22 @@ def evolve(
"in non-interactive contexts (no TTY). Without this in such a "
"context, the framework exits cleanly without spending GEPA budget.",
)
@click.option(
"--gepa-minibatch-size",
"gepa_minibatch_size",
default=3,
type=click.IntRange(min=1),
help="GEPA's reflective minibatch size — number of training examples "
"sampled per reflective step for the sum() acceptance gate. "
"Default 3 matches GEPA's own default. Bump to ~8 when the "
"saturation pre-flight flags the weak_signal band: the wider "
"sampling window makes discriminating examples appear in "
"~68% of minibatches vs ~34% at default. Trade-off: larger "
"minibatch means each accepted proposal consumes more of the "
"metric-call budget, so consider also bumping --iterations to "
"~10 to preserve the proposal count. Aborts at startup if the "
"value exceeds the trainset size.",
)
@click.option(
"--closed-loop-in-valset/--no-closed-loop-in-valset",
"closed_loop_in_valset",
Expand Down Expand Up @@ -1308,6 +1339,7 @@ def main(
skip_cost_suggest: bool,
skip_saturation_check: bool,
force_saturation_check: bool,
gepa_minibatch_size: int,
closed_loop_suite_path: Optional[Path],
closed_loop_hermes_repo: Optional[Path],
closed_loop_saturation_threshold: float,
Expand Down Expand Up @@ -1360,6 +1392,7 @@ def main(
skip_cost_suggest=skip_cost_suggest,
skip_saturation_check=skip_saturation_check,
force_saturation_check=force_saturation_check,
gepa_minibatch_size=gepa_minibatch_size,
)
except HermesProviderError as exc:
# Render a clean error panel instead of dumping a Python traceback —
Expand Down
94 changes: 94 additions & 0 deletions tests/skills/test_evolve_skill_saturation_preflight.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,3 +280,97 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
f"Expected baseline holdout to be reused from preflight cache "
f"(1 call for evolved only), got {mock_holdout_eval.call_count}"
)


class TestGepaMinibatchSizeFlag:
    """CLI coverage for the ``--gepa-minibatch-size`` flag.

    Two behaviours are pinned here:

    * the flag value is forwarded to ``dspy.GEPA`` as its
      ``reflection_minibatch_size`` kwarg, and
    * a value larger than the trainset is rejected by the guard that runs
      after the dataset is built, with an actionable message instead of an
      opaque assertion deep inside GEPA.
    """

    @staticmethod
    def _healthy_report():
        """Return the ``healthy``-band SaturationReport that both tests feed
        to the patched ``saturation_preflight``."""
        from evolution.core.saturation_check import SaturationReport

        return SaturationReport(
            band="healthy",
            holdout_score=0.6,
            holdout_n=10,
            holdout_per_example=[0.6] * 10,
            suggestions=[],
            thresholds={},
        )

    def test_flag_passes_through_to_dspy_gepa(self, skill_dir):
        """Run the CLI with ``--gepa-minibatch-size 7`` and check GEPA saw 7.

        ``dspy.GEPA.__init__`` is wrapped so that, after the real
        constructor has run, the instance's ``reflection_minibatch_size``
        attribute is recorded for the final assertion.
        """
        healthy = self._healthy_report()
        from evolution.skills.knee_point import CandidatePick

        seen: dict = {}
        real_init = __import__("dspy").GEPA.__init__

        def recording_init(gepa, *args, **kwargs):
            # Delegate to the genuine constructor first so the attribute
            # exists, then snapshot it.
            real_init(gepa, *args, **kwargs)
            seen["reflection_minibatch_size"] = gepa.reflection_minibatch_size

        evolved_module = MagicMock()
        evolved_module.skill_text = "evolved skill text"
        pick = CandidatePick(
            module=evolved_module,
            skill_text="evolved skill text",
            body_chars=18,
            val_score=0.8,
            val_rank_in_band=1,
            band_size=1,
            epsilon=0.1,
            fallback="knee",
            picked_idx=0,
            gepa_default_idx=0,
            gepa_default_body_chars=18,
            band_roster=[],
        )
        dataset_builder = MagicMock()
        dataset_builder.generate.return_value = _fake_skill_dataset()

        # Patchers are created up front and only activated by the ``with``
        # statement below, in the same order as listed here.
        builder_patch = patch(
            "evolution.skills.evolve_skill.SyntheticDatasetBuilder",
            return_value=dataset_builder,
        )
        preflight_patch = patch(
            "evolution.skills.evolve_skill.saturation_preflight",
            return_value=healthy,
        )
        credentials_patch = patch(
            "evolution.skills.evolve_skill._preflight_lm_credentials"
        )
        init_patch = patch(
            "evolution.skills.evolve_skill.dspy.GEPA.__init__", recording_init
        )
        compile_patch = patch(
            "evolution.skills.evolve_skill.dspy.GEPA.compile",
            return_value=evolved_module,
        )
        knee_patch = patch(
            "evolution.skills.evolve_skill.select_knee_point",
            return_value=pick,
        )
        holdout_patch = patch(
            "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
            return_value=(0.6, [0.6] * 10),
        )

        cli_args = [
            "--skill", "demo-skill",
            "--skill-source-dir", str(skill_dir),
            "--iterations", "1",
            "--no-preflight",
            "--gepa-minibatch-size", "7",
        ]
        with builder_patch, preflight_patch, credentials_patch, init_patch, \
                compile_patch, knee_patch, holdout_patch:
            result = CliRunner().invoke(evolve_skill_main, cli_args)

        assert seen.get("reflection_minibatch_size") == 7, (
            f"Expected dspy.GEPA.reflection_minibatch_size=7; got "
            f"{seen!r}. CLI output: {result.output}"
        )

    def test_minibatch_exceeding_trainset_aborts_at_startup(self, skill_dir):
        """An oversized ``--gepa-minibatch-size`` must exit 1 before GEPA runs.

        The CLI is expected to print the "exceeds trainset size" message and
        never construct ``dspy.GEPA``, rather than failing later on a
        mid-optimization assertion inside EpochShuffledBatchSampler.
        """
        healthy = self._healthy_report()

        # _fake_skill_dataset() is expected to yield 30 train examples,
        # so a minibatch size of 1000 exceeds it.
        dataset_builder = MagicMock()
        dataset_builder.generate.return_value = _fake_skill_dataset()
        gepa_mock = MagicMock()

        builder_patch = patch(
            "evolution.skills.evolve_skill.SyntheticDatasetBuilder",
            return_value=dataset_builder,
        )
        preflight_patch = patch(
            "evolution.skills.evolve_skill.saturation_preflight",
            return_value=healthy,
        )
        credentials_patch = patch(
            "evolution.skills.evolve_skill._preflight_lm_credentials"
        )
        gepa_patch = patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock)

        cli_args = [
            "--skill", "demo-skill",
            "--skill-source-dir", str(skill_dir),
            "--iterations", "1",
            "--no-preflight",
            "--gepa-minibatch-size", "1000",
        ]
        with builder_patch, preflight_patch, credentials_patch, gepa_patch:
            result = CliRunner().invoke(evolve_skill_main, cli_args)

        assert result.exit_code == 1, (
            f"Expected exit 1 from trainset-ceiling guard, got "
            f"{result.exit_code}. Output: {result.output}"
        )
        assert "exceeds trainset size" in result.output
        gepa_mock.assert_not_called()
Loading
Loading