jramos · jramos · May 22, 2026 · May 22, 2026
diff --git a/evolution/core/config.py b/evolution/core/config.py
@@ -26,6 +26,17 @@ class EvolutionConfig:
     iterations: int = 10
     population_size: int = 5
 
+    # GEPA's reflective minibatch size — the number of training examples
+    # sampled per reflective step for the sum() acceptance gate at
+    # gepa/core/engine.py:491-493. Default 3 matches GEPA's own default
+    # (no behavior change). Users hitting the weak_signal saturation
+    # band can bump this to ~8 to widen the sampling window so
+    # discriminating examples appear more often per minibatch — see
+    # reports/pareto_frontier_feasibility.md spike #2 for the
+    # motivating case and saturation_check.py's weak_signal suggestions
+    # for the actionable hint surfaced to users.
+    reflection_minibatch_size: int = 3
+
     # Per-role model overrides. When set, treated as explicit LiteLLM model
     # strings that bypass Hermes resolution. When None, get_lm() falls back
     # to resolve_default_lm() against ~/.hermes/config.yaml + auth.json +

diff --git a/evolution/core/saturation_check.py b/evolution/core/saturation_check.py
@@ -97,8 +97,8 @@ def _classify_band(
     ):
         return "weak_signal", [
             "Judge saturating but closed-loop has signal; GEPA's small-minibatch acceptance will struggle.",
-            "Expect many proposals rejected — bump --iterations above 5.",
-            "Larger minibatch (Path E follow-up) would help once landed.",
+            "Try --gepa-minibatch-size 8 (default 3) — widens the sampling window so discriminating examples appear in ~68% of minibatches vs ~34% at default.",
+            "Larger minibatch means fewer proposals per budget: on evolve_tool bump --iterations to ~10, on evolve_skill use --budget heavy.",
         ]
 
     return "healthy", []

diff --git a/evolution/skills/evolve_skill.py b/evolution/skills/evolve_skill.py
@@ -270,6 +270,7 @@ def _default_gepa_runner(
     seed: int,
     instruction_proposer=None,
     reflection_model: Optional[str] = None,
+    reflection_minibatch_size: int = 3,
 ):
     # max_tokens=32000 satisfies DSPy's reasoning-model floor of 16000
     # (DSPy raises ValueError below that).
@@ -298,6 +299,7 @@ def _default_gepa_runner(
         # (.candidates, .val_aggregate_scores) on the returned module.
         track_stats=True,
         instruction_proposer=instruction_proposer,
+        reflection_minibatch_size=reflection_minibatch_size,
     )
     return optimizer.compile(baseline_module, trainset=trainset, valset=valset)
 
@@ -355,6 +357,7 @@ def _build_optimizer_and_compile(
     failure_log_path: Optional[Path] = None,
     instruction_proposer=None,
     reflection_model: Optional[str] = None,
+    reflection_minibatch_size: int = 3,
     _gepa_runner=_default_gepa_runner,
     _mipro_runner=_default_mipro_runner,
 ):
@@ -376,6 +379,7 @@ def _build_optimizer_and_compile(
             seed=seed,
             instruction_proposer=instruction_proposer,
             reflection_model=reflection_model,
+            reflection_minibatch_size=reflection_minibatch_size,
         )
         return optimized, "GEPA"
     except CostCeilingExceeded:
@@ -606,6 +610,7 @@ def evolve(
     skip_cost_suggest: bool = False,
     skip_saturation_check: bool = False,
     force_saturation_check: bool = False,
+    gepa_minibatch_size: int = 3,
     closed_loop_suite_path: Optional[Path] = None,
     closed_loop_saturation_threshold: float = 0.95,
     closed_loop_min_iters: int = 3,
@@ -656,6 +661,7 @@ def evolve(
         config_kwargs["eval_dataset_size"] = eval_dataset_size
     if holdout_ratio is not None:
         config_kwargs["holdout_ratio"] = holdout_ratio
+    config_kwargs["reflection_minibatch_size"] = gepa_minibatch_size
     config = EvolutionConfig(**config_kwargs)
     explicit_dirs = [Path(d) for d in (skill_source_dirs or [])]
     if explicit_dirs:
@@ -793,6 +799,18 @@ def evolve(
                 )
                 sys.exit(1)
 
+            # Guard: GEPA's reflective batch sampler asserts
+            # len(trainset) >= reflection_minibatch_size mid-optimization
+            # (gepa/strategies/batch_sampler.py). Catch the misconfiguration
+            # at startup with an actionable message instead.
+            if config.reflection_minibatch_size > len(dataset.train):
+                console.print(
+                    f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} "
+                    f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ "
+                    f"{len(dataset.train)} or increase --eval-dataset-size.[/red]"
+                )
+                sys.exit(1)
+
             # Static checks only — the growth-with-quality gate runs later on
             # the evolved artifact once there's a holdout improvement signal.
             console.print(f"\n[bold]Validating baseline constraints[/bold]")
@@ -953,6 +971,7 @@ def evolve(
                 failure_log_path=failure_log_path,
                 instruction_proposer=proposer,
                 reflection_model=config.reflection_model,
+                reflection_minibatch_size=config.reflection_minibatch_size,
             )
 
             elapsed = time.time() - start_time
@@ -1566,6 +1585,23 @@ def evolve(
          "regardless of band. Required to override a non-healthy verdict "
          "in non-interactive contexts (no TTY).",
 )
+@click.option(
+    "--gepa-minibatch-size",
+    "gepa_minibatch_size",
+    default=3,
+    type=click.IntRange(min=1),
+    help="GEPA's reflective minibatch size — number of training examples "
+         "sampled per reflective step for the sum() acceptance gate. "
+         "Default 3 matches GEPA's own default. Bump to ~8 when the "
+         "saturation pre-flight flags the weak_signal band: the wider "
+         "sampling window makes discriminating examples appear in "
+         "~68% of minibatches vs ~34% at default. Trade-off: larger "
+         "minibatch means each proposal consumes more of the "
+         "metric-call budget. The skill pipeline uses --budget (not "
+         "--iterations) for its budget knob, so consider --budget heavy "
+         "to preserve the proposal count. Aborts at startup if the "
+         "value exceeds the trainset size.",
+)
 @click.option(
     "--closed-loop-during-evolution",
     "closed_loop_suite_path",
@@ -1659,6 +1695,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
          skip_cost_suggest,
          skip_saturation_check,
          force_saturation_check,
+         gepa_minibatch_size,
          closed_loop_suite_path,
          closed_loop_saturation_threshold,
          closed_loop_min_iters,
@@ -1706,6 +1743,7 @@ def main(skill, iterations, eval_source, dataset_path, optimizer_model, reflecti
             skip_cost_suggest=skip_cost_suggest,
             skip_saturation_check=skip_saturation_check,
             force_saturation_check=force_saturation_check,
+            gepa_minibatch_size=gepa_minibatch_size,
             closed_loop_suite_path=closed_loop_suite_path,
             closed_loop_saturation_threshold=closed_loop_saturation_threshold,
             closed_loop_min_iters=closed_loop_min_iters,

diff --git a/evolution/tools/evolve_tool.py b/evolution/tools/evolve_tool.py
@@ -376,6 +376,7 @@ def evolve(
     skip_cost_suggest: bool = False,
     skip_saturation_check: bool = False,
     force_saturation_check: bool = False,
+    gepa_minibatch_size: int = 3,
 ) -> dict[str, Any]:
     """Evolve one tool description inside a manifest.
 
@@ -421,6 +422,7 @@ def evolve(
         eval_dataset_size=eval_dataset_size,
         holdout_ratio=holdout_ratio,
         enable_confusable_bucket=enable_confusable_bucket,
+        reflection_minibatch_size=gepa_minibatch_size,
     )
 
     console.print(
@@ -573,6 +575,18 @@ def evolve(
                 )
                 sys.exit(1)
 
+            # Guard: GEPA's reflective batch sampler asserts
+            # len(trainset) >= reflection_minibatch_size mid-optimization
+            # (gepa/strategies/batch_sampler.py). Catch the misconfiguration
+            # at startup with an actionable message instead.
+            if config.reflection_minibatch_size > len(dataset.train):
+                console.print(
+                    f"[red]✗ --gepa-minibatch-size={config.reflection_minibatch_size} "
+                    f"exceeds trainset size {len(dataset.train)}. Pick a value ≤ "
+                    f"{len(dataset.train)} or increase --eval-dataset-size.[/red]"
+                )
+                sys.exit(1)
+
             console.print(f"\n[bold]Validating baseline description[/bold]")
             validator = ConstraintValidator(config)
             baseline_constraints = validator.validate_static(baseline_description, "tool_description")
@@ -712,6 +726,7 @@ def evolve(
                 seed=config.seed,
                 track_stats=True,
                 instruction_proposer=proposer,
+                reflection_minibatch_size=config.reflection_minibatch_size,
             )
             optimized_module = optimizer.compile(
                 baseline_module, trainset=trainset, valset=valset,
@@ -1255,6 +1270,22 @@ def evolve(
          "in non-interactive contexts (no TTY). Without this in such a "
          "context, the framework exits cleanly without spending GEPA budget.",
 )
+@click.option(
+    "--gepa-minibatch-size",
+    "gepa_minibatch_size",
+    default=3,
+    type=click.IntRange(min=1),
+    help="GEPA's reflective minibatch size — number of training examples "
+         "sampled per reflective step for the sum() acceptance gate. "
+         "Default 3 matches GEPA's own default. Bump to ~8 when the "
+         "saturation pre-flight flags the weak_signal band: the wider "
+         "sampling window makes discriminating examples appear in "
+         "~68% of minibatches vs ~34% at default. Trade-off: larger "
+         "minibatch means each proposal consumes more of the "
+         "metric-call budget, so consider also bumping --iterations to "
+         "~10 to preserve the proposal count. Aborts at startup if the "
+         "value exceeds the trainset size.",
+)
 @click.option(
     "--closed-loop-in-valset/--no-closed-loop-in-valset",
     "closed_loop_in_valset",
@@ -1308,6 +1339,7 @@ def main(
     skip_cost_suggest: bool,
     skip_saturation_check: bool,
     force_saturation_check: bool,
+    gepa_minibatch_size: int,
     closed_loop_suite_path: Optional[Path],
     closed_loop_hermes_repo: Optional[Path],
     closed_loop_saturation_threshold: float,
@@ -1360,6 +1392,7 @@ def main(
             skip_cost_suggest=skip_cost_suggest,
             skip_saturation_check=skip_saturation_check,
             force_saturation_check=force_saturation_check,
+            gepa_minibatch_size=gepa_minibatch_size,
         )
     except HermesProviderError as exc:
         # Render a clean error panel instead of dumping a Python traceback —

diff --git a/tests/skills/test_evolve_skill_saturation_preflight.py b/tests/skills/test_evolve_skill_saturation_preflight.py
@@ -280,3 +280,97 @@ def test_cache_reuse_skips_baseline_re_eval_after_gepa(self, skill_dir):
                 f"Expected baseline holdout to be reused from preflight cache "
                 f"(1 call for evolved only), got {mock_holdout_eval.call_count}"
             )
+
+
+class TestGepaMinibatchSizeFlag:
+    """--gepa-minibatch-size threads through to dspy.GEPA's
+    reflection_minibatch_size kwarg, and the post-dataset-build guard
+    rejects values that exceed the trainset size with an actionable
+    message instead of an opaque assertion deep inside GEPA."""
+
+    def test_flag_passes_through_to_dspy_gepa(self, skill_dir):
+        """Wrap dspy.GEPA.__init__ so the real constructor still runs and the
+        instance's reflection_minibatch_size is recorded (compile is stubbed).
+        Invoke the CLI with --gepa-minibatch-size 7 and assert 7 arrived."""
+        from evolution.core.saturation_check import SaturationReport
+        from evolution.skills.knee_point import CandidatePick
+        captured: dict = {}
+        original_init = __import__("dspy").GEPA.__init__
+
+        def recording_init(self, *args, **kwargs):
+            original_init(self, *args, **kwargs)
+            captured["reflection_minibatch_size"] = self.reflection_minibatch_size
+
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.6, holdout_n=10,
+            holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
+        )
+        fake_module = MagicMock()
+        fake_module.skill_text = "evolved skill text"
+        knee_pick = CandidatePick(
+            module=fake_module, skill_text="evolved skill text", body_chars=18,
+            val_score=0.8, val_rank_in_band=1, band_size=1, epsilon=0.1,
+            fallback="knee", picked_idx=0, gepa_default_idx=0,
+            gepa_default_body_chars=18, band_roster=[],
+        )
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
+        with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA.__init__", recording_init), patch(
+            "evolution.skills.evolve_skill.dspy.GEPA.compile", return_value=fake_module
+        ), patch(
+            "evolution.skills.evolve_skill.select_knee_point", return_value=knee_pick
+        ), patch(
+            "evolution.skills.evolve_skill._holdout_evaluate_with_metric",
+            return_value=(0.6, [0.6] * 10),
+        ):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight",
+                 "--gepa-minibatch-size", "7"],
+            )
+            assert captured.get("reflection_minibatch_size") == 7, (
+                f"Expected dspy.GEPA.reflection_minibatch_size=7; got "
+                f"{captured!r}. CLI output: {result.output}"
+            )
+
+    def test_minibatch_exceeding_trainset_aborts_at_startup(self, skill_dir):
+        """--gepa-minibatch-size larger than the trainset triggers the
+        post-dataset guard (sys.exit(1) with an actionable message),
+        not a mid-optimization assertion inside EpochShuffledBatchSampler."""
+        from evolution.core.saturation_check import SaturationReport
+        healthy = SaturationReport(
+            band="healthy", holdout_score=0.6, holdout_n=10,
+            holdout_per_example=[0.6] * 10, suggestions=[], thresholds={},
+        )
+        # _fake_skill_dataset() returns train=30 — so 1000 exceeds it.
+        fake_builder = MagicMock()
+        fake_builder.generate.return_value = _fake_skill_dataset()
+        gepa_mock = MagicMock()
+        with patch(
+            "evolution.skills.evolve_skill.SyntheticDatasetBuilder", return_value=fake_builder
+        ), patch(
+            "evolution.skills.evolve_skill.saturation_preflight", return_value=healthy
+        ), patch(
+            "evolution.skills.evolve_skill._preflight_lm_credentials"
+        ), patch("evolution.skills.evolve_skill.dspy.GEPA", gepa_mock):
+            runner = CliRunner()
+            result = runner.invoke(
+                evolve_skill_main,
+                ["--skill", "demo-skill", "--skill-source-dir", str(skill_dir),
+                 "--iterations", "1", "--no-preflight",
+                 "--gepa-minibatch-size", "1000"],
+            )
+            assert result.exit_code == 1, (
+                f"Expected exit 1 from trainset-ceiling guard, got "
+                f"{result.exit_code}. Output: {result.output}"
+            )
+            assert "exceeds trainset size" in result.output
+            gepa_mock.assert_not_called()