Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion eval-harness/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,24 @@ Results separate fix-only metrics from skill generation (indexing) metrics:
- Success rate (test passes)
- Wall clock time
- Input/output tokens
- Estimated Claude-reported USD cost for the fix pass
- Tool calls
- Lines changed
- Files touched

**Skill generation metrics** (reported separately):
- Wall clock time (indexing)
- Input/output tokens (indexing)
- Estimated Claude-reported USD cost for context generation
- Cache hit/miss status
- Files created

**Delta calculations**: Compare `with_skill` fix metrics vs `without_skill` metrics. Skill generation costs are visible in reports but excluded from performance deltas. This ensures the comparison shows whether the Intent Layer helps with fixing bugs, independent of the one-time indexing overhead.
**Estimated cost attribution**: Reports now carry Claude's reported `total_cost_usd` end-to-end and attribute it by harness component:

- `fix_only`: the Claude edit/fix pass for the task
- `skill_generation`: the context-generation pass for `flat_llm` or `intent_layer`
- `total`: the sum of those two estimates

This is an estimate derived from Claude's own run-level USD cost reporting. It is not a per-tool or per-model pricing engine, and it currently splits cost only across the two harness execution components (`fix_only` vs `skill_generation`).

**Delta calculations**: Compare `with_skill` fix metrics vs `without_skill` metrics. Skill generation costs are visible in reports, but deltas still use fix-only metrics so the comparison stays focused on whether the added context improves bug-fixing rather than on the one-time indexing overhead.
21 changes: 14 additions & 7 deletions eval-harness/lib/agentbench_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,13 +295,13 @@ def build_prompt(

# -- Context generation helpers (used by run_single Step 3) --

# Type alias: (workspace, instance, workspaces_dir, model) -> (input_tokens, output_tokens)
_GenerateFn = Callable[[Path, AgentbenchInstance, Path, str], tuple[int, int]]
# Type alias: (workspace, instance, workspaces_dir, model) -> (input_tokens, output_tokens, cost_usd)
_GenerateFn = Callable[[Path, AgentbenchInstance, Path, str], tuple[int, int, float]]


def _generate_flat_context(
workspace: Path, instance: AgentbenchInstance, workspaces_dir: Path, model: str,
) -> tuple[int, int]:
) -> tuple[int, int, float]:
"""Generate flat CLAUDE.md via Claude, dual-write to AGENTS.md."""
from lib.prompt_builder import build_flat_generation_prompt
log_dir = workspaces_dir.parent / "logs"
Expand All @@ -316,19 +316,19 @@ def _generate_flat_context(
agents_md = workspace / "AGENTS.md"
if claude_md.exists() and not agents_md.exists():
shutil.copy2(claude_md, agents_md)
return result.input_tokens, result.output_tokens
return result.input_tokens, result.output_tokens, result.cost_usd


def _generate_il_context(
workspace: Path, plugin_root: str, model: str,
) -> tuple[int, int]:
) -> tuple[int, int, float]:
"""Generate intent-layer context via the skill prompt."""
from lib.prompt_builder import build_skill_generation_prompt
result = run_claude(
str(workspace), build_skill_generation_prompt(plugin_root),
timeout=600, model=model,
)
return result.input_tokens, result.output_tokens
return result.input_tokens, result.output_tokens, result.cost_usd


def _inject_cached_context(
Expand All @@ -344,6 +344,7 @@ def _inject_cached_context(
gen_start = time.time()
gen_input_tokens = 0
gen_output_tokens = 0
gen_cost_usd = 0.0
cache_hit = False

if index_cache:
Expand All @@ -354,14 +355,15 @@ def _inject_cached_context(
cache_hit = True

if not cache_hit:
gen_input_tokens, gen_output_tokens = generate_fn(
gen_input_tokens, gen_output_tokens, gen_cost_usd = generate_fn(
workspace, instance, workspaces_dir, model,
)

return SkillGenerationMetrics(
wall_clock_seconds=time.time() - gen_start,
input_tokens=gen_input_tokens,
output_tokens=gen_output_tokens,
cost_usd=gen_cost_usd,
cache_hit=cache_hit,
)

Expand Down Expand Up @@ -464,6 +466,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
test_output="", wall_clock_seconds=time.time() - start,
input_tokens=kwargs.get("input_tokens", 0),
output_tokens=kwargs.get("output_tokens", 0),
cost_usd=kwargs.get("cost_usd", 0.0),
tool_calls=kwargs.get("tool_calls", 0),
lines_changed=0, files_touched=[], rep=rep,
error=error,
Expand Down Expand Up @@ -511,6 +514,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
wall_clock_seconds=time.time() - human_start,
input_tokens=0,
output_tokens=0,
cost_usd=0.0,
cache_hit=False,
files_created=files,
)
Expand Down Expand Up @@ -629,6 +633,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
skill_generation=skill_metrics,
input_tokens=claude_result.input_tokens,
output_tokens=claude_result.output_tokens,
cost_usd=claude_result.cost_usd,
tool_calls=claude_result.tool_calls,
exit_code=claude_result.exit_code,
is_timeout=True,
Expand All @@ -640,6 +645,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
skill_generation=skill_metrics,
input_tokens=claude_result.input_tokens,
output_tokens=claude_result.output_tokens,
cost_usd=claude_result.cost_usd,
exit_code=claude_result.exit_code,
)

Expand All @@ -665,6 +671,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
wall_clock_seconds=elapsed,
input_tokens=claude_result.input_tokens,
output_tokens=claude_result.output_tokens,
cost_usd=claude_result.cost_usd,
tool_calls=claude_result.tool_calls,
lines_changed=diff.lines_changed,
files_touched=diff.files,
Expand Down
52 changes: 51 additions & 1 deletion eval-harness/lib/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# lib/cli.py
from __future__ import annotations
import json
import statistics
import shutil
import sys
import tempfile
Expand Down Expand Up @@ -200,7 +201,13 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
# Discover conditions dynamically from the merged data
conditions_present = Reporter._discover_conditions(merged_results)
cond_stats: dict[str, dict] = {
c: {"successes": 0, "total": 0, "assigned": 0}
c: {
"successes": 0,
"total": 0,
"assigned": 0,
"fix_costs": [],
"skill_costs": [],
}
for c in conditions_present
}
infra_errors = 0
Expand All @@ -221,6 +228,12 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
cond_stats[cond_key]["successes"] += successes
cond_stats[cond_key]["total"] += valid
cond_stats[cond_key]["assigned"] += total_runs
cost_breakdown = cond_data.get("cost_breakdown", {})
if valid:
median_fix = cost_breakdown.get("fix_only_usd", 0.0)
median_skill = cost_breakdown.get("skill_generation_usd", 0.0)
cond_stats[cond_key]["fix_costs"].append(median_fix)
cond_stats[cond_key]["skill_costs"].append(median_skill)
else:
cond_stats[cond_key]["assigned"] += 1
if _is_infra_error_dict(cond_data):
Expand All @@ -229,6 +242,13 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
cond_stats[cond_key]["total"] += 1
if cond_data.get("success") is True:
cond_stats[cond_key]["successes"] += 1
cost_breakdown = cond_data.get("cost_breakdown", {})
cond_stats[cond_key]["fix_costs"].append(
cost_breakdown.get("fix_only_usd", cond_data.get("cost_usd", 0.0))
)
cond_stats[cond_key]["skill_costs"].append(
cost_breakdown.get("skill_generation_usd", 0.0)
)

def rate(stats):
if stats["total"] == 0:
Expand All @@ -240,14 +260,44 @@ def itt_rate(stats):
return 0
return round(stats["successes"] / stats["assigned"], 2)

def median_cost(values: list[float]) -> float:
    """Return the median of *values* rounded to 6 decimal places.

    An empty list yields ``0.0`` instead of raising, so conditions that
    recorded no cost samples still get a numeric summary entry.
    """
    return round(statistics.median(values), 6) if values else 0.0

summary: dict = {
"total_tasks": len(merged_results),
"infrastructure_errors": infra_errors,
"resumed_from": None, # Filled in by caller
}
summary["cost_attribution"] = {"by_condition": {}, "overall": {}}
overall_fix_cost = 0.0
overall_skill_cost = 0.0
for label in conditions_present:
summary[f"{label}_success_rate"] = rate(cond_stats[label])
summary[f"{label}_itt_rate"] = itt_rate(cond_stats[label])
summary[f"{label}_median_cost_usd"] = median_cost(cond_stats[label]["fix_costs"])
total_fix = round(sum(cond_stats[label]["fix_costs"]), 6)
total_skill = round(sum(cond_stats[label]["skill_costs"]), 6)
summary["cost_attribution"]["by_condition"][label] = {
"median_fix_only_usd": median_cost(cond_stats[label]["fix_costs"]),
"median_skill_generation_usd": median_cost(cond_stats[label]["skill_costs"]),
"median_total_usd": round(
median_cost(cond_stats[label]["fix_costs"])
+ median_cost(cond_stats[label]["skill_costs"]),
6,
),
"total_fix_only_usd": total_fix,
"total_skill_generation_usd": total_skill,
"total_usd": round(total_fix + total_skill, 6),
}
overall_fix_cost += total_fix
overall_skill_cost += total_skill
summary["cost_attribution"]["overall"] = {
"total_fix_only_usd": round(overall_fix_cost, 6),
"total_skill_generation_usd": round(overall_skill_cost, 6),
"total_usd": round(overall_fix_cost + overall_skill_cost, 6),
}

# Add Wilson Score CIs when multi-run data is present
if has_multi_run:
Expand Down
Loading