orban · orban · Apr 4, 2026
diff --git a/eval-harness/README.md b/eval-harness/README.md
@@ -99,14 +99,24 @@ Results separate fix-only metrics from skill generation (indexing) metrics:
 - Success rate (test passes)
 - Wall clock time
 - Input/output tokens
+- Estimated Claude-reported USD cost for the fix pass
 - Tool calls
 - Lines changed
 - Files touched
 
 **Skill generation metrics** (reported separately):
 - Wall clock time (indexing)
 - Input/output tokens (indexing)
+- Estimated Claude-reported USD cost for context generation
 - Cache hit/miss status
 - Files created
 
-**Delta calculations**: Compare `with_skill` fix metrics vs `without_skill` metrics. Skill generation costs are visible in reports but excluded from performance deltas. This ensures the comparison shows whether the Intent Layer helps with fixing bugs, independent of the one-time indexing overhead.
+**Estimated cost attribution**: Reports carry Claude's reported `total_cost_usd` end-to-end and attribute it by harness component:
+
+- `fix_only`: the Claude edit/fix pass for the task
+- `skill_generation`: the context-generation pass for `flat_llm` or `intent_layer`
+- `total`: the sum of those two estimates
+
+This is an estimate derived from Claude's own run-level USD cost reporting. It is not a per-tool or per-model pricing engine, and it currently splits cost only across harness execution components (`fix_only` vs `skill_generation`).
+
+**Delta calculations**: Compare `with_skill` fix metrics vs `without_skill` metrics. Skill generation costs are visible in reports, but deltas still use fix-only metrics so the comparison stays focused on whether the added context improves bug-fixing rather than on the one-time indexing overhead.
diff --git a/eval-harness/lib/agentbench_runner.py b/eval-harness/lib/agentbench_runner.py
@@ -295,13 +295,13 @@ def build_prompt(
 
 # -- Context generation helpers (used by run_single Step 3) --
 
-# Type alias: (workspace, instance, workspaces_dir, model) -> (input_tokens, output_tokens)
-_GenerateFn = Callable[[Path, AgentbenchInstance, Path, str], tuple[int, int]]
+# Type alias: (workspace, instance, workspaces_dir, model) -> (input_tokens, output_tokens, cost_usd)
+_GenerateFn = Callable[[Path, AgentbenchInstance, Path, str], tuple[int, int, float]]
 
 
 def _generate_flat_context(
     workspace: Path, instance: AgentbenchInstance, workspaces_dir: Path, model: str,
-) -> tuple[int, int]:
+) -> tuple[int, int, float]:
     """Generate flat CLAUDE.md via Claude, dual-write to AGENTS.md."""
     from lib.prompt_builder import build_flat_generation_prompt
     log_dir = workspaces_dir.parent / "logs"
@@ -316,19 +316,19 @@ def _generate_flat_context(
     agents_md = workspace / "AGENTS.md"
     if claude_md.exists() and not agents_md.exists():
         shutil.copy2(claude_md, agents_md)
-    return result.input_tokens, result.output_tokens
+    return result.input_tokens, result.output_tokens, result.cost_usd
 
 
 def _generate_il_context(
     workspace: Path, plugin_root: str, model: str,
-) -> tuple[int, int]:
+) -> tuple[int, int, float]:
     """Generate intent-layer context via the skill prompt."""
     from lib.prompt_builder import build_skill_generation_prompt
     result = run_claude(
         str(workspace), build_skill_generation_prompt(plugin_root),
         timeout=600, model=model,
     )
-    return result.input_tokens, result.output_tokens
+    return result.input_tokens, result.output_tokens, result.cost_usd
 
 
 def _inject_cached_context(
@@ -344,6 +344,7 @@ def _inject_cached_context(
     gen_start = time.time()
     gen_input_tokens = 0
     gen_output_tokens = 0
+    gen_cost_usd = 0.0
     cache_hit = False
 
     if index_cache:
@@ -354,14 +355,15 @@ def _inject_cached_context(
             cache_hit = True
 
     if not cache_hit:
-        gen_input_tokens, gen_output_tokens = generate_fn(
+        gen_input_tokens, gen_output_tokens, gen_cost_usd = generate_fn(
             workspace, instance, workspaces_dir, model,
         )
 
     return SkillGenerationMetrics(
         wall_clock_seconds=time.time() - gen_start,
         input_tokens=gen_input_tokens,
         output_tokens=gen_output_tokens,
+        cost_usd=gen_cost_usd,
         cache_hit=cache_hit,
     )
 
@@ -464,6 +466,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
             test_output="", wall_clock_seconds=time.time() - start,
             input_tokens=kwargs.get("input_tokens", 0),
             output_tokens=kwargs.get("output_tokens", 0),
+            cost_usd=kwargs.get("cost_usd", 0.0),
             tool_calls=kwargs.get("tool_calls", 0),
             lines_changed=0, files_touched=[], rep=rep,
             error=error,
@@ -511,6 +514,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
             wall_clock_seconds=time.time() - human_start,
             input_tokens=0,
             output_tokens=0,
+            cost_usd=0.0,
             cache_hit=False,
             files_created=files,
         )
@@ -629,6 +633,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
                 skill_generation=skill_metrics,
                 input_tokens=claude_result.input_tokens,
                 output_tokens=claude_result.output_tokens,
+                cost_usd=claude_result.cost_usd,
                 tool_calls=claude_result.tool_calls,
                 exit_code=claude_result.exit_code,
                 is_timeout=True,
@@ -640,6 +645,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
                 skill_generation=skill_metrics,
                 input_tokens=claude_result.input_tokens,
                 output_tokens=claude_result.output_tokens,
+                cost_usd=claude_result.cost_usd,
                 exit_code=claude_result.exit_code,
             )
 
@@ -665,6 +671,7 @@ def _fail(error: str, **kwargs) -> TaskResult:
         wall_clock_seconds=elapsed,
         input_tokens=claude_result.input_tokens,
         output_tokens=claude_result.output_tokens,
+        cost_usd=claude_result.cost_usd,
         tool_calls=claude_result.tool_calls,
         lines_changed=diff.lines_changed,
         files_touched=diff.files,

diff --git a/eval-harness/lib/cli.py b/eval-harness/lib/cli.py
@@ -1,6 +1,7 @@
 # lib/cli.py
 from __future__ import annotations
 import json
+import statistics
 import shutil
 import sys
 import tempfile
@@ -200,7 +201,13 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
     # Discover conditions dynamically from the merged data
     conditions_present = Reporter._discover_conditions(merged_results)
     cond_stats: dict[str, dict] = {
-        c: {"successes": 0, "total": 0, "assigned": 0}
+        c: {
+            "successes": 0,
+            "total": 0,
+            "assigned": 0,
+            "fix_costs": [],
+            "skill_costs": [],
+        }
         for c in conditions_present
     }
     infra_errors = 0
@@ -221,6 +228,12 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
                 cond_stats[cond_key]["successes"] += successes
                 cond_stats[cond_key]["total"] += valid
                 cond_stats[cond_key]["assigned"] += total_runs
+                cost_breakdown = cond_data.get("cost_breakdown", {})
+                if valid:
+                    median_fix = cost_breakdown.get("fix_only_usd", 0.0)
+                    median_skill = cost_breakdown.get("skill_generation_usd", 0.0)
+                    cond_stats[cond_key]["fix_costs"].append(median_fix)
+                    cond_stats[cond_key]["skill_costs"].append(median_skill)
             else:
                 cond_stats[cond_key]["assigned"] += 1
                 if _is_infra_error_dict(cond_data):
@@ -229,6 +242,13 @@ def _recompute_summary(merged_results: list[dict]) -> dict:
                     cond_stats[cond_key]["total"] += 1
                     if cond_data.get("success") is True:
                         cond_stats[cond_key]["successes"] += 1
+                    cost_breakdown = cond_data.get("cost_breakdown", {})
+                    cond_stats[cond_key]["fix_costs"].append(
+                        cost_breakdown.get("fix_only_usd", cond_data.get("cost_usd", 0.0))
+                    )
+                    cond_stats[cond_key]["skill_costs"].append(
+                        cost_breakdown.get("skill_generation_usd", 0.0)
+                    )
 
     def rate(stats):
         if stats["total"] == 0:
@@ -240,14 +260,44 @@ def itt_rate(stats):
             return 0
         return round(stats["successes"] / stats["assigned"], 2)
 
+    def median_cost(values: list[float]) -> float:
+        if not values:
+            return 0.0
+        return round(statistics.median(values), 6)
+
     summary: dict = {
         "total_tasks": len(merged_results),
         "infrastructure_errors": infra_errors,
         "resumed_from": None,  # Filled in by caller
     }
+    summary["cost_attribution"] = {"by_condition": {}, "overall": {}}
+    overall_fix_cost = 0.0
+    overall_skill_cost = 0.0
     for label in conditions_present:
         summary[f"{label}_success_rate"] = rate(cond_stats[label])
         summary[f"{label}_itt_rate"] = itt_rate(cond_stats[label])
+        summary[f"{label}_median_cost_usd"] = median_cost(cond_stats[label]["fix_costs"])
+        total_fix = round(sum(cond_stats[label]["fix_costs"]), 6)
+        total_skill = round(sum(cond_stats[label]["skill_costs"]), 6)
+        summary["cost_attribution"]["by_condition"][label] = {
+            "median_fix_only_usd": median_cost(cond_stats[label]["fix_costs"]),
+            "median_skill_generation_usd": median_cost(cond_stats[label]["skill_costs"]),
+            "median_total_usd": round(
+                median_cost(cond_stats[label]["fix_costs"])
+                + median_cost(cond_stats[label]["skill_costs"]),
+                6,
+            ),
+            "total_fix_only_usd": total_fix,
+            "total_skill_generation_usd": total_skill,
+            "total_usd": round(total_fix + total_skill, 6),
+        }
+        overall_fix_cost += total_fix
+        overall_skill_cost += total_skill
+    summary["cost_attribution"]["overall"] = {
+        "total_fix_only_usd": round(overall_fix_cost, 6),
+        "total_skill_generation_usd": round(overall_skill_cost, 6),
+        "total_usd": round(overall_fix_cost + overall_skill_cost, 6),
+    }
 
     # Add Wilson Score CIs when multi-run data is present
     if has_multi_run: