Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions prompt_testing/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import click
from dotenv import load_dotenv

from app.model_costs import get_model_cost
from prompt_testing.ce_api import CompilerExplorerClient
from prompt_testing.enricher import TestCaseEnricher
from prompt_testing.file_utils import load_all_test_cases
Expand All @@ -43,7 +44,7 @@ def cli(ctx, project_root):
@click.option("--output", help="Output filename")
@click.option("--max-concurrent", type=int, default=5)
@click.option("--review", is_flag=True, help="Also run Opus correctness review on results")
@click.option("--review-model", default="claude-opus-4-6", help="Model for correctness review")
@click.option("--review-model", default="claude-opus-4-7", help="Model for correctness review")
@click.pass_context
def run(ctx, prompt, cases, categories, output, max_concurrent, review, review_model):
"""Run test cases and save results for review."""
Expand Down Expand Up @@ -208,6 +209,7 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:

review_cost = 0.0
errors_found = 0
cost_per_input_token, cost_per_output_token = get_model_cost(model)

for i, result in enumerate(successful, 1):
case = cases_by_id.get(result["case_id"])
Expand All @@ -221,8 +223,10 @@ async def _run_reviews(project_root: Path, results: dict, model: str) -> dict:
n_issues = len(review.get("issues", []))
if not review.get("correct"):
errors_found += 1
# Opus pricing: $15/M in, $75/M out
cost = review.get("reviewer_input_tokens", 0) * 15 / 1e6 + review.get("reviewer_output_tokens", 0) * 75 / 1e6
cost = (
review.get("reviewer_input_tokens", 0) * cost_per_input_token
+ review.get("reviewer_output_tokens", 0) * cost_per_output_token
)
review_cost += cost
click.echo(f" [{i}/{len(successful)}] {status} {result['case_id']} ({n_issues} issues, ${cost:.4f})")

Expand Down Expand Up @@ -256,7 +260,7 @@ def _print_review_summary(results: dict) -> None:

@cli.command()
@click.argument("results_file")
@click.option("--model", default="claude-opus-4-6", help="Reviewer model")
@click.option("--model", default="claude-opus-4-7", help="Reviewer model")
@click.pass_context
def review(ctx, results_file, model):
"""Run Opus correctness review on existing results."""
Expand Down
4 changes: 2 additions & 2 deletions prompt_testing/reviewer.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
class CorrectnessReviewer:
"""Reviews explanations for factual correctness using a powerful model."""

def __init__(self, model: str = "claude-opus-4-6"):
def __init__(self, model: str = "claude-opus-4-7"):
self.model = model
self.client = AsyncAnthropic()

Expand Down Expand Up @@ -95,10 +95,10 @@ async def review(
explanation=explanation,
)

# Opus 4.7+ rejects `temperature`; rely on the model's own default.
msg = await self.client.messages.create(
model=self.model,
max_tokens=2048,
temperature=0.0,
system=REVIEW_SYSTEM_PROMPT,
messages=[{"role": "user", "content": user_prompt}],
)
Expand Down
Loading