11 changes: 7 additions & 4 deletions CLAUDE.md
@@ -78,10 +78,13 @@ call → response with metrics. See `claude_explain.md` for detailed architectur
- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
`--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses
but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
explicit latency/quality decision.
- **Production explainer thinking is opt-in per request.** Adaptive thinking on Sonnet 4.6 measurably improves
factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~10s+ latency on
small/medium queries and can push large queries past the **30s Lambda + API Gateway v2 timeout** entirely (no
raising that — HTTP API has a 30s ceiling). Callers opt in by sending `useThinking: true` on the request; the
default (no field, or `false`) preserves current latency. Cache keys split on the flag, so on/off requests
cache independently. If we ever want default-on, we need either a smaller fixed thinking budget *or* an async
response architecture (Lambda Function URL with response streaming, SQS poll, etc.).
- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
`app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
"text"`. Preserve that pattern for any new code that consumes responses. The API may also return
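A minimal sketch of the last-text-block pattern that bullet describes (illustrative only, not the exact code in `app/explain.py`):

```python
def last_text_block(content: list) -> str | None:
    """Pick the last text block from a possibly multi-block response.

    Thinking blocks precede the text block when extended thinking is on;
    only blocks whose type is "text" carry the explanation.
    """
    texts = [c for c in content if getattr(c, "type", None) == "text"]
    return texts[-1].text if texts else None
```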
3 changes: 2 additions & 1 deletion README.md
@@ -124,7 +124,8 @@ The API uses **camelCase** for all field names to maintain consistency with Java
"labels": []
}
],
"bypassCache": false // Optional: set to true to skip cache reads
"bypassCache": false, // Optional: set to true to skip cache reads
"useThinking": false // Optional: enable extended thinking for higher accuracy at cost of latency
}
```

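A hypothetical caller sketch for the new flag (the base URL, endpoint path, and use of `requests` are placeholders, not taken from this repo):

```python
import requests

payload = {
    "language": "c++",
    "compiler": "g++",
    "code": "int square(int x) { return x * x; }",
    "asm": [{"text": "square(int):", "source": None, "labels": []}],
    "useThinking": True,  # opt in: better accuracy at the cost of latency
}
# Placeholder URL; substitute the real endpoint.
response = requests.post("https://example.invalid/explain", json=payload, timeout=60)
print(response.json())
```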
15 changes: 4 additions & 11 deletions app/cache.py
@@ -134,18 +134,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
Returns:
A SHA-256 hash string to use as cache key
"""
# Generate the full message data that would be sent to Anthropic
prompt_data = prompt.generate_messages(request)

# Create a deterministic representation of all cache-affecting data
# The payload IS the cache-affecting data — the same dict that gets
# spread into messages.create. Add prompt_version on top so changes
# to the YAML invalidate cached responses.
cache_data = {
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"temperature": prompt_data["temperature"],
"thinking": prompt_data.get("thinking"),
"system": prompt_data["system"],
"messages": prompt_data["messages"],
# Include a hash of the prompt config to invalidate cache when prompts change
**prompt.build_api_payload(request),
"prompt_version": _get_prompt_config_hash(prompt.config),
}

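The hashing itself is outside this hunk; a rough sketch of the idea the new comment describes (the hash algorithm and JSON serialization are assumptions, not taken from the diff):

```python
import hashlib
import json


def cache_key_sketch(api_payload: dict, prompt_config_hash: str) -> str:
    """Hash the exact messages.create payload plus a prompt-config hash,
    so any change to the payload or the prompt YAML invalidates old entries."""
    cache_data = {**api_payload, "prompt_version": prompt_config_hash}
    canonical = json.dumps(cache_data, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```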
29 changes: 10 additions & 19 deletions app/explain.py
@@ -1,5 +1,4 @@
import logging
from typing import Any

from anthropic import AsyncAnthropic

@@ -80,8 +79,7 @@ async def _call_anthropic_api(

This is the original process_request logic, extracted for clarity.
"""
# Generate messages using the Prompt instance
prompt_data = prompt.generate_messages(body)
prompt_data = prompt.build_api_payload(body)

# Debug logging for prompts
LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===")
@@ -92,22 +90,15 @@ async def _call_anthropic_api(
LOGGER.debug(message)
LOGGER.debug("=== END PROMPT DEBUG ===")

# Call Claude API
LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])

api_kwargs: dict[str, Any] = {
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"system": prompt_data["system"],
"messages": prompt_data["messages"],
}
if prompt_data.get("thinking"):
# Extended thinking: API requires temperature to be unset.
api_kwargs["thinking"] = prompt_data["thinking"]
else:
api_kwargs["temperature"] = prompt_data["temperature"]

message = await client.messages.create(**api_kwargs)
# Call Claude API. `prompt_data` is already exactly the kwargs
# `messages.create` expects — `build_api_payload` resolved the
# thinking-vs-temperature exclusivity for us.
LOGGER.info(
"Using Anthropic client with model: %s (thinking=%s)",
prompt_data["model"],
bool(prompt_data.get("thinking")),
)
message = await client.messages.create(**prompt_data)

# Extract usage information
input_tokens = message.usage.input_tokens
9 changes: 9 additions & 0 deletions app/explain_api.py
@@ -57,6 +57,15 @@ class ExplainRequest(BaseModel):
default=ExplanationType.ASSEMBLY, description="Type of explanation to generate"
)
bypassCache: bool = Field(default=False, description="If true, skip reading from cache but still write to cache")
useThinking: bool = Field(
default=False,
description=(
"If true, enable adaptive extended thinking on the explainer. "
"Improves correctness on complex inputs at the cost of significant "
"extra latency (often +10s, sometimes much more on large inputs). "
"May exceed the Lambda 30s timeout on the largest queries."
),
)

@property
def instruction_set_with_default(self) -> str:
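A quick illustration of the Pydantic default (the other required-field values are placeholders): a request body that omits `useThinking` validates to `False`, which is what keeps existing cache keys stable.

```python
from app.explain_api import ExplainRequest

raw = {
    "language": "c++",
    "compiler": "g++",
    "code": "int test() { return 0; }",
    "asm": [{"text": "test:", "source": None}],
}
assert ExplainRequest(**raw).useThinking is False
assert ExplainRequest(**raw, useThinking=True).useThinking is True
```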
30 changes: 30 additions & 0 deletions app/prompt.py
@@ -235,6 +235,36 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]:

return structured_data

def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]:
"""Return the exact kwargs to pass to ``client.messages.create``.

Single source of truth: the API call splats this dict directly,
and the cache-key generator hashes it. Per-request runtime
overrides (currently just ``useThinking``) are resolved here, and
mutually-exclusive fields like ``thinking`` vs ``temperature``
are decided here too — callers don't need to re-derive anything.

Always returns a fresh dict so callers can mutate freely.
"""
base = self.generate_messages(request)
payload: dict[str, Any] = {
"model": base["model"],
"max_tokens": base["max_tokens"],
"system": base["system"],
"messages": base["messages"],
}
# Resolve thinking config: per-request override wins over the YAML.
thinking = {"type": "adaptive"} if request.useThinking else base.get("thinking")
if thinking is not None:
payload["thinking"] = thinking
# Thinking and temperature are mutually exclusive (the API
# rejects temperature when thinking is set). Floor max_tokens
# so adaptive thinking can't starve the visible text.
payload["max_tokens"] = max(payload["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING)
else:
payload["temperature"] = base["temperature"]
return payload

def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
"""Generate the complete message structure for Claude API.

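A usage sketch of the new helper (the `AssemblyItem`/`ExplainRequest` import location is assumed; the YAML path matches the one used in the tests):

```python
from pathlib import Path

from app.explain_api import AssemblyItem, ExplainRequest
from app.prompt import MIN_MAX_TOKENS_WITH_THINKING, Prompt

request = ExplainRequest(
    language="c++",
    compiler="g++",
    code="int test() { return 0; }",
    asm=[AssemblyItem(text="test:", source=None)],
    useThinking=True,
)
payload = Prompt(Path("app/prompt.yaml")).build_api_payload(request)

# Thinking and temperature are mutually exclusive, and max_tokens is
# floored so adaptive thinking cannot starve the visible text.
assert payload["thinking"] == {"type": "adaptive"}
assert "temperature" not in payload
assert payload["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING
```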
30 changes: 30 additions & 0 deletions app/test_cache.py
@@ -265,6 +265,36 @@ def test_generate_cache_key_bypass_cache_ignored(self, test_prompt):
key2 = generate_cache_key(request2, test_prompt)
assert key1 == key2 # bypass_cache shouldn't affect cache key

def test_generate_cache_key_use_thinking_splits(self, test_prompt):
"""useThinking changes the API call payload, so it must change the
cache key. Confirms cache hits don't bleed across thinking-on/off."""
base_kwargs = {
"language": "c++",
"compiler": "g++",
"code": "int test() { return 0; }",
"asm": [AssemblyItem(text="test:", source=None)],
}
request_off = ExplainRequest(**base_kwargs, useThinking=False)
request_on = ExplainRequest(**base_kwargs, useThinking=True)

assert generate_cache_key(request_off, test_prompt) != generate_cache_key(request_on, test_prompt)

def test_generate_cache_key_use_thinking_default_matches_omitted(self, test_prompt):
"""A request with useThinking absent (Pydantic default) must produce
the same key as one with useThinking=False explicit. This is the
invariant that lets us add the field without invalidating the
existing S3 cache."""
base_kwargs = {
"language": "c++",
"compiler": "g++",
"code": "int test() { return 0; }",
"asm": [AssemblyItem(text="test:", source=None)],
}
request_default = ExplainRequest(**base_kwargs) # useThinking absent
request_explicit = ExplainRequest(**base_kwargs, useThinking=False)

assert generate_cache_key(request_default, test_prompt) == generate_cache_key(request_explicit, test_prompt)


class TestCacheHighLevelFunctions:
"""Test the high-level cache functions."""
19 changes: 18 additions & 1 deletion app/test_explain.py
@@ -11,7 +11,7 @@
SourceMapping,
)
from app.metrics import NoopMetricsProvider
from app.prompt import MAX_ASSEMBLY_LINES, Prompt
from app.prompt import MAX_ASSEMBLY_LINES, MIN_MAX_TOKENS_WITH_THINKING, Prompt


@pytest.fixture
@@ -177,6 +177,23 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me
assert response.status == "success"
assert response.explanation == "Final explanation here."

@pytest.mark.asyncio
async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropic_client, noop_metrics):
"""Setting `useThinking=True` on the request must enable adaptive
thinking and bump max_tokens to satisfy the floor, even when the
loaded prompt has no thinking config."""
sample_request.useThinking = True

test_prompt = Prompt(Path("app/prompt.yaml"))
await process_request(sample_request, mock_anthropic_client, test_prompt, noop_metrics)

_args, kwargs = mock_anthropic_client.messages.create.call_args
assert kwargs.get("thinking") == {"type": "adaptive"}
# When thinking is set, temperature must NOT be passed (API rejects it).
assert "temperature" not in kwargs
# max_tokens bumped to at least the documented floor.
assert kwargs["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING

@pytest.mark.asyncio
async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
"""A response with no text block (e.g. thinking exhausted max_tokens)