3 changes: 3 additions & 0 deletions .gitignore
@@ -22,3 +22,6 @@ node_modules
# Package build artifacts
*.egg-info/
*.backup

# Claude Code local session state
.claude/scheduled_tasks.lock
27 changes: 27 additions & 0 deletions CLAUDE.md
@@ -66,6 +66,33 @@ uv run ruff format
The service processes compiler output through a pipeline: input validation → smart assembly filtering → Claude API
call → response with metrics. See `claude_explain.md` for detailed architecture documentation.

## Anthropic API gotchas

- **`max_tokens` includes thinking tokens.** When a prompt YAML sets `model.thinking: {type: adaptive}` (or
`{type: enabled, budget_tokens: N}`), thinking counts against `max_tokens`. The production value `1536` silently
starves the visible text output on complex cases when thinking is on. `Prompt.__init__` now refuses to load a
thinking-enabled config with `max_tokens < 4096`; 4096 is the floor (8192 worked in past experiments).
- **The reviewer model rejects `temperature`.** Opus 4.7 deprecated the parameter, so `prompt_testing/reviewer.py`
omits it. The Sonnet explainer still accepts `temperature`. If you swap the reviewer to a model that requires
it, restore the param.
- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
`--reviewer-thinking adaptive` / `--thinking adaptive`. Thinking catches factual errors the no-think reviewer misses
but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
explicit latency/quality decision.
- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
`app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
"text"`. Preserve that pattern for any new code that consumes responses. The API may also return
`redacted_thinking` blocks (encrypted reasoning when safety filters trip); the same filter excludes them
correctly, but be aware "no text block" can mean either max_tokens starvation *or* a redacted-thinking-only
response — the error message is the same.
- **Empty responses are not 500s.** When the model returns no text block, `app/explain.py` returns
`ExplainResponse(status="error")` with `usage` populated and emits `ClaudeExplainEmptyResponse`. The cache
layer skips storing error responses so retries hit the API. Don't change this to raise — the structured error
is what the CE frontend can render.
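
A minimal sketch of the call shape these gotchas imply, mirroring the pattern in `app/explain.py` (the
`call_explainer` name is illustrative; the field names match what `Prompt.generate_messages` returns):

```python
from typing import Any


async def call_explainer(client, prompt_data: dict[str, Any]) -> str | None:
    kwargs: dict[str, Any] = {
        "model": prompt_data["model"],
        "max_tokens": prompt_data["max_tokens"],  # >= 4096 whenever thinking is set
        "system": prompt_data["system"],
        "messages": prompt_data["messages"],
    }
    if prompt_data.get("thinking"):
        # Extended thinking and temperature are mutually exclusive: send one or the other.
        kwargs["thinking"] = prompt_data["thinking"]
    else:
        kwargs["temperature"] = prompt_data["temperature"]

    message = await client.messages.create(**kwargs)

    # Thinking / redacted_thinking blocks precede the text block; take the last text block.
    text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
    if not text_blocks:
        # Either thinking exhausted max_tokens or the response was redacted-thinking-only;
        # callers turn this into a structured error response, not an exception.
        return None
    return text_blocks[-1].text.strip()
```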

## Code Style Guidelines

- Prefer using modern Python 3.13+ type syntax. Good: `a: list[str] | None`. Bad: `a: Optional[List[str]]`
1 change: 1 addition & 0 deletions app/cache.py
@@ -142,6 +142,7 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"temperature": prompt_data["temperature"],
"thinking": prompt_data.get("thinking"),
"system": prompt_data["system"],
"messages": prompt_data["messages"],
# Include a hash of the prompt config to invalidate cache when prompts change
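
A sketch of why the new `"thinking"` entry matters for cache hits, assuming (as the hunk above suggests) that the
key is derived by hashing this dict; the exact serialization lives elsewhere in `app/cache.py`, and the
`sketch_cache_key` name is hypothetical:

```python
import hashlib
import json
from typing import Any


def sketch_cache_key(prompt_data: dict[str, Any]) -> str:
    key_material = {
        "model": prompt_data["model"],
        "max_tokens": prompt_data["max_tokens"],
        "temperature": prompt_data["temperature"],
        # Including the thinking config in the key means toggling thinking on a
        # prompt produces a different key instead of serving a stale cached response.
        "thinking": prompt_data.get("thinking"),
        "system": prompt_data["system"],
        "messages": prompt_data["messages"],
    }
    return hashlib.sha256(json.dumps(key_material, sort_keys=True).encode()).hexdigest()
```
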
68 changes: 54 additions & 14 deletions app/explain.py
@@ -1,4 +1,5 @@
import logging
from typing import Any

from anthropic import AsyncAnthropic

@@ -59,8 +60,10 @@ async def process_request(
# Cache miss or no cache - proceed with Anthropic API call
response = await _call_anthropic_api(body, client, prompt, metrics_provider)

# Cache the response (if cache provider is available)
if cache_provider is not None:
# Cache the response (if cache provider is available). Don't cache
# error responses — they consume real tokens but produce no useful
# content, and we want a retry to hit the API rather than the cache.
if cache_provider is not None and response.status == "success":
await cache_response(body, prompt, response, cache_provider)
metrics_provider.put_metric("ClaudeExplainCacheMiss", 1)

@@ -90,24 +93,61 @@ async def _call_anthropic_api(
LOGGER.debug("=== END PROMPT DEBUG ===")

# Call Claude API
LOGGER.info("Using Anthropic client with model: %s", {prompt_data["model"]})

message = await client.messages.create(
model=prompt_data["model"],
max_tokens=prompt_data["max_tokens"],
temperature=prompt_data["temperature"],
system=prompt_data["system"],
messages=prompt_data["messages"],
)

# Get explanation and strip leading/trailing whitespace
explanation = message.content[0].text.strip()
LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])

api_kwargs: dict[str, Any] = {
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"system": prompt_data["system"],
"messages": prompt_data["messages"],
}
if prompt_data.get("thinking"):
# Extended thinking: API requires temperature to be unset.
api_kwargs["thinking"] = prompt_data["thinking"]
else:
api_kwargs["temperature"] = prompt_data["temperature"]

message = await client.messages.create(**api_kwargs)

# Extract usage information
input_tokens = message.usage.input_tokens
output_tokens = message.usage.output_tokens
total_tokens = input_tokens + output_tokens

# Pick the last text block — when thinking is enabled the response
# contains thinking blocks before the final text block.
text_blocks = [c for c in message.content if getattr(c, "type", None) == "text"]
explanation = text_blocks[-1].text.strip() if text_blocks else ""
if not explanation:
# Can happen if extended thinking exhausts max_tokens before any
# text block is emitted. Surface the failure to the caller with
# token usage populated, and emit a metric so this is visible on
# dashboards rather than buried in a generic 500.
message_text = (
f"Claude returned no text content "
f"(stop_reason={message.stop_reason}, in={input_tokens}, out={output_tokens}). "
f"If thinking is enabled, max_tokens may be too low."
)
Comment on lines +121 to +130
Contributor Author

Fixed in 01d80c8 (different shape than your suggestion): instead of raising and converting to a 500, _call_anthropic_api now returns ExplainResponse(status="error") with usage populated, emits ClaudeExplainEmptyResponse, and process_request skips caching error responses. Token usage is included in the error message so it's diagnosable from logs.

LOGGER.warning(message_text)
metrics_provider.set_property("language", body.language)
metrics_provider.set_property("compiler", body.compiler)
metrics_provider.set_property("instructionSet", body.instructionSet or "unknown")
metrics_provider.set_property("cached", "false")
metrics_provider.put_metric("ClaudeExplainRequest", 1)
metrics_provider.put_metric("ClaudeExplainEmptyResponse", 1)
metrics_provider.put_metric("ClaudeExplainInputTokens", input_tokens)
metrics_provider.put_metric("ClaudeExplainOutputTokens", output_tokens)
return ExplainResponse(
status="error",
message=message_text,
model=prompt_data["model"],
usage=TokenUsage(
inputTokens=input_tokens,
outputTokens=output_tokens,
totalTokens=total_tokens,
),
)

# Calculate costs based on model
cost_per_input_token, cost_per_output_token = get_model_cost(prompt_data["model"])
input_cost = input_tokens * cost_per_input_token
18 changes: 18 additions & 0 deletions app/prompt.py
@@ -15,6 +15,11 @@
# Constants from explain.py that are needed for data preparation
MAX_ASSEMBLY_LINES = 300 # Maximum number of assembly lines to process

# Minimum max_tokens that's safe to pair with extended thinking. Below this,
# adaptive thinking can consume the whole budget on complex inputs and leave
# nothing for the visible response.
MIN_MAX_TOKENS_WITH_THINKING = 4096


class Prompt:
"""Manages prompt templates and generates messages for Claude API."""
@@ -36,6 +41,18 @@ def __init__(self, config: dict[str, Any] | Path):
self.model = self.config["model"]["name"]
self.max_tokens = self.config["model"]["max_tokens"]
self.temperature = self.config["model"].get("temperature", 0.0)
# Optional extended-thinking config, e.g. {"type": "adaptive"} or
# {"type": "enabled", "budget_tokens": 2000}. When set, callers
# should drop `temperature` (the API requires it to be unset/1).
self.thinking = self.config["model"].get("thinking")
if self.thinking and self.max_tokens < MIN_MAX_TOKENS_WITH_THINKING:
# Adaptive thinking happily consumes the entire token budget on
# complex inputs and leaves nothing for the visible text block.
# Fail loudly here rather than silently returning empty responses.
raise ValueError(
f"max_tokens={self.max_tokens} is too low for thinking; "
f"use at least {MIN_MAX_TOKENS_WITH_THINKING} when model.thinking is set."
)

# Extract prompt templates
self.system_prompt_template = self.config["system_prompt"]
@@ -275,6 +292,7 @@ def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
"model": self.model,
"max_tokens": self.max_tokens,
"temperature": self.temperature,
"thinking": self.thinking,
"system": system_prompt,
"messages": messages,
"structured_data": structured_data, # Include for reference
88 changes: 88 additions & 0 deletions app/test_explain.py
@@ -57,6 +57,7 @@ def mock_anthropic_client():
mock_client = MagicMock()
mock_message = MagicMock()
mock_content = MagicMock()
mock_content.type = "text"
mock_content.text = "This assembly code implements a simple square function..."
mock_message.content = [mock_content]

Comment on lines 59 to 63
Contributor Author

Added in acdb10d: test_picks_last_text_block_with_thinking (thinking block before text, asserts the text block is selected) and test_raises_when_no_text_block (thinking-only response raises). 94 tests passing.

@@ -149,6 +150,93 @@ async def test_process_request_success(self, sample_request, mock_anthropic_clie
assert structured_data["compiler"] == "g++"
assert structured_data["sourceCode"] == "int square(int x) {\n return x * x;\n}"

@pytest.mark.asyncio
async def test_picks_last_text_block_with_thinking(self, sample_request, noop_metrics):
"""When thinking is enabled, the response has thinking blocks before the
final text block. The handler should select the text block, not the
thinking block."""
thinking_block = MagicMock()
thinking_block.type = "thinking"
thinking_block.thinking = "Let me trace through the imul..."

text_block = MagicMock()
text_block.type = "text"
text_block.text = "Final explanation here."

mock_message = MagicMock()
mock_message.content = [thinking_block, text_block]
mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
mock_message.stop_reason = "end_turn"

mock_client = MagicMock()
mock_client.messages.create = AsyncMock(return_value=mock_message)

test_prompt = Prompt(Path("app/prompt.yaml"))
response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)

assert response.status == "success"
assert response.explanation == "Final explanation here."

@pytest.mark.asyncio
async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
"""A response with no text block (e.g. thinking exhausted max_tokens)
should return status='error' with token usage populated, not raise.
Production must surface a structured error rather than a generic 500."""
thinking_block = MagicMock()
thinking_block.type = "thinking"
thinking_block.thinking = "..."

mock_message = MagicMock()
mock_message.content = [thinking_block]
mock_message.usage = MagicMock(input_tokens=100, output_tokens=50)
mock_message.stop_reason = "max_tokens"

mock_client = MagicMock()
mock_client.messages.create = AsyncMock(return_value=mock_message)

test_prompt = Prompt(Path("app/prompt.yaml"))
response = await process_request(sample_request, mock_client, test_prompt, noop_metrics)

assert response.status == "error"
assert response.explanation is None
assert "no text content" in response.message
# Real tokens were spent — surface them so callers can see the cost.
assert response.usage is not None
assert response.usage.inputTokens == 100
assert response.usage.outputTokens == 50


class TestPromptValidation:
"""Validation rules enforced at Prompt construction."""

def test_thinking_requires_min_max_tokens(self):
"""A prompt YAML that enables thinking must also bump max_tokens."""
with pytest.raises(ValueError, match="too low for thinking"):
Prompt(
{
"model": {"name": "test", "max_tokens": 1536, "thinking": {"type": "adaptive"}},
"system_prompt": "",
"user_prompt": "",
"assistant_prefill": "",
"audience_levels": {},
"explanation_types": {},
}
)

def test_thinking_with_sufficient_max_tokens_is_ok(self):
"""At/above the floor (4096), thinking-enabled prompts load fine."""
prompt = Prompt(
{
"model": {"name": "test", "max_tokens": 4096, "thinking": {"type": "adaptive"}},
"system_prompt": "",
"user_prompt": "",
"assistant_prefill": "",
"audience_levels": {},
"explanation_types": {},
}
)
assert prompt.thinking == {"type": "adaptive"}


class TestSelectImportantAssembly:
"""Test the select_important_assembly function."""