11 changes: 7 additions & 4 deletions CLAUDE.md
@@ -78,10 +78,13 @@ call → response with metrics. See `claude_explain.md` for detailed architectur
- **Reviewer thinking is on by default.** `prompt-test run --review` and `prompt-test review` default to
`--reviewer-thinking adaptive` / `--thinking adaptive`. It catches factual errors the no-think reviewer misses
but adds ~70% to review cost. Pass `off` to compare runs or save money on large batches.
- **Production explainer thinking is intentionally off.** Adaptive thinking on Sonnet 4.6 measurably improves
factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~11s end-to-end
latency, which is too much for the interactive endpoint. Don't enable it in `app/prompt.yaml` without an
explicit latency/quality decision.
- **Production explainer thinking is opt-in per request.** Adaptive thinking on Sonnet 4.6 measurably improves
factual accuracy (e.g. eliminates the recurring `imul eax, edi, edi` invention) but adds ~10s+ latency on
small/medium queries and can push large queries past the **30s Lambda + API Gateway v2 timeout** entirely (no
raising that — HTTP API has a 30s ceiling). Callers opt in by sending `useThinking: true` on the request; the
default (no field, or `false`) preserves current latency. Cache keys split on the flag, so on/off requests
cache independently. If we ever want default-on, we need either a smaller fixed thinking budget *or* an async
response architecture (Lambda Function URL with response streaming, SQS poll, etc.).
- **Multi-block responses.** When thinking is enabled the API returns thinking blocks before the text block.
`app/explain.py` and `prompt_testing/runner.py` both pick the last text block via `getattr(c, "type", None) ==
"text"`. Preserve that pattern for any new code that consumes responses. The API may also return
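A minimal sketch of the last-text-block pattern that bullet describes (illustrative only, not the exact code in `app/explain.py`):

```python
def last_text_block(content: list) -> str | None:
    """Pick the last text block from a possibly multi-block response.

    Thinking blocks precede the text block when extended thinking is on;
    only blocks whose type is "text" carry the explanation.
    """
    texts = [c for c in content if getattr(c, "type", None) == "text"]
    return texts[-1].text if texts else None
```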
3 changes: 2 additions & 1 deletion README.md
@@ -124,7 +124,8 @@ The API uses **camelCase** for all field names to maintain consistency with Java
"labels": []
}
],
"bypassCache": false // Optional: set to true to skip cache reads
"bypassCache": false, // Optional: set to true to skip cache reads
"useThinking": false // Optional: enable extended thinking for higher accuracy at cost of latency
}
```

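A hypothetical caller sketch for the new flag (the base URL, endpoint path, and use of `requests` are placeholders, not taken from this repo):

```python
import requests

payload = {
    "language": "c++",
    "compiler": "g++",
    "code": "int square(int x) { return x * x; }",
    "asm": [{"text": "square(int):", "source": None, "labels": []}],
    "useThinking": True,  # opt in: better accuracy at the cost of latency
}
# Placeholder URL; substitute the real endpoint.
response = requests.post("https://example.invalid/explain", json=payload, timeout=60)
print(response.json())
```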
15 changes: 4 additions & 11 deletions app/cache.py
@@ -134,18 +134,11 @@ def generate_cache_key(request: ExplainRequest, prompt: Prompt) -> str:
Returns:
A SHA-256 hash string to use as cache key
"""
# Generate the full message data that would be sent to Anthropic
prompt_data = prompt.generate_messages(request)

# Create a deterministic representation of all cache-affecting data
# The payload IS the cache-affecting data — the same dict that gets
# spread into messages.create. Add prompt_version on top so changes
# to the YAML invalidate cached responses.
cache_data = {
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"temperature": prompt_data["temperature"],
"thinking": prompt_data.get("thinking"),
"system": prompt_data["system"],
"messages": prompt_data["messages"],
# Include a hash of the prompt config to invalidate cache when prompts change
**prompt.build_api_payload(request),
"prompt_version": _get_prompt_config_hash(prompt.config),
}

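The hashing itself is outside this hunk; a rough sketch of the idea the new comment describes (the hash algorithm and JSON serialization are assumptions, not taken from the diff):

```python
import hashlib
import json


def cache_key_sketch(api_payload: dict, prompt_config_hash: str) -> str:
    """Hash the exact messages.create payload plus a prompt-config hash,
    so any change to the payload or the prompt YAML invalidates old entries."""
    cache_data = {**api_payload, "prompt_version": prompt_config_hash}
    canonical = json.dumps(cache_data, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode("utf-8")).hexdigest()
```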
29 changes: 10 additions & 19 deletions app/explain.py
@@ -1,5 +1,4 @@
import logging
from typing import Any

from anthropic import AsyncAnthropic

@@ -80,8 +79,7 @@ async def _call_anthropic_api(

This is the original process_request logic, extracted for clarity.
"""
# Generate messages using the Prompt instance
prompt_data = prompt.generate_messages(body)
prompt_data = prompt.build_api_payload(body)

# Debug logging for prompts
LOGGER.debug(f"=== PROMPT DEBUG FOR {body.explanation.value.upper()} (audience: {body.audience.value}) ===")
@@ -92,22 +90,15 @@ async def _call_anthropic_api(
LOGGER.debug(message)
LOGGER.debug("=== END PROMPT DEBUG ===")

# Call Claude API
LOGGER.info("Using Anthropic client with model: %s", prompt_data["model"])

api_kwargs: dict[str, Any] = {
"model": prompt_data["model"],
"max_tokens": prompt_data["max_tokens"],
"system": prompt_data["system"],
"messages": prompt_data["messages"],
}
if prompt_data.get("thinking"):
# Extended thinking: API requires temperature to be unset.
api_kwargs["thinking"] = prompt_data["thinking"]
else:
api_kwargs["temperature"] = prompt_data["temperature"]

message = await client.messages.create(**api_kwargs)
# Call Claude API. `prompt_data` is already exactly the kwargs
# `messages.create` expects — `build_api_payload` resolved the
# thinking-vs-temperature exclusivity for us.
LOGGER.info(
"Using Anthropic client with model: %s (thinking=%s)",
prompt_data["model"],
bool(prompt_data.get("thinking")),
)
message = await client.messages.create(**prompt_data)

# Extract usage information
input_tokens = message.usage.input_tokens
9 changes: 9 additions & 0 deletions app/explain_api.py
@@ -57,6 +57,15 @@ class ExplainRequest(BaseModel):
default=ExplanationType.ASSEMBLY, description="Type of explanation to generate"
)
bypassCache: bool = Field(default=False, description="If true, skip reading from cache but still write to cache")
useThinking: bool = Field(
default=False,
description=(
"If true, enable adaptive extended thinking on the explainer. "
"Improves correctness on complex inputs at the cost of significant "
"extra latency (often +10s, sometimes much more on large inputs). "
"May exceed the Lambda 30s timeout on the largest queries."
),
)

@property
def instruction_set_with_default(self) -> str:
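A quick illustration of the Pydantic default (the other required-field values are placeholders): a request body that omits `useThinking` validates to `False`, which is what keeps existing cache keys stable.

```python
from app.explain_api import ExplainRequest

raw = {
    "language": "c++",
    "compiler": "g++",
    "code": "int test() { return 0; }",
    "asm": [{"text": "test:", "source": None}],
}
assert ExplainRequest(**raw).useThinking is False
assert ExplainRequest(**raw, useThinking=True).useThinking is True
```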
30 changes: 30 additions & 0 deletions app/prompt.py
@@ -235,6 +235,36 @@ def prepare_structured_data(self, request: ExplainRequest) -> dict[str, Any]:

return structured_data

def build_api_payload(self, request: ExplainRequest) -> dict[str, Any]:
"""Return the exact kwargs to pass to ``client.messages.create``.

Single source of truth: the API call splats this dict directly,
and the cache-key generator hashes it. Per-request runtime
overrides (currently just ``useThinking``) are resolved here, and
mutually-exclusive fields like ``thinking`` vs ``temperature``
are decided here too — callers don't need to re-derive anything.

Always returns a fresh dict so callers can mutate freely.
"""
base = self.generate_messages(request)
payload: dict[str, Any] = {
"model": base["model"],
"max_tokens": base["max_tokens"],
"system": base["system"],
"messages": base["messages"],
}
# Resolve thinking config: per-request override wins over the YAML.
thinking = {"type": "adaptive"} if request.useThinking else base.get("thinking")
if thinking is not None:
payload["thinking"] = thinking
# Thinking and temperature are mutually exclusive (the API
# rejects temperature when thinking is set). Floor max_tokens
# so adaptive thinking can't starve the visible text.
payload["max_tokens"] = max(payload["max_tokens"], MIN_MAX_TOKENS_WITH_THINKING)
else:
payload["temperature"] = base["temperature"]
return payload

def generate_messages(self, request: ExplainRequest) -> dict[str, Any]:
"""Generate the complete message structure for Claude API.

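A usage sketch of the new helper (the `AssemblyItem`/`ExplainRequest` import location is assumed; the YAML path matches the one used in the tests):

```python
from pathlib import Path

from app.explain_api import AssemblyItem, ExplainRequest
from app.prompt import MIN_MAX_TOKENS_WITH_THINKING, Prompt

request = ExplainRequest(
    language="c++",
    compiler="g++",
    code="int test() { return 0; }",
    asm=[AssemblyItem(text="test:", source=None)],
    useThinking=True,
)
payload = Prompt(Path("app/prompt.yaml")).build_api_payload(request)

# Thinking and temperature are mutually exclusive, and max_tokens is
# floored so adaptive thinking cannot starve the visible text.
assert payload["thinking"] == {"type": "adaptive"}
assert "temperature" not in payload
assert payload["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING
```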
30 changes: 30 additions & 0 deletions app/test_cache.py
@@ -265,6 +265,36 @@ def test_generate_cache_key_bypass_cache_ignored(self, test_prompt):
key2 = generate_cache_key(request2, test_prompt)
assert key1 == key2 # bypass_cache shouldn't affect cache key

def test_generate_cache_key_use_thinking_splits(self, test_prompt):
"""useThinking changes the API call payload, so it must change the
cache key. Confirms cache hits don't bleed across thinking-on/off."""
base_kwargs = {
"language": "c++",
"compiler": "g++",
"code": "int test() { return 0; }",
"asm": [AssemblyItem(text="test:", source=None)],
}
request_off = ExplainRequest(**base_kwargs, useThinking=False)
request_on = ExplainRequest(**base_kwargs, useThinking=True)

assert generate_cache_key(request_off, test_prompt) != generate_cache_key(request_on, test_prompt)

def test_generate_cache_key_use_thinking_default_matches_omitted(self, test_prompt):
"""A request with useThinking absent (Pydantic default) must produce
the same key as one with useThinking=False explicit. This is the
invariant that lets us add the field without invalidating the
existing S3 cache."""
base_kwargs = {
"language": "c++",
"compiler": "g++",
"code": "int test() { return 0; }",
"asm": [AssemblyItem(text="test:", source=None)],
}
request_default = ExplainRequest(**base_kwargs) # useThinking absent
request_explicit = ExplainRequest(**base_kwargs, useThinking=False)

assert generate_cache_key(request_default, test_prompt) == generate_cache_key(request_explicit, test_prompt)


class TestCacheHighLevelFunctions:
"""Test the high-level cache functions."""
19 changes: 18 additions & 1 deletion app/test_explain.py
@@ -11,7 +11,7 @@
SourceMapping,
)
from app.metrics import NoopMetricsProvider
from app.prompt import MAX_ASSEMBLY_LINES, Prompt
from app.prompt import MAX_ASSEMBLY_LINES, MIN_MAX_TOKENS_WITH_THINKING, Prompt


@pytest.fixture
@@ -177,6 +177,23 @@ async def test_picks_last_text_block_with_thinking(self, sample_request, noop_me
assert response.status == "success"
assert response.explanation == "Final explanation here."

@pytest.mark.asyncio
async def test_use_thinking_overrides_prompt(self, sample_request, mock_anthropic_client, noop_metrics):
"""Setting `useThinking=True` on the request must enable adaptive
thinking and bump max_tokens to satisfy the floor, even when the
loaded prompt has no thinking config."""
sample_request.useThinking = True

test_prompt = Prompt(Path("app/prompt.yaml"))
await process_request(sample_request, mock_anthropic_client, test_prompt, noop_metrics)

_args, kwargs = mock_anthropic_client.messages.create.call_args
assert kwargs.get("thinking") == {"type": "adaptive"}
# When thinking is set, temperature must NOT be passed (API rejects it).
assert "temperature" not in kwargs
# max_tokens bumped to at least the documented floor.
assert kwargs["max_tokens"] >= MIN_MAX_TOKENS_WITH_THINKING

@pytest.mark.asyncio
async def test_returns_error_when_no_text_block(self, sample_request, noop_metrics):
"""A response with no text block (e.g. thinking exhausted max_tokens)