Anmolnoor · Anmolnoor · May 27, 2026 · May 27, 2026
diff --git a/src/foundation/services/orchestrator.py b/src/foundation/services/orchestrator.py
@@ -61,7 +61,7 @@
 from foundation.services.history import HistoryStore
 from foundation.services.observer import EventSink, ObserverService
 from foundation.services.planner import PlannerService, PlanningError
-from foundation.services.provider import ProviderAdapter
+from foundation.services.provider import ProviderAdapter, ProviderError
 from foundation.services.shell import OutputCallback, ShellRuntime
 from foundation.services.tools import LocalToolService
 from foundation.settings import ApprovalMode
@@ -576,21 +576,31 @@ def orchestrate(self, request: UserRequest) -> OrchestrationResult:
                 )
             return result
         except Exception as exc:
+            # Preserve the raw (capped) provider response on parse/truncation
+            # failures so the persisted event log is self-diagnosing.
+            failure_extra: dict[str, str] = {}
+            if isinstance(exc, ProviderError) and exc.response_text:
+                failure_extra["response_text"] = exc.response_text[:4096]
             self._observer.emit_exception(
                 EVENT_EXCEPTION,
                 exc,
                 payload={
                     "request_id": request_id,
                     "session_id": session_id,
                     "request_text": request.message,
+                    **failure_extra,
                 },
                 session_id=session_id,
                 logger_name="foundation.services.orchestrator",
             )
             self._observer.emit_exception(
                 EVENT_PLAN_FAILED,
                 exc,
-                payload={"request_id": request_id, "session_id": session_id},
+                payload={
+                    "request_id": request_id,
+                    "session_id": session_id,
+                    **failure_extra,
+                },
                 session_id=session_id,
                 logger_name="foundation.services.orchestrator",
             )

diff --git a/src/foundation/services/planner.py b/src/foundation/services/planner.py
@@ -148,12 +148,22 @@ def request_plan(
                 response = self._provider.complete(prompt)
             except ProviderError as exc:
                 last_error = exc
-                if (
-                    exc.code is ProviderErrorCode.INVALID_RESPONSE
-                    and attempt < self._max_plan_attempts
-                ):
+                repairable = exc.code in (
+                    ProviderErrorCode.INVALID_RESPONSE,
+                    ProviderErrorCode.TRUNCATED,
+                )
+                if repairable and attempt < self._max_plan_attempts:
+                    if exc.code is ProviderErrorCode.TRUNCATED:
+                        feedback = (
+                            "Your previous response was truncated before the JSON closed. "
+                            "Produce a SHORTER plan: do not inline large file contents — for "
+                            "any sizable file body, omit `content` and provide a brief "
+                            "`content_brief` describing what to write instead."
+                        )
+                    else:
+                        feedback = "The previous response was not valid JSON."
                     supplemental_messages = self._repair_messages(
-                        "The previous response was not valid JSON.",
+                        feedback,
                         invalid_output=exc.response_text,
                     )
                     continue

diff --git a/src/foundation/services/provider.py b/src/foundation/services/provider.py
@@ -41,6 +41,7 @@ class ProviderErrorCode(StrEnum):
     RATE_LIMIT = "rate_limit"
     SERVER_ERROR = "server_error"
     INVALID_RESPONSE = "invalid_response"
+    TRUNCATED = "truncated"
     REFUSAL = "refusal"
     UNSUPPORTED_PROVIDER = "unsupported_provider"
 
@@ -197,6 +198,7 @@ def __init__(
         timeout_seconds: int = 60,
         max_attempts: int = 3,
         retry_backoff_seconds: float = 0.25,
+        max_output_tokens: int | None = None,
         transport: JsonTransport | None = None,
     ) -> None:
         self._model = model
@@ -205,6 +207,7 @@ def __init__(
         self._timeout_seconds = timeout_seconds
         self._max_attempts = max_attempts
         self._retry_backoff_seconds = retry_backoff_seconds
+        self._max_output_tokens = max_output_tokens
         self._transport = transport or UrllibJsonTransport()
 
     def complete(self, prompt: ProviderPrompt) -> ProviderResponse:
@@ -333,6 +336,8 @@ def _build_payload(self, prompt: ProviderPrompt) -> dict[str, Any]:
                 for message in prompt.messages
             ],
         }
+        if self._max_output_tokens is not None:
+            payload["max_output_tokens"] = self._max_output_tokens
         if prompt.response_format is ProviderResponseFormat.JSON_OBJECT:
             assert prompt.schema_name is not None
             assert prompt.output_schema is not None
@@ -347,6 +352,21 @@ def _build_payload(self, prompt: ProviderPrompt) -> dict[str, Any]:
         return payload
 
     def _extract_content(self, payload: Mapping[str, Any]) -> str:
+        # status=incomplete with reason max_output_tokens (or none) means the output was
+        # cut off.  Flag it explicitly rather than parsing truncated JSON.
+        if payload.get("status") == "incomplete":
+            details = payload.get("incomplete_details")
+            reason = details.get("reason") if isinstance(details, Mapping) else None
+            if reason in (None, "max_output_tokens"):
+                raise ProviderError(
+                    "Provider response was truncated before completion "
+                    f"(status=incomplete, reason={reason}). Raise provider.max_output_tokens, "
+                    "or have the planner emit a smaller plan (use content_brief for large "
+                    "file bodies).",
+                    code=ProviderErrorCode.TRUNCATED,
+                    response_text=_coerce_optional_string(payload.get("output_text")),
+                )
+
         top_level_text = payload.get("output_text")
         if isinstance(top_level_text, str) and top_level_text.strip():
             return top_level_text.strip()
@@ -423,6 +443,8 @@ def __init__(
         timeout_seconds: int = 60,
         max_attempts: int = 3,
         retry_backoff_seconds: float = 0.25,
+        max_output_tokens: int | None = None,
+        num_ctx: int | None = None,
         transport: JsonTransport | None = None,
     ) -> None:
         self._model = model
@@ -431,6 +453,8 @@ def __init__(
         self._timeout_seconds = timeout_seconds
         self._max_attempts = max_attempts
         self._retry_backoff_seconds = retry_backoff_seconds
+        self._max_output_tokens = max_output_tokens
+        self._num_ctx = num_ctx
         self._transport = transport or UrllibJsonTransport()
 
     def complete(self, prompt: ProviderPrompt) -> ProviderResponse:
@@ -586,14 +610,19 @@ def _build_payload(self, prompt: ProviderPrompt) -> dict[str, Any]:
             ],
             "stream": False,
         }
+        options: dict[str, Any] = {}
+        if self._max_output_tokens is not None:
+            options["num_predict"] = self._max_output_tokens
+        if self._num_ctx is not None:
+            options["num_ctx"] = self._num_ctx
         if prompt.response_format is ProviderResponseFormat.JSON_OBJECT:
             assert prompt.output_schema is not None
             payload["format"] = prompt.output_schema
-            payload["options"] = {
-                "temperature": 0,
-            }
+            options["temperature"] = 0
             if self._needs_think_for_structured_output(self._model):
                 payload["think"] = True
+        if options:
+            payload["options"] = options
         return payload
 
     def _extract_content(
@@ -604,6 +633,23 @@ def _extract_content(
     ) -> str:
         json_requested = response_format is ProviderResponseFormat.JSON_OBJECT
 
+        # A truncated generation (model hit its output-token budget) leaves the
+        # JSON object unterminated.  Surface that explicitly here instead of
+        # letting it fall through to a confusing json.loads failure downstream.
+        if payload.get("done_reason") == "length":
+            message = payload.get("message")
+            partial = ""
+            if isinstance(message, Mapping) and isinstance(message.get("content"), str):
+                partial = message["content"]
+            raise ProviderError(
+                "Provider response was truncated before completion (done_reason=length). "
+                "The model hit its output-token limit, so the response is incomplete. "
+                "Raise provider.max_output_tokens, or have the planner emit a smaller plan "
+                "(use content_brief for large file bodies).",
+                code=ProviderErrorCode.TRUNCATED,
+                response_text=partial,
+            )
+
         # Standard Ollama local format: {"message": {"content": "...", "thinking": "..."}}
         message = payload.get("message")
         if isinstance(message, Mapping):
@@ -800,6 +846,8 @@ def build_provider_adapter(
             api_key=api_key,
             base_url=settings.provider.effective_base_url(),
             timeout_seconds=settings.provider.request_timeout_seconds,
+            max_output_tokens=settings.provider.max_output_tokens,
+            num_ctx=settings.provider.num_ctx,
             transport=transport,
         )
 
@@ -808,5 +856,6 @@ def build_provider_adapter(
         api_key=api_key or "",
         base_url=settings.provider.effective_base_url(),
         timeout_seconds=settings.provider.request_timeout_seconds,
+        max_output_tokens=settings.provider.max_output_tokens,
         transport=transport,
     )
diff --git a/src/foundation/settings.py b/src/foundation/settings.py
@@ -200,6 +200,8 @@ class ProviderSection(BaseModel):
     model: str = "gpt-5-mini"
     base_url: AnyUrl | None = None
     request_timeout_seconds: PositiveInt = 60
+    max_output_tokens: PositiveInt | None = None
+    num_ctx: PositiveInt | None = None
     api_key_env_var: str | None = OPENAI_DEFAULT_API_KEY_ENV_VAR
     api_key_keychain: KeychainSecretRef | None = Field(default_factory=KeychainSecretRef)
 

diff --git a/tests/test_orchestrator.py b/tests/test_orchestrator.py
@@ -279,6 +279,42 @@ def test_orchestrator_retries_invalid_plans_without_duplicate_shell_execution(
     assert result.execution_results[0].artifact["stdout"] == f"{workspace_root}\n"
 
 
+def test_orchestrator_recovers_from_truncated_plan_with_content_brief_hint(
+    tmp_path: Path,
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """A TRUNCATED provider error triggers one repair retry carrying the content_brief hint."""
+    from foundation.services.provider import ProviderError, ProviderErrorCode
+
+    # Stub provider: first call raises TRUNCATED, later calls return a plan with no actions.
+    class _TruncateThenSucceed:
+        def __init__(self) -> None:
+            self.calls: list[ProviderPrompt] = []
+
+        def complete(self, prompt: ProviderPrompt) -> ProviderResponse:
+            self.calls.append(prompt)
+            if len(self.calls) > 1:
+                return _provider_response({"assistant_message": "Done.", "actions": []})
+            raise ProviderError(
+                "Provider response was truncated before completion (done_reason=length).",
+                code=ProviderErrorCode.TRUNCATED,
+                response_text='{"assistant_message":"writing","actions":[{"id":"w"',
+            )
+
+    stub = _TruncateThenSucceed()
+    orchestrator, runtime, _ = _orchestrator(tmp_path, monkeypatch, stub)
+
+    result = orchestrator.orchestrate(UserRequest(message="write a big file"))
+
+    # Exactly two provider calls: the truncated attempt, then the repaired retry.
+    assert len(stub.calls) == 2
+    assert runtime.calls == 0
+    retry_text = "\n".join(message.content for message in stub.calls[1].messages)
+    assert "truncated" in retry_text.lower()
+    assert "content_brief" in retry_text
+    assert result.summary is not None
+
+
 def test_orchestrator_retries_shell_cat_plan_without_executing_it(
     tmp_path: Path,
     monkeypatch: pytest.MonkeyPatch,

diff --git a/tests/test_provider.py b/tests/test_provider.py
@@ -456,3 +456,82 @@ def test_ollama_adapter_invalid_json_error_includes_raw() -> None:
 
     assert exc_info.value.code is ProviderErrorCode.INVALID_RESPONSE
     assert exc_info.value.response_text == garbage
+
+
+def test_ollama_adapter_sets_num_predict_and_num_ctx() -> None:
+    """max_output_tokens and num_ctx are forwarded in the Ollama request `options`."""
+    fake_transport = FakeTransport(
+        [{"message": {"role": "assistant", "content": '{"assistant_message":"ok","actions":[]}'}}]
+    )
+    OllamaChatAdapter(
+        model="glm-5.1:cloud",
+        base_url="http://localhost:11434/api",
+        max_output_tokens=2048,
+        num_ctx=8192,
+        transport=fake_transport,
+    ).complete(_structured_prompt())
+
+    sent_options = fake_transport.calls[0]["payload"]["options"]
+    # max_output_tokens travels under the `num_predict` key.
+    assert sent_options["num_predict"] == 2048
+    assert sent_options["num_ctx"] == 8192
+    assert sent_options["temperature"] == 0
+
+
+def test_ollama_adapter_raises_truncated_on_done_reason_length() -> None:
+    """done_reason=length raises ProviderError(TRUNCATED) that carries the partial content."""
+    cut_off_json = '{"assistant_message":"writing","actions":[{"id":"w","kind":"tool_call"'
+    truncated_reply = {
+        "message": {"role": "assistant", "content": cut_off_json},
+        "done_reason": "length",
+    }
+    adapter = OllamaChatAdapter(
+        model="kimi-k2.6:cloud",
+        base_url="http://localhost:11434/api",
+        max_output_tokens=128,
+        transport=FakeTransport([truncated_reply]),
+    )
+    with pytest.raises(ProviderError, match="truncated") as raised:
+        adapter.complete(_structured_prompt())
+
+    assert raised.value.code is ProviderErrorCode.TRUNCATED
+    assert raised.value.response_text == cut_off_json
+
+
+def test_openai_adapter_sets_max_output_tokens() -> None:
+    """The configured max_output_tokens is sent in the OpenAI request payload."""
+    limit = 4096
+    fake_transport = FakeTransport(
+        [{"id": "r", "output_text": '{"assistant_message":"ok","actions":[]}'}]
+    )
+    OpenAIResponsesAdapter(
+        model="gpt-5-mini",
+        api_key="sk-test",
+        max_output_tokens=limit,
+        transport=fake_transport,
+    ).complete(_structured_prompt())
+
+    assert fake_transport.calls[0]["payload"]["max_output_tokens"] == limit
+
+
+def test_openai_adapter_raises_truncated_on_incomplete_response() -> None:
+    """An incomplete (max_output_tokens) response raises TRUNCATED and keeps the partial text."""
+    partial = '{"assistant_message":"partial"'
+    transport = FakeTransport(
+        [
+            {
+                "id": "resp_1",
+                "status": "incomplete",
+                "incomplete_details": {"reason": "max_output_tokens"},
+                "output_text": partial,
+            }
+        ]
+    )
+    adapter = OpenAIResponsesAdapter(model="gpt-5-mini", api_key="sk-test", transport=transport)
+
+    with pytest.raises(ProviderError, match="truncated") as exc_info:
+        adapter.complete(_structured_prompt())
+
+    assert exc_info.value.code is ProviderErrorCode.TRUNCATED
+    # The partial output must survive on the error so the failure stays diagnosable.
+    assert exc_info.value.response_text == partial