Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions mellea/backends/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,40 @@ async def post_processing(
mot.generation.model = self._model_id
mot.generation.provider = "openai"

# content=None with stop+tokens means thinking-only mode; surface it rather than returning "".
finish_reason = choice_response.get("finish_reason")
completion_tokens = usage.get("completion_tokens", 0) if usage else 0
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When usage is unavailable (e.g. streaming without stream_options={"include_usage": True}, or an OpenAI-compatible provider that omits it), completion_tokens falls back to 0 and the guard silently passes — same empty-string symptom the fix is meant to surface. Not a regression, but worth a comment here or a line in the error so the next person hitting it knows why the guard didn't fire for them.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Fixed in the rebase — the guard now uses (completion_tokens > 0 or bool(mot._thinking)) so that when usage is unavailable but reasoning content was captured during processing(), the guard still fires. Added a code comment explaining the fallback and a dedicated test (test_post_processing_raises_when_thinking_present_but_no_usage).

# Use _thinking as alternative evidence when usage is unavailable (some providers omit it).
if (
not mot._underlying_value
and finish_reason == "stop"
and (completion_tokens > 0 or bool(mot._thinking))
and not mot.tool_calls
):
thinking_note = (
f" Reasoning content ({len(mot._thinking)} chars) is in mot._thinking."
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the error message points users at mot._thinking, which is a private attribute today. #909 item 4 is considering whether to promote it to public — happy to leave this as-is and let the message track whatever shakes out of that issue, but worth flagging.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed — leaving as-is for now. The message will naturally track whatever #909 item 4 decides (public attribute vs accessor). No action needed in this PR.

if mot._thinking
else ""
)
err = RuntimeError(
"OpenAI backend received an empty response (content=None) with "
f"finish_reason=stop and completion_tokens={completion_tokens}. "
"This typically indicates a thinking-mode model (e.g. Qwen3 via vLLM "
"with --reasoning-parser) that emitted only reasoning tokens."
+ thinking_note
+ " For vLLM/Qwen3, disable thinking via model_options, e.g.: "
'model_options={"extra_body": {"chat_template_kwargs": '
'{"enable_thinking": False}}}.'
" For other providers, consult your runtime's documentation."
)
span = mot._meta.pop("_telemetry_span", None)
if span is not None:
from ..telemetry import end_backend_span, set_span_error

set_span_error(span, err)
end_backend_span(span)
raise err

# Record telemetry now that response is available
span = mot._meta.get("_telemetry_span")
if span is not None:
Expand Down
133 changes: 132 additions & 1 deletion test/backends/test_openai_unit.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""Unit tests for OpenAI backend pure-logic helpers — no API calls required.

Covers filter_openai_client_kwargs, filter_chat_completions_kwargs,
_simplify_and_merge, and _make_backend_specific_and_remove.
_simplify_and_merge, _make_backend_specific_and_remove, and post_processing
error detection for empty thinking-mode responses.
"""

import pytest
Expand All @@ -11,6 +12,7 @@
from mellea.backends import ModelOption
from mellea.backends.openai import OpenAIBackend
from mellea.core.base import ModelOutputThunk
from mellea.stdlib.components import Message


def _make_backend(model_options: dict | None = None) -> OpenAIBackend:
Expand Down Expand Up @@ -311,5 +313,134 @@ async def test_processing_reasoning_content_takes_precedence_over_reasoning(back
assert mot._underlying_value == "answer"


# --- post_processing: empty thinking-mode response detection ---


def _build_mot_for_empty_content_check(
finish_reason: str = "stop",
content: str | None = None,
completion_tokens: int = 9,
tool_calls: list[dict] | None = None,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional: the helper hand-assembles mot._meta["oai_chat_response"] and oai_chat_response_choice. Driving a real ChatCompletion fixture through processing() instead would catch future drift in those meta keys. Fine as a unit test; mentioning in case you want the broader coverage.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion. Keeping the hand-assembled approach here since these tests specifically validate post_processing's interpretation of the meta keys in isolation (unit-test scope). The processing() reasoning tests above already cover the full pipeline path, so between the two sections we get both the integration-style coverage and the focused unit verification.

thinking: str | None = None,
) -> ModelOutputThunk:
"""Construct a ModelOutputThunk in the state post_processing expects after processing()."""
mot = ModelOutputThunk(value=None)
mot._action = Message("user", "What is 2 + 2?")
mot._model_options = {}
mot._underlying_value = content if content is not None else ""
if thinking is not None:
mot._thinking = thinking
choice = {
"finish_reason": finish_reason,
"index": 0,
"message": {"content": content, "role": "assistant", "tool_calls": tool_calls},
}
full_response = {
"id": "chatcmpl-test",
"object": "chat.completion",
"choices": [choice],
"usage": {
"prompt_tokens": 10,
"completion_tokens": completion_tokens,
"total_tokens": 10 + completion_tokens,
},
}
mot._meta["oai_chat_response"] = full_response
mot._meta["oai_chat_response_choice"] = choice
return mot


async def test_post_processing_raises_on_empty_content_with_tokens(backend):
"""Thinking model with content=None, finish_reason=stop, non-zero tokens -> RuntimeError."""
mot = _build_mot_for_empty_content_check()
with pytest.raises(RuntimeError, match="enable_thinking"):
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_raises_on_empty_string_content(backend):
"""content='' is treated the same as None when finish_reason=stop and tokens>0."""
mot = _build_mot_for_empty_content_check(content="")
with pytest.raises(RuntimeError, match="empty response"):
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_accepts_empty_content_with_zero_tokens(backend):
"""Empty content with zero completion_tokens is not a thinking-mode failure."""
mot = _build_mot_for_empty_content_check(completion_tokens=0)
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_accepts_empty_content_with_length_finish(backend):
"""finish_reason=length (truncated) is a different failure mode, not raised here."""
mot = _build_mot_for_empty_content_check(finish_reason="length")
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_accepts_non_empty_content(backend):
"""Normal response with content is unaffected."""
mot = _build_mot_for_empty_content_check(content="The answer is 4.")
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)
assert mot._underlying_value == "The answer is 4."


async def test_post_processing_streaming_raises_on_empty_content(backend):
"""Streaming path: oai_chat_response is a choice-shaped dict (chat_completion_delta_merge output); guard still fires."""
mot = ModelOutputThunk(value=None)
mot._action = Message("user", "What is 2 + 2?")
mot._model_options = {}
mot._underlying_value = ""
# Streaming: oai_chat_response is the merged choice dict — finish_reason at the top level.
mot._meta["oai_chat_response"] = {
"finish_reason": "stop",
"index": 0,
"logprobs": None,
"stop_reason": None,
"message": {
"content": None,
"reasoning_content": "2+2=4",
"role": "assistant",
"tool_calls": [],
},
}
mot._meta["oai_streaming_usage"] = {
"prompt_tokens": 10,
"completion_tokens": 9,
"total_tokens": 19,
}
# oai_chat_response_choice intentionally absent — this is the streaming code path.
with pytest.raises(RuntimeError, match="enable_thinking"):
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_skips_when_tool_calls_present(backend):
"""Empty content with active tool calls must not raise — tool calls legitimately have no text."""
mot = _build_mot_for_empty_content_check()
mot.tool_calls = {"get_weather": {"name": "get_weather", "arguments": "{}"}} # type: ignore[assignment]
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


async def test_post_processing_raises_when_thinking_present_but_no_usage(backend):
"""When usage is unavailable (tokens=0) but _thinking is populated, still raise."""
mot = _build_mot_for_empty_content_check(completion_tokens=0, thinking="deep thought")
with pytest.raises(RuntimeError, match="empty response"):
await backend.post_processing(
mot=mot, tools={}, conversation=[], thinking=None, seed=None, _format=None
)


if __name__ == "__main__":
pytest.main([__file__, "-v"])
Loading