Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions dana/common/llm/providers/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,22 @@
class AzureProvider(OpenAICompatibleProvider):
"""Azure OpenAI provider."""

# Azure exposes the Responses API only on api-version >= this date.
# Older versions return HTTP 400 BadRequest for /openai/responses.
_RESPONSES_API_MIN_DATE = "2025-03-01"

# Env var operators use to dial reasoning effort for Azure deployments.
# Valid values: "minimal" | "low" | "medium" | "high".
_REASONING_EFFORT_ENV_VAR = "AZURE_THINKING_EFFORT"

def _responses_api_supported(self) -> bool:
# api-version format is "YYYY-MM-DD" or "YYYY-MM-DD-preview"; first 10 chars
# are the ISO date which sorts correctly lexicographically.
version = getattr(self, "api_version", None)
if not version or len(version) < 10:
return False
return version[:10] >= self._RESPONSES_API_MIN_DATE

def __init__(
self, api_key: str | None = None, model: str = "gpt-35-turbo", base_url: str | None = None, api_version: str | None = None
):
Expand Down
4 changes: 4 additions & 0 deletions dana/common/llm/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
class OpenAIProvider(OpenAICompatibleProvider):
"""OpenAI API provider."""

# Env var operators use to dial reasoning effort for OpenAI direct API.
# Valid values: "minimal" | "low" | "medium" | "high".
_REASONING_EFFORT_ENV_VAR = "OPENAI_THINKING_EFFORT"

def __init__(self, api_key: str | None = None, model: str = "gpt-3.5-turbo", base_url: str | None = None):
self.model = model

Expand Down
301 changes: 243 additions & 58 deletions dana/common/llm/providers/openai_compatible_base.py

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions dana/core/agent/star_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -748,6 +748,18 @@ def _record_think_results(
output_state = "exit"

if not tool_calls or len(tool_calls) == 0:
# Persist reasoning even on direct-answer turns. Without this, the
# model's internal reasoning (LLMResponse.reasoning_content for
# gpt-5/o3/o4, or <thinking> tags / JSON reasoning fields for other
# codecs) is silently dropped whenever the agent answers without
# invoking a tool. Same emit pattern as the tool-calls branch below.
if reasoning and len(reasoning) > 0:
timeline.add_entry(
TimelineEntry(
entry_type=TimelineEntryType.AGENT_THOUGHTS,
content=reasoning,
)
)
response = response if (response and len(response) > 0) else "No response generated"
timeline.add_entry(
TimelineEntry(
Expand Down
131 changes: 131 additions & 0 deletions scripts/verify-azure-thinking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""Verify that dana.common.llm surfaces thinking/reasoning for Azure gpt-5.2.

Checks all three surfaces:
- streaming: at least one LLMStreamChunk(type="thinking") yielded
- non-streaming reasoning_tokens count returned in usage details
- non-streaming reasoning_content text populated (Responses API path)

Both stream() and chat() now route to the Responses API for gpt-5/o3/o4
when api-version supports it.

Run:
uv run python scripts/verify-azure-thinking.py
"""

from __future__ import annotations

import asyncio
import os
from pathlib import Path
import sys

from dotenv import load_dotenv


ROOT = Path(__file__).resolve().parent.parent
load_dotenv(ROOT / ".env", override=False)

from dana.common.llm.providers.azure import AzureProvider # noqa: E402
from dana.common.llm.types import LLMMessage # noqa: E402


# Model name handed to AzureProvider below; the AZURE_MODEL env var overrides
# the "gpt-5.2" default.
MODEL = os.getenv("AZURE_MODEL", "gpt-5.2")

# Logic puzzle sent as the single user message in both the streaming and the
# non-streaming check; it explicitly asks the model to reason step by step.
PROMPT = (
"You have 3 boxes labeled A, B, C. One holds gold, two are empty. "
"B's label says 'gold is in A'. C's label says 'gold is not here'. "
"Exactly one label is true. Where is the gold? Reason step by step, then answer."
)


def _hr(title: str) -> None:
print(f"\n{'=' * 8} {title} {'=' * 8}")


async def check_streaming(provider: AzureProvider) -> dict:
    """Stream PROMPT through *provider* and tally thinking vs. text chunks.

    Prints the routing decision, the per-type chunk counts, a preview of the
    collected thinking text (when any thinking chunk arrived) and the start
    of the final text.

    Returns:
        A dict with keys ``uses_responses_api``, ``thinking_chunk_count``,
        ``thinking_chars`` and ``text_chars``.
    """
    _hr(f"STREAM: {MODEL} (expect Responses API + thinking chunks)")
    routed_to_responses = provider._should_use_responses_api()
    print(f"_should_use_responses_api()={routed_to_responses}")

    # Payloads are kept only for these two chunk types; every type is counted.
    collected: dict[str, list[str]] = {"thinking": [], "text_delta": []}
    seen_types: dict[str, int] = {}

    stream_kwargs = {"reasoning": {"effort": "medium"}}  # wrapper now adds summary="auto"
    print(f"stream kwargs: {stream_kwargs} (wrapper merges summary='auto')")
    request = [LLMMessage(role="user", content=PROMPT)]
    async for chunk in provider.stream(messages=request, **stream_kwargs):
        kind = chunk.type
        seen_types[kind] = seen_types.get(kind, 0) + 1
        bucket = collected.get(kind)
        if bucket is not None:
            bucket.append(chunk.content or "")

    thinking_parts = collected["thinking"]
    thinking_text = "".join(thinking_parts)
    answer_text = "".join(collected["text_delta"])

    print(f"chunk type counts: {seen_types}")
    if thinking_parts:
        preview = thinking_text[:300].replace("\n", " ")
        print(f"thinking preview ({len(thinking_text)} chars): {preview!r}")
    print(f"final text ({len(answer_text)} chars): {answer_text[:200]!r}...")

    return {
        "uses_responses_api": routed_to_responses,
        "thinking_chunk_count": len(thinking_parts),
        "thinking_chars": len(thinking_text),
        "text_chars": len(answer_text),
    }


async def check_nonstreaming(provider: AzureProvider) -> dict:
    """Send PROMPT through ``provider.chat`` and report the reasoning it returned.

    Prints the finish reason, usage, reasoning token count, a preview of the
    reasoning text (or ``reasoning_content=None`` when it is empty/absent)
    and the first 200 characters of the answer.

    Returns:
        A dict with keys ``reasoning_tokens``, ``reasoning_content_present``,
        ``reasoning_content_chars`` and ``content_chars``.
    """
    _hr(f"CHAT: {MODEL} (now Responses API — expect reasoning_content populated)")
    # A reasoning effort is passed explicitly so the model actually reasons.
    chat_kwargs = {"reasoning": {"effort": "medium"}}
    print(f"chat kwargs: {chat_kwargs}")

    request = [LLMMessage(role="user", content=PROMPT)]
    resp = await provider.chat(messages=request, **chat_kwargs)

    print(f"finish_reason={resp.finish_reason}")
    print(f"usage={resp.usage}")
    print(f"reasoning_tokens={resp.reasoning_tokens}")

    reasoning_text = resp.reasoning_content
    if not reasoning_text:
        print("reasoning_content=None")
    else:
        preview = reasoning_text[:300].replace("\n", " ")
        print(f"reasoning_content ({len(reasoning_text)} chars): {preview!r}...")

    answer_text = resp.content or ""
    print(f"content[:200]={answer_text[:200]!r}")

    return {
        "reasoning_tokens": resp.reasoning_tokens,
        "reasoning_content_present": bool(reasoning_text),
        "reasoning_content_chars": len(reasoning_text or ""),
        "content_chars": len(answer_text),
    }


async def main() -> int:
    """Run the streaming and non-streaming checks and print a PASS/FAIL verdict.

    Returns:
        2 when AZURE_OPENAI_API_KEY is unset, 0 when all three checks pass,
        1 otherwise.
    """

    def _verdict(ok) -> str:
        # Maps a truthy/falsy check result onto the label shown in the report.
        return "PASS" if ok else "FAIL"

    if not os.getenv("AZURE_OPENAI_API_KEY"):
        print("ERROR: AZURE_OPENAI_API_KEY not set", file=sys.stderr)
        return 2

    api_version_override = os.getenv("AZURE_RESPONSES_API_VERSION", "2025-04-01-preview")
    provider = AzureProvider(model=MODEL, api_version=api_version_override)
    print(f"deployment={provider.deployment_name} api_version={provider.api_version}")

    stream_result = await check_streaming(provider)
    chat_result = await check_nonstreaming(provider)

    _hr("VERDICT")
    stream_ok = stream_result["uses_responses_api"] and stream_result["thinking_chunk_count"] > 0
    chat_tokens_ok = (chat_result["reasoning_tokens"] or 0) > 0
    chat_text_ok = chat_result["reasoning_content_present"]

    chunk_count = stream_result["thinking_chunk_count"]
    chunk_chars = stream_result["thinking_chars"]
    print(
        f"streaming thinking blocks ............. {_verdict(stream_ok)} "
        f"({chunk_count} chunks, "
        f"{chunk_chars} chars)"
    )
    print(f"non-streaming reasoning_tokens > 0 .... {_verdict(chat_tokens_ok)} (tokens={chat_result['reasoning_tokens']})")
    print(f"non-streaming reasoning_content text .. {_verdict(chat_text_ok)} ({chat_result['reasoning_content_chars']} chars)")

    if stream_ok and chat_tokens_ok and chat_text_ok:
        return 0
    return 1


if __name__ == "__main__":
sys.exit(asyncio.run(main()))
160 changes: 160 additions & 0 deletions scripts/verify-thinking-persisted-via-coding-agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""End-to-end check: does DanaCodingAgent persist gpt-5 reasoning to timeline.json?

Spins up DanaCodingAgent against Azure gpt-5.2, asks a reasoning-heavy question,
then loads the persisted timeline.json and inspects AGENT_THOUGHTS entries to
verify the model's internal reasoning (LLMResponse.reasoning_content) made it
into durable storage.

Path layout (per LocalTimelineRepository):
<CWD>/.dana/dana_agent/<agent.object_id>/sessions/<session_id>/timeline.json

Run:
uv run python scripts/verify-thinking-persisted-via-coding-agent.py
"""

from __future__ import annotations

import asyncio
import json
import os
from pathlib import Path
import sys
import tempfile

from dotenv import load_dotenv


ROOT = Path(__file__).resolve().parent.parent
load_dotenv(ROOT / ".env", override=False)

# Force an api-version that supports the Responses API (>= 2025-03-01-preview).
# Without this the wrapper falls back to Chat Completions and reasoning text
# is never returned, even on gpt-5.
os.environ["AZURE_OPENAI_API_VERSION"] = os.environ.get("AZURE_RESPONSES_API_VERSION", "2025-04-01-preview")

# Imports must come AFTER env override.
from dana.common.llm.providers.openai_compatible_base import OpenAICompatibleProvider # noqa: E402
from dana.core.agent.builtin_agents.dana_coding_agent import DanaCodingAgent # noqa: E402


# Instrumentation: log reasoning_content size on every call so it's easy to see
# what the wrapper produced vs what the agent persisted.
_orig_chat_via_responses = OpenAICompatibleProvider._chat_via_responses


async def _logged_chat_via_responses(self, messages, tools=None, **kwargs):
    """Delegate to the saved original ``_chat_via_responses`` and log result sizes.

    Prints one ``[INSTR]`` line with the length of the reasoning text, the
    reasoning token count and the length of the answer, then returns the
    response unchanged.
    """
    resp = await _orig_chat_via_responses(self, messages, tools, **kwargs)
    reasoning_chars = len(resp.reasoning_content or "")
    reasoning_tokens = resp.reasoning_tokens or 0
    content_chars = len(resp.content or "")
    print(
        f"[INSTR] _chat_via_responses → "
        f"reasoning_content={reasoning_chars} chars, "
        f"reasoning_tokens={reasoning_tokens}, "
        f"content={content_chars} chars"
    )
    return resp


OpenAICompatibleProvider._chat_via_responses = _logged_chat_via_responses


# Prompt for the "direct" scenario: a logic puzzle the model is told to answer
# without calling any tool.
PROMPT_DIRECT = (
"You have 3 boxes labeled A, B, C. One holds gold, two are empty. "
"B's label says 'gold is in A'. C's label says 'gold is not here'. "
"Exactly one label is true. Where is the gold? Reason step by step, then answer. "
"Do NOT use any tools — answer directly from reasoning."
)
# Prompt for the "tool" scenario: asks the model to reason and then make
# exactly one bash tool call.
PROMPT_TOOL = (
"Reason carefully about which file in the current directory is the most recent, "
"then use the bash tool exactly once to list files (`ls -lt`) to confirm. "
"Then state the answer."
)
# Scenario selector read from the SCENARIO env var; any value other than
# "tool" selects the direct prompt.
SCENARIO = os.getenv("SCENARIO", "direct") # "direct" or "tool"
PROMPT = PROMPT_TOOL if SCENARIO == "tool" else PROMPT_DIRECT

# Fixed agent_id passed to DanaCodingAgent in main().
AGENT_ID = "dana-coding-agent-thinking-test"


def _hr(title: str) -> None:
print(f"\n{'=' * 8} {title} {'=' * 8}")


def _find_timeline(session_id: str) -> Path | None:
workspace = Path.cwd() / ".dana" / "dana_agent"
candidates = list(workspace.glob(f"*/sessions/{session_id}/timeline.json"))
return candidates[0] if candidates else None


def _print_thought_summary(entries: list[dict]) -> tuple[int, int]:
"""Return (count, total_chars) of substantive AGENT_THOUGHTS entries."""
thoughts = [e for e in entries if e.get("type") == "agent_thoughts"]
print(f"\nAGENT_THOUGHTS entries: {len(thoughts)}")
total_chars = 0
for i, t in enumerate(thoughts):
content = t.get("content", "")
if not isinstance(content, str):
print(f" [{i}] non-string content: {type(content).__name__}")
continue
total_chars += len(content)
preview = content[:200].replace("\n", " ")
print(f" [{i}] {len(content)} chars: {preview!r}")
return len(thoughts), total_chars


async def main() -> int:
    """Run one agent query and check that AGENT_THOUGHTS reached timeline.json.

    Creates a DanaCodingAgent with a fresh temporary directory as ``cwd``,
    sends PROMPT, then loads the session's timeline.json and summarises its
    entries.

    Returns:
        2 when AZURE_OPENAI_API_KEY is unset; 1 when no timeline file is
        found or the thoughts total 50 characters or fewer; 0 otherwise.
    """
    if not os.getenv("AZURE_OPENAI_API_KEY"):
        print("ERROR: AZURE_OPENAI_API_KEY not set", file=sys.stderr)
        return 2

    cwd = tempfile.mkdtemp(prefix="dana_thinking_test_")
    print(f"agent cwd: {cwd}")
    print(f"api-version: {os.environ['AZURE_OPENAI_API_VERSION']}")
    print(f"scenario: {SCENARIO}")

    model_name = os.getenv("AZURE_MODEL", "gpt-5.2")
    agent = DanaCodingAgent(
        agent_id=AGENT_ID,
        agent_type="dana_coding_agent",
        llm_provider="azure",
        model=model_name,
        cwd=cwd,
    )
    print(f"session_id: {agent._session_id}")

    _hr("RUNNING aquery")
    answer = await agent.aquery(message=PROMPT)
    print(f"answer[:200]: {str(answer or '')[:200]!r}")

    _hr("LOADING TIMELINE")
    timeline_path = _find_timeline(agent._session_id)
    if timeline_path is None:
        print(f"ERROR: no timeline.json found for session {agent._session_id}")
        return 1
    print(f"timeline: {timeline_path}")

    data = json.loads(timeline_path.read_text())
    entries = data.get("entries", [])
    print(f"total entries: {len(entries)}")

    # Histogram of entry types, in order of first appearance.
    entry_types = {}
    for entry in entries:
        kind = entry.get("type", "?")
        entry_types.setdefault(kind, 0)
        entry_types[kind] += 1
    print(f"entry type counts: {entry_types}")

    thought_count, thought_chars = _print_thought_summary(entries)

    _hr("VERDICT")
    if thought_count <= 0 or thought_chars <= 50:
        failure_report = (
            "FAIL — no substantive AGENT_THOUGHTS entry in timeline",
            "\nLikely causes:",
            " 1. gpt-5.2 chose not to reason on this turn (nondeterministic without explicit effort)",
            " 2. Wrapper routed to Chat Completions (api-version too old?)",
            " 3. Codec doesn't read response.reasoning_content (only codec_with_native_tool_use does)",
            f"\nInspect: cat {timeline_path}",
        )
        for line in failure_report:
            print(line)
        return 1

    print(f"PASS — reasoning persisted as AGENT_THOUGHTS ({thought_count} entries, {thought_chars} chars)")
    print(f"\nInspect: cat {timeline_path}")
    return 0


if __name__ == "__main__":
sys.exit(asyncio.run(main()))
Loading
Loading