Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ dev = [
"pytest-anyio>=0.0.0",
"opentelemetry-api>=1.27.0",
"opentelemetry-sdk>=1.27.0",
"ruff>=0.9.0",
]
# Optional heavy ML deps for the TSFM server.
# tsfm_public must be installed separately: pip install git+https://github.com/ibm-granite/granite-tsfm
Expand Down
2 changes: 1 addition & 1 deletion src/agent/_litellm.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ def resolve_model(model_id: str) -> str:
"anthropic/claude-sonnet-4-6" -> "anthropic/claude-sonnet-4-6"
"""
if model_id.startswith(LITELLM_PREFIX):
return model_id[len(LITELLM_PREFIX):]
return model_id[len(LITELLM_PREFIX) :]
return model_id
4 changes: 3 additions & 1 deletion src/agent/claude_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ async def _run(args: argparse.Namespace) -> None:

runner = ClaudeAgentRunner(model=args.model_id, max_turns=args.max_turns)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
34 changes: 27 additions & 7 deletions src/agent/claude_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@
import time
from pathlib import Path

from claude_agent_sdk import AssistantMessage, ClaudeAgentOptions, HookMatcher, ResultMessage, query
from claude_agent_sdk import (
AssistantMessage,
ClaudeAgentOptions,
HookMatcher,
ResultMessage,
query,
)
from claude_agent_sdk import TextBlock, ToolUseBlock

from observability import agent_run_span, persist_trajectory
Expand Down Expand Up @@ -132,8 +138,14 @@ async def run(self, question: str) -> AgentResult:
last_turn_start = run_started
tool_outputs: dict[str, object] = {}

async def _capture_tool_output(input_data, tool_use_id: str, context) -> dict:
resp = input_data.get("tool_response") if isinstance(input_data, dict) else input_data
async def _capture_tool_output(
input_data, tool_use_id: str, context
) -> dict:
resp = (
input_data.get("tool_response")
if isinstance(input_data, dict)
else input_data
)
if isinstance(resp, dict):
tool_outputs[tool_use_id] = resp.get("content", resp)
else:
Expand All @@ -145,7 +157,9 @@ async def _capture_tool_output(input_data, tool_use_id: str, context) -> dict:
# per-tool duration for claude-agent is therefore not captured
# (matches openai-agent / deep-agent).
options.hooks = {
"PostToolUse": [HookMatcher(matcher=".*", hooks=[_capture_tool_output])],
"PostToolUse": [
HookMatcher(matcher=".*", hooks=[_capture_tool_output])
],
}

def _flush_tool_outputs() -> None:
Expand All @@ -169,7 +183,9 @@ def _flush_tool_outputs() -> None:
text += block.text
elif isinstance(block, ToolUseBlock):
tool_calls.append(
ToolCall(name=block.name, input=block.input, id=block.id)
ToolCall(
name=block.name, input=block.input, id=block.id
)
)
usage = message.usage or {}
trajectory.turns.append(
Expand Down Expand Up @@ -197,8 +213,12 @@ def _flush_tool_outputs() -> None:

duration_ms = (time.perf_counter() - run_started) * 1000
span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute("agent.duration_ms", duration_ms)
Expand Down
23 changes: 19 additions & 4 deletions src/agent/claude_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from __future__ import annotations

from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import MagicMock, patch

import pytest

Expand Down Expand Up @@ -113,7 +113,12 @@ async def fake_query(prompt, options):

@pytest.mark.anyio
async def test_run_collects_trajectory():
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sensors"
Expand Down Expand Up @@ -157,7 +162,12 @@ async def fake_query(prompt, options):
@pytest.mark.anyio
async def test_run_tool_output_captured():
"""PostToolUse hook output is attached to the matching ToolCall."""
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sensors"
Expand Down Expand Up @@ -206,7 +216,12 @@ async def fake_query(prompt, options):
@pytest.mark.anyio
async def test_run_tool_output_string_response():
"""PostToolUse hook handles string tool_response (no .get)."""
from claude_agent_sdk import AssistantMessage, ResultMessage, TextBlock, ToolUseBlock
from claude_agent_sdk import (
AssistantMessage,
ResultMessage,
TextBlock,
ToolUseBlock,
)

mock_tool = MagicMock(spec=ToolUseBlock)
mock_tool.name = "sites"
Expand Down
4 changes: 3 additions & 1 deletion src/agent/deep_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ async def _run(args: argparse.Namespace) -> None:
recursion_limit=args.recursion_limit,
)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
8 changes: 6 additions & 2 deletions src/agent/deep_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,12 @@ async def run(self, question: str) -> AgentResult:
)

span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute(
Expand Down
47 changes: 39 additions & 8 deletions src/agent/deep_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,20 @@ def test_build_trajectory_tool_calls_and_outputs():
AIMessage(
content="",
tool_calls=[{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}],
usage_metadata={"input_tokens": 100, "output_tokens": 20, "total_tokens": 120},
usage_metadata={
"input_tokens": 100,
"output_tokens": 20,
"total_tokens": 120,
},
),
ToolMessage(content="5 sensors found", tool_call_id="c1"),
AIMessage(
content="Chiller 6 has 5 sensors.",
usage_metadata={"input_tokens": 150, "output_tokens": 30, "total_tokens": 180},
usage_metadata={
"input_tokens": 150,
"output_tokens": 30,
"total_tokens": 180,
},
),
]
traj = _build_trajectory(messages)
Expand All @@ -149,7 +157,12 @@ def test_build_trajectory_tool_calls_and_outputs():

def test_build_trajectory_list_content():
messages = [
AIMessage(content=[{"type": "text", "text": "part one "}, {"type": "text", "text": "part two"}])
AIMessage(
content=[
{"type": "text", "text": "part one "},
{"type": "text", "text": "part two"},
]
)
]
traj = _build_trajectory(messages)
assert traj.turns[0].text == "part one part two"
Expand All @@ -172,13 +185,21 @@ def test_build_trajectory_multiple_tool_calls_one_turn():
{"name": "sites", "args": {}, "id": "c1"},
{"name": "assets", "args": {"site_id": "MAIN"}, "id": "c2"},
],
usage_metadata={"input_tokens": 50, "output_tokens": 10, "total_tokens": 60},
usage_metadata={
"input_tokens": 50,
"output_tokens": 10,
"total_tokens": 60,
},
),
ToolMessage(content=["MAIN"], tool_call_id="c1"),
ToolMessage(content=["Chiller 6"], tool_call_id="c2"),
AIMessage(
content="Found Chiller 6 at site MAIN.",
usage_metadata={"input_tokens": 80, "output_tokens": 15, "total_tokens": 95},
usage_metadata={
"input_tokens": 80,
"output_tokens": 15,
"total_tokens": 95,
},
),
]
traj = _build_trajectory(messages)
Expand Down Expand Up @@ -242,13 +263,23 @@ async def test_run_collects_trajectory():
HumanMessage(content="What sensors are on Chiller 6?"),
AIMessage(
content="",
tool_calls=[{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}],
usage_metadata={"input_tokens": 100, "output_tokens": 20, "total_tokens": 120},
tool_calls=[
{"name": "sensors", "args": {"asset_id": "CH-6"}, "id": "c1"}
],
usage_metadata={
"input_tokens": 100,
"output_tokens": 20,
"total_tokens": 120,
},
),
ToolMessage(content="sensor data", tool_call_id="c1"),
AIMessage(
content="Chiller 6 has 5 sensors.",
usage_metadata={"input_tokens": 150, "output_tokens": 30, "total_tokens": 180},
usage_metadata={
"input_tokens": 150,
"output_tokens": 30,
"total_tokens": 180,
},
),
]
}
Expand Down
4 changes: 3 additions & 1 deletion src/agent/openai_agent/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ async def _run(args: argparse.Namespace) -> None:

runner = OpenAIAgentRunner(model=args.model_id, max_turns=args.max_turns)
result = await runner.run(args.question)
print_result(result, show_trajectory=args.show_trajectory, output_json=args.output_json)
print_result(
result, show_trajectory=args.show_trajectory, output_json=args.output_json
)


def main() -> None:
Expand Down
23 changes: 17 additions & 6 deletions src/agent/openai_agent/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@

from openai import AsyncOpenAI

from agents import Agent, ModelProvider, OpenAIChatCompletionsModel, RunConfig, Runner, set_tracing_disabled
from agents import (
Agent,
ModelProvider,
OpenAIChatCompletionsModel,
RunConfig,
Runner,
set_tracing_disabled,
)
from agents.mcp import MCPServerStdio

from observability import agent_run_span, persist_trajectory
Expand Down Expand Up @@ -144,7 +151,9 @@ def _flush() -> None:
tc_id = getattr(raw, "call_id", "") or getattr(raw, "id", "") or ""
tc_args = getattr(raw, "arguments", "{}") or "{}"
try:
tc_input = json.loads(tc_args) if isinstance(tc_args, str) else tc_args
tc_input = (
json.loads(tc_args) if isinstance(tc_args, str) else tc_args
)
except (json.JSONDecodeError, TypeError):
tc_input = {"raw": tc_args}
tool_calls.append(ToolCall(name=tc_name, input=tc_input, id=tc_id))
Expand Down Expand Up @@ -258,8 +267,12 @@ async def run(self, question: str) -> AgentResult:
)

span.set_attribute("agent.answer.length", len(answer))
span.set_attribute("gen_ai.usage.input_tokens", trajectory.total_input_tokens)
span.set_attribute("gen_ai.usage.output_tokens", trajectory.total_output_tokens)
span.set_attribute(
"gen_ai.usage.input_tokens", trajectory.total_input_tokens
)
span.set_attribute(
"gen_ai.usage.output_tokens", trajectory.total_output_tokens
)
span.set_attribute("agent.turns", len(trajectory.turns))
span.set_attribute("agent.tool_calls", len(trajectory.all_tool_calls))
span.set_attribute(
Expand All @@ -277,5 +290,3 @@ async def run(self, question: str) -> AgentResult:
answer=answer,
trajectory=trajectory,
)


2 changes: 1 addition & 1 deletion src/agent/openai_agent/tests/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch
from unittest.mock import AsyncMock, patch

import pytest

Expand Down
10 changes: 6 additions & 4 deletions src/agent/plan_execute/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ async def execute_plan(self, plan: Plan, question: str) -> list[StepResult]:
)
schema = tool_schemas.get(step.server, {}).get(step.tool, "")
step_started = time.perf_counter()
result = await self.execute_step(step, context, question, tool_schema=schema)
result = await self.execute_step(
step, context, question, tool_schema=schema
)
result.duration_ms = (time.perf_counter() - step_started) * 1000
if result.success:
_log.info("Step %d OK.", step.step_number)
Expand Down Expand Up @@ -202,8 +204,7 @@ async def _resolve_args_with_llm(
f"Step {n}: {r.response}" for n, r in sorted(context.items())
)
prompt = (
_ARG_RESOLUTION_PROMPT
.replace("{question}", question)
_ARG_RESOLUTION_PROMPT.replace("{question}", question)
.replace("{task}", task)
.replace("{tool}", tool)
.replace("{tool_schema}", tool_schema or "(unknown)")
Expand All @@ -214,7 +215,8 @@ async def _resolve_args_with_llm(
if resolved is None:
_log.warning(
"Tool '%s': arg resolution returned no parseable JSON (response: %r…)",
tool, raw[:120],
tool,
raw[:120],
)
return {}
return resolved
Expand Down
5 changes: 2 additions & 3 deletions src/agent/plan_execute/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,7 @@ def generate(self, prompt: str, temperature: float = 0.0) -> str:
self.output_tokens += result.output_tokens
return result.text

def generate_with_usage(
self, prompt: str, temperature: float = 0.0
) -> LLMResult:
def generate_with_usage(self, prompt: str, temperature: float = 0.0) -> LLMResult:
result = self._inner.generate_with_usage(prompt, temperature)
self.input_tokens += result.input_tokens
self.output_tokens += result.output_tokens
Expand All @@ -62,6 +60,7 @@ def generate_with_usage(
def model_id(self) -> str:
return self._inner.model_id


_log = logging.getLogger(__name__)

_SUMMARIZE_PROMPT = """\
Expand Down
Loading