Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
so we need to patch the RealtimeSession class directly to add OpenTelemetry tracing.
"""

import json
import logging
import time
from typing import Dict, Any, Optional, List, Tuple
Expand Down Expand Up @@ -258,7 +259,7 @@ def start_audio_span(self, item_id: str, content_index: int):
kind=SpanKind.CLIENT,
context=parent_context,
attributes={
SpanAttributes.LLM_REQUEST_TYPE: "realtime",
GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GenAIAttributes.GEN_AI_SYSTEM: "openai",
},
)
Expand Down Expand Up @@ -351,8 +352,7 @@ def create_llm_span(self, completion_content: str):
context=parent_context,
start_time=start_time,
attributes={
SpanAttributes.LLM_REQUEST_TYPE: "realtime",
SpanAttributes.LLM_SYSTEM: "openai",
GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

GenAIAttributes.GEN_AI_SYSTEM: "openai",
GenAIAttributes.GEN_AI_REQUEST_MODEL: model_name_str,
},
Expand All @@ -373,21 +373,14 @@ def create_llm_span(self, completion_content: str):

if should_send_prompts():
if prompt_content:
input_messages = [{"role": prompt_role or "user", "content": prompt_content}]
span.set_attribute(
f"{GenAIAttributes.GEN_AI_PROMPT}.0.role", prompt_role or "user"
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_PROMPT}.0.content", prompt_content
GenAIAttributes.GEN_AI_INPUT_MESSAGES, json.dumps(input_messages)
)

output_messages = [{"role": "assistant", "content": completion_content, "finish_reason": "stop"}]
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role", "assistant"
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content", completion_content
)
span.set_attribute(
f"{GenAIAttributes.GEN_AI_COMPLETION}.0.finish_reason", "stop"
GenAIAttributes.GEN_AI_OUTPUT_MESSAGES, json.dumps(output_messages)
)

span.set_status(Status(StatusCode.OK))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ requires-python = ">=3.10,<4"
dependencies = [
"opentelemetry-api>=1.38.0,<2",
"opentelemetry-instrumentation>=0.59b0",
"opentelemetry-semantic-conventions-ai>=0.4.13,<0.5.0",
"opentelemetry-semantic-conventions-ai>=0.5.0,<0.6.0",
"opentelemetry-semantic-conventions>=0.59b0",
]

Expand Down Expand Up @@ -74,3 +74,6 @@ select = ["E", "F", "W"]

[tool.uv]
constraint-dependencies = ["urllib3>=2.6.3", "pip>=25.3"]

[tool.uv.sources]
opentelemetry-semantic-conventions-ai = { path = "../opentelemetry-semantic-conventions-ai", editable = true }
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import pytest
from unittest.mock import MagicMock
from opentelemetry.instrumentation.openai_agents import (
Expand Down Expand Up @@ -49,11 +50,11 @@ def test_dict_content_serialization(exporter):

spans = exporter.get_finished_spans()

# Look for any spans with prompt/content attributes
# Look for any spans with message content attributes
for span in spans:
for attr_name, attr_value in span.attributes.items():
prompt_content_check = ("prompt" in attr_name and "content" in attr_name) or (
"gen_ai.prompt" in attr_name and "content" in attr_name
prompt_content_check = (
attr_name in ("gen_ai.input.messages", "gen_ai.output.messages")
)
if prompt_content_check:
# All content attributes should be strings, not dicts
Expand Down Expand Up @@ -94,39 +95,38 @@ def test_agent_spans(exporter, test_agent):
assert agent_span.status.status_code == StatusCode.OK

# Agent span should NOT contain LLM parameters
assert SpanAttributes.LLM_REQUEST_TEMPERATURE not in agent_span.attributes
assert SpanAttributes.LLM_REQUEST_MAX_TOKENS not in agent_span.attributes
assert SpanAttributes.LLM_REQUEST_TOP_P not in agent_span.attributes
assert "openai.agent.model.frequency_penalty" not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_TOP_P not in agent_span.attributes
assert GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY not in agent_span.attributes

# Find the response span (openai.response) - this should contain prompts/completions/usage
response_spans = [s for s in spans if s.name == "openai.response"]
assert len(response_spans) >= 1, f"Expected at least 1 openai.response span, got {len(response_spans)}"
response_span = response_spans[0]

# Test response span attributes (should contain prompts/completions/usage)

# Test proper semantic conventions
assert response_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "response"
assert response_span.attributes["gen_ai.operation.name"] == "response"
assert response_span.attributes["gen_ai.system"] == "openai"
assert response_span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] == "response"
assert response_span.attributes[GenAIAttributes.GEN_AI_SYSTEM] == "openai"

# Test prompts using OpenAI semantic conventions
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.role"] == "user"
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.content"] == "What is AI?"
# Test input messages (JSON array)
input_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES])
assert input_messages[0]["role"] == "user"
assert input_messages[0]["content"] == "What is AI?"

# Test usage tokens
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] is not None
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] is not None
assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] is not None
assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] is not None
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] > 0
assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] > 0
assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] > 0
assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] > 0

# Test completions using OpenAI semantic conventions
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"] is not None
assert len(response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"]) > 0
assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role"] is not None
# Test output messages (JSON array)
output_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_OUTPUT_MESSAGES])
assert output_messages[0]["content"] is not None
assert len(output_messages[0]["content"]) > 0
assert output_messages[0]["role"] is not None

# Test model settings are in the response span
assert response_span.attributes["gen_ai.request.temperature"] == 0.3
Expand Down Expand Up @@ -444,59 +444,39 @@ async def get_city_info(city_name: str) -> str:
second_response_span = response_spans[1]

# The tool call and result appear in the SECOND response span as part of conversation history
# Find the assistant message with tool call
# Parse the input messages JSON array
input_messages = json.loads(
second_response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES]
)

tool_call_found = False
tool_result_found = False

for i in range(20): # Check conversation history
role_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.role"
if role_key not in second_response_span.attributes:
continue

role = second_response_span.attributes[role_key]
for msg in input_messages:
role = msg.get("role")

if role == "assistant" and not tool_call_found:
# Check if this assistant message has tool_calls
tool_call_name_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.name"
if tool_call_name_key in second_response_span.attributes:
tool_calls = msg.get("tool_calls", [])
if tool_calls:
tool_call_found = True
# Verify tool call attributes
assert second_response_span.attributes[tool_call_name_key] == "get_city_info", (
f"Expected tool name 'get_city_info', got '{second_response_span.attributes[tool_call_name_key]}'"
)
# Verify tool call ID exists
tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.id"
assert tool_call_id_key in second_response_span.attributes, (
f"Tool call ID not found at {tool_call_id_key}"
assert tool_calls[0]["name"] == "get_city_info", (
f"Expected tool name 'get_city_info', got '{tool_calls[0]['name']}'"
)
tool_call_id = second_response_span.attributes[tool_call_id_key]
tool_call_id = tool_calls[0].get("id", "")
assert len(tool_call_id) > 0, "Tool call ID should not be empty"

# Verify arguments exist and contain city name
tool_call_args_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.arguments"
assert tool_call_args_key in second_response_span.attributes, (
f"Tool call arguments not found at {tool_call_args_key}"
)
arguments = second_response_span.attributes[tool_call_args_key]
arguments = tool_calls[0].get("arguments", "")
assert "London" in arguments or "london" in arguments.lower(), (
f"Expected 'London' in arguments, got: {arguments}"
)

elif role == "tool" and not tool_result_found:
tool_result_found = True
# Verify tool result attributes
content_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.content"
tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_call_id"

assert content_key in second_response_span.attributes, f"Tool result content not found at {content_key}"
content = second_response_span.attributes[content_key]
content = msg.get("content", "")
assert len(content) > 0, "Tool result content should not be empty"
assert "London" in content or "9000000" in content or "United Kingdom" in content, (
f"Expected tool result to contain city info, got: {content}"
)

assert tool_call_id_key in second_response_span.attributes, f"Tool call ID not found at {tool_call_id_key}"
tool_call_id = second_response_span.attributes[tool_call_id_key]
tool_call_id = msg.get("tool_call_id", "")
assert len(tool_call_id) > 0, "Tool call ID should not be empty"

assert tool_call_found, "No assistant message with tool_calls found in second response span"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.trace import StatusCode
from opentelemetry.semconv_ai import SpanAttributes
from opentelemetry.semconv._incubating.attributes import (
gen_ai_attributes as GenAIAttributes,
)
Expand Down Expand Up @@ -117,7 +116,6 @@ def test_speech_span_start_creates_otel_span(self, tracer_provider_and_exporter)
assert "openai.realtime.speech" in span_names

speech_span = next(s for s in spans if s.name == "openai.realtime.speech")
assert speech_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert speech_span.attributes["gen_ai.system"] == "openai"
assert speech_span.attributes["gen_ai.operation.name"] == "speech"
assert speech_span.status.status_code == StatusCode.OK
Expand Down Expand Up @@ -213,7 +211,6 @@ def test_transcription_span_start_creates_otel_span(self, tracer_provider_and_ex
assert "openai.realtime.transcription" in span_names

transcription_span = next(s for s in spans if s.name == "openai.realtime.transcription")
assert transcription_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert transcription_span.attributes["gen_ai.system"] == "openai"
assert transcription_span.attributes["gen_ai.operation.name"] == "transcription"

Expand Down Expand Up @@ -306,7 +303,6 @@ def test_speech_group_span_creates_otel_span(self, tracer_provider_and_exporter)
assert "openai.realtime.speech_group" in span_names

speech_group_span = next(s for s in spans if s.name == "openai.realtime.speech_group")
assert speech_group_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
assert speech_group_span.attributes["gen_ai.system"] == "openai"
assert speech_group_span.attributes["gen_ai.operation.name"] == "speech_group"
assert speech_group_span.status.status_code == StatusCode.OK
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Tests for realtime session instrumentation via wrapper patching."""

import json
import pytest
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
Expand Down Expand Up @@ -199,10 +200,12 @@ def test_record_completion_creates_llm_span(self, tracer, tracer_provider):
assert len(llm_spans) == 1

llm_span = llm_spans[0]
assert llm_span.attributes.get("gen_ai.prompt.0.role") == "user"
assert llm_span.attributes.get("gen_ai.prompt.0.content") == "What is the weather?"
assert llm_span.attributes.get("gen_ai.completion.0.role") == "assistant"
assert llm_span.attributes.get("gen_ai.completion.0.content") == "The weather is sunny."
input_msgs = json.loads(llm_span.attributes.get("gen_ai.input.messages"))
assert input_msgs[0]["role"] == "user"
assert input_msgs[0]["content"] == "What is the weather?"
output_msgs = json.loads(llm_span.attributes.get("gen_ai.output.messages"))
assert output_msgs[0]["role"] == "assistant"
assert output_msgs[0]["content"] == "The weather is sunny."

def test_multiple_llm_spans(self, tracer, tracer_provider):
"""Test that multiple prompt/completion pairs create multiple LLM spans."""
Expand All @@ -229,12 +232,12 @@ def test_multiple_llm_spans(self, tracer, tracer_provider):
assert len(llm_spans) == 2

# First span should have "Hello" and "Hi there!"
assert llm_spans[0].attributes.get("gen_ai.prompt.0.content") == "Hello"
assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Hi there!"
assert json.loads(llm_spans[0].attributes.get("gen_ai.input.messages"))[0]["content"] == "Hello"
assert json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))[0]["content"] == "Hi there!"

# Second span should have "What is the weather?" and "It's sunny."
assert llm_spans[1].attributes.get("gen_ai.prompt.0.content") == "What is the weather?"
assert llm_spans[1].attributes.get("gen_ai.completion.0.content") == "It's sunny."
assert json.loads(llm_spans[1].attributes.get("gen_ai.input.messages"))[0]["content"] == "What is the weather?"
assert json.loads(llm_spans[1].attributes.get("gen_ai.output.messages"))[0]["content"] == "It's sunny."

def test_cleanup_ends_all_spans(self, tracer, tracer_provider):
"""Test that cleanup ends all remaining spans."""
Expand Down Expand Up @@ -584,7 +587,7 @@ def __init__(self, role, content):
spans = exporter.get_finished_spans()
llm_spans = [s for s in spans if s.name == "openai.realtime"]
assert len(llm_spans) == 1
assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Hi there!"
assert json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))[0]["content"] == "Hi there!"

def test_response_done_dict_captures_usage_and_completion(self, tracer, tracer_provider):
"""Test that response.done with dict data captures usage and completions."""
Expand Down Expand Up @@ -646,7 +649,7 @@ def test_response_done_dict_captures_usage_and_completion(self, tracer, tracer_p
llm_span = llm_spans[0]
assert llm_span.attributes.get("gen_ai.usage.input_tokens") == 42
assert llm_span.attributes.get("gen_ai.usage.output_tokens") == 18
assert llm_span.attributes.get("gen_ai.completion.0.content") == "It is sunny today."
assert json.loads(llm_span.attributes.get("gen_ai.output.messages"))[0]["content"] == "It is sunny today."

def test_response_done_without_usage_still_captures_completion(self, tracer, tracer_provider):
"""Test that completions are captured even when usage is absent from response.done."""
Expand Down Expand Up @@ -694,5 +697,6 @@ def test_response_done_without_usage_still_captures_completion(self, tracer, tra
spans = exporter.get_finished_spans()
llm_spans = [s for s in spans if s.name == "openai.realtime"]
assert len(llm_spans) == 1
assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Why did the chicken cross the road?"
output = json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))
assert output[0]["content"] == "Why did the chicken cross the road?"
assert llm_spans[0].attributes.get("gen_ai.usage.input_tokens") is None
Original file line number Diff line number Diff line change
Expand Up @@ -272,21 +272,21 @@ async def test_recipe_agents_hierarchy(exporter, recipe_agents):

# Verify each response span has prompts, completions, and usage
for i, response_span in enumerate(response_spans):
# Check for prompts
has_prompt = any(key.startswith("gen_ai.prompt.") for key in response_span.attributes.keys())
# Check for input messages (new JSON array format)
has_prompt = "gen_ai.input.messages" in response_span.attributes
assert has_prompt, (
f"Response span {i} should have prompt attributes, attributes: {dict(response_span.attributes)}"
)

# Check for completions
has_completion = any(key.startswith("gen_ai.completion.") for key in response_span.attributes.keys())
# Check for output messages (new JSON array format)
has_completion = "gen_ai.output.messages" in response_span.attributes
assert has_completion, (
f"Response span {i} should have completion attributes, attributes: {dict(response_span.attributes)}"
)

# Check for usage
has_usage = any(
key.startswith("gen_ai.usage.") or key.startswith("llm.usage.") for key in response_span.attributes.keys()
key.startswith("gen_ai.usage.") for key in response_span.attributes.keys()
)
assert has_usage, (
f"Response span {i} should have usage attributes, attributes: {dict(response_span.attributes)}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# ruff: noqa: F401, F403
"""
Semconv compliance tests re-used from opentelemetry-semantic-conventions-ai.

Ensures the installed semconv package has the expected constant values.
To add more compliance checks, update _testing.py in that package — not here.
"""
from opentelemetry.semconv_ai._testing import *
Loading
Loading