traceloop · max-deygin-traceloop · Mar 19, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/...metry-instrumentation-openai-agents/opentelemetry/instrumentation/openai_agents/_hooks.py b/...metry-instrumentation-openai-agents/opentelemetry/instrumentation/openai_agents/_hooks.py
diff --git a/...mentation-openai-agents/opentelemetry/instrumentation/openai_agents/_realtime_wrappers.py b/...mentation-openai-agents/opentelemetry/instrumentation/openai_agents/_realtime_wrappers.py
@@ -4,6 +4,7 @@
 so we need to patch the RealtimeSession class directly to add OpenTelemetry tracing.
 """
 
+import json
 import logging
 import time
 from typing import Dict, Any, Optional, List, Tuple
@@ -258,7 +259,7 @@ def start_audio_span(self, item_id: str, content_index: int):
             kind=SpanKind.CLIENT,
             context=parent_context,
             attributes={
-                SpanAttributes.LLM_REQUEST_TYPE: "realtime",
+                GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
                 GenAIAttributes.GEN_AI_SYSTEM: "openai",
             },
         )
@@ -351,8 +352,7 @@ def create_llm_span(self, completion_content: str):
             context=parent_context,
             start_time=start_time,
             attributes={
-                SpanAttributes.LLM_REQUEST_TYPE: "realtime",
-                SpanAttributes.LLM_SYSTEM: "openai",
+                GenAIAttributes.GEN_AI_OPERATION_NAME: "realtime",
                 GenAIAttributes.GEN_AI_SYSTEM: "openai",
                 GenAIAttributes.GEN_AI_REQUEST_MODEL: model_name_str,
             },
@@ -373,21 +373,14 @@ def create_llm_span(self, completion_content: str):
 
         if should_send_prompts():
             if prompt_content:
+                input_messages = [{"role": prompt_role or "user", "content": prompt_content}]
                 span.set_attribute(
-                    f"{GenAIAttributes.GEN_AI_PROMPT}.0.role", prompt_role or "user"
-                )
-                span.set_attribute(
-                    f"{GenAIAttributes.GEN_AI_PROMPT}.0.content", prompt_content
+                    GenAIAttributes.GEN_AI_INPUT_MESSAGES, json.dumps(input_messages)
                 )
 
+            output_messages = [{"role": "assistant", "content": completion_content, "finish_reason": "stop"}]
             span.set_attribute(
-                f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role", "assistant"
-            )
-            span.set_attribute(
-                f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content", completion_content
-            )
-            span.set_attribute(
-                f"{GenAIAttributes.GEN_AI_COMPLETION}.0.finish_reason", "stop"
+                GenAIAttributes.GEN_AI_OUTPUT_MESSAGES, json.dumps(output_messages)
             )
 
         span.set_status(Status(StatusCode.OK))

diff --git a/packages/opentelemetry-instrumentation-openai-agents/pyproject.toml b/packages/opentelemetry-instrumentation-openai-agents/pyproject.toml
@@ -12,7 +12,7 @@ requires-python = ">=3.10,<4"
 dependencies = [
   "opentelemetry-api>=1.38.0,<2",
   "opentelemetry-instrumentation>=0.59b0",
-  "opentelemetry-semantic-conventions-ai>=0.4.13,<0.5.0",
+  "opentelemetry-semantic-conventions-ai>=0.5.0,<0.6.0",
   "opentelemetry-semantic-conventions>=0.59b0",
 ]
 
@@ -74,3 +74,6 @@ select = ["E", "F", "W"]
 
 [tool.uv]
 constraint-dependencies = ["urllib3>=2.6.3", "pip>=25.3"]
+
+[tool.uv.sources]
+opentelemetry-semantic-conventions-ai = { path = "../opentelemetry-semantic-conventions-ai", editable = true }
diff --git a/packages/opentelemetry-instrumentation-openai-agents/tests/test_openai_agents.py b/packages/opentelemetry-instrumentation-openai-agents/tests/test_openai_agents.py
@@ -1,3 +1,4 @@
+import json
 import pytest
 from unittest.mock import MagicMock
 from opentelemetry.instrumentation.openai_agents import (
@@ -49,11 +50,11 @@ def test_dict_content_serialization(exporter):
 
     spans = exporter.get_finished_spans()
 
-    # Look for any spans with prompt/content attributes
+    # Look for any spans with message content attributes
     for span in spans:
         for attr_name, attr_value in span.attributes.items():
-            prompt_content_check = ("prompt" in attr_name and "content" in attr_name) or (
-                "gen_ai.prompt" in attr_name and "content" in attr_name
+            prompt_content_check = (
+                attr_name in ("gen_ai.input.messages", "gen_ai.output.messages")
             )
             if prompt_content_check:
                 # All content attributes should be strings, not dicts
@@ -94,39 +95,38 @@ def test_agent_spans(exporter, test_agent):
     assert agent_span.status.status_code == StatusCode.OK
 
     # Agent span should NOT contain LLM parameters
-    assert SpanAttributes.LLM_REQUEST_TEMPERATURE not in agent_span.attributes
-    assert SpanAttributes.LLM_REQUEST_MAX_TOKENS not in agent_span.attributes
-    assert SpanAttributes.LLM_REQUEST_TOP_P not in agent_span.attributes
-    assert "openai.agent.model.frequency_penalty" not in agent_span.attributes
+    assert GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE not in agent_span.attributes
+    assert GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS not in agent_span.attributes
+    assert GenAIAttributes.GEN_AI_REQUEST_TOP_P not in agent_span.attributes
+    assert GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY not in agent_span.attributes
 
     # Find the response span (openai.response) - this should contain prompts/completions/usage
     response_spans = [s for s in spans if s.name == "openai.response"]
     assert len(response_spans) >= 1, f"Expected at least 1 openai.response span, got {len(response_spans)}"
     response_span = response_spans[0]
 
-    # Test response span attributes (should contain prompts/completions/usage)
-
     # Test proper semantic conventions
-    assert response_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "response"
-    assert response_span.attributes["gen_ai.operation.name"] == "response"
-    assert response_span.attributes["gen_ai.system"] == "openai"
+    assert response_span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] == "response"
+    assert response_span.attributes[GenAIAttributes.GEN_AI_SYSTEM] == "openai"
 
-    # Test prompts using OpenAI semantic conventions
-    assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.role"] == "user"
-    assert response_span.attributes[f"{GenAIAttributes.GEN_AI_PROMPT}.0.content"] == "What is AI?"
+    # Test input messages (JSON array)
+    input_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES])
+    assert input_messages[0]["role"] == "user"
+    assert input_messages[0]["content"] == "What is AI?"
 
     # Test usage tokens
     assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] is not None
     assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] is not None
-    assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] is not None
+    assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] is not None
     assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] > 0
     assert response_span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] > 0
-    assert response_span.attributes[SpanAttributes.LLM_USAGE_TOTAL_TOKENS] > 0
+    assert response_span.attributes[SpanAttributes.GEN_AI_USAGE_TOTAL_TOKENS] > 0
 
-    # Test completions using OpenAI semantic conventions
-    assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"] is not None
-    assert len(response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.content"]) > 0
-    assert response_span.attributes[f"{GenAIAttributes.GEN_AI_COMPLETION}.0.role"] is not None
+    # Test output messages (JSON array)
+    output_messages = json.loads(response_span.attributes[GenAIAttributes.GEN_AI_OUTPUT_MESSAGES])
+    assert output_messages[0]["content"] is not None
+    assert len(output_messages[0]["content"]) > 0
+    assert output_messages[0]["role"] is not None
 
     # Test model settings are in the response span
     assert response_span.attributes["gen_ai.request.temperature"] == 0.3
@@ -444,59 +444,39 @@ async def get_city_info(city_name: str) -> str:
     second_response_span = response_spans[1]
 
     # The tool call and result appear in the SECOND response span as part of conversation history
-    # Find the assistant message with tool call
+    # Parse the input messages JSON array
+    input_messages = json.loads(
+        second_response_span.attributes[GenAIAttributes.GEN_AI_INPUT_MESSAGES]
+    )
+
     tool_call_found = False
     tool_result_found = False
 
-    for i in range(20):  # Check conversation history
-        role_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.role"
-        if role_key not in second_response_span.attributes:
-            continue
-
-        role = second_response_span.attributes[role_key]
+    for msg in input_messages:
+        role = msg.get("role")
 
         if role == "assistant" and not tool_call_found:
-            # Check if this assistant message has tool_calls
-            tool_call_name_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.name"
-            if tool_call_name_key in second_response_span.attributes:
+            tool_calls = msg.get("tool_calls", [])
+            if tool_calls:
                 tool_call_found = True
-                # Verify tool call attributes
-                assert second_response_span.attributes[tool_call_name_key] == "get_city_info", (
-                    f"Expected tool name 'get_city_info', got '{second_response_span.attributes[tool_call_name_key]}'"
-                )
-                # Verify tool call ID exists
-                tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.id"
-                assert tool_call_id_key in second_response_span.attributes, (
-                    f"Tool call ID not found at {tool_call_id_key}"
+                assert tool_calls[0]["name"] == "get_city_info", (
+                    f"Expected tool name 'get_city_info', got '{tool_calls[0]['name']}'"
                 )
-                tool_call_id = second_response_span.attributes[tool_call_id_key]
+                tool_call_id = tool_calls[0].get("id", "")
                 assert len(tool_call_id) > 0, "Tool call ID should not be empty"
-
-                # Verify arguments exist and contain city name
-                tool_call_args_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_calls.0.arguments"
-                assert tool_call_args_key in second_response_span.attributes, (
-                    f"Tool call arguments not found at {tool_call_args_key}"
-                )
-                arguments = second_response_span.attributes[tool_call_args_key]
+                arguments = tool_calls[0].get("arguments", "")
                 assert "London" in arguments or "london" in arguments.lower(), (
                     f"Expected 'London' in arguments, got: {arguments}"
                 )
 
         elif role == "tool" and not tool_result_found:
             tool_result_found = True
-            # Verify tool result attributes
-            content_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.content"
-            tool_call_id_key = f"{SpanAttributes.LLM_PROMPTS}.{i}.tool_call_id"
-
-            assert content_key in second_response_span.attributes, f"Tool result content not found at {content_key}"
-            content = second_response_span.attributes[content_key]
+            content = msg.get("content", "")
             assert len(content) > 0, "Tool result content should not be empty"
             assert "London" in content or "9000000" in content or "United Kingdom" in content, (
                 f"Expected tool result to contain city info, got: {content}"
             )
-
-            assert tool_call_id_key in second_response_span.attributes, f"Tool call ID not found at {tool_call_id_key}"
-            tool_call_id = second_response_span.attributes[tool_call_id_key]
+            tool_call_id = msg.get("tool_call_id", "")
             assert len(tool_call_id) > 0, "Tool call ID should not be empty"
 
     assert tool_call_found, "No assistant message with tool_calls found in second response span"

diff --git a/packages/opentelemetry-instrumentation-openai-agents/tests/test_realtime.py b/packages/opentelemetry-instrumentation-openai-agents/tests/test_realtime.py
@@ -11,7 +11,6 @@
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.trace import StatusCode
-from opentelemetry.semconv_ai import SpanAttributes
 from opentelemetry.semconv._incubating.attributes import (
     gen_ai_attributes as GenAIAttributes,
 )
@@ -117,7 +116,6 @@ def test_speech_span_start_creates_otel_span(self, tracer_provider_and_exporter)
         assert "openai.realtime.speech" in span_names
 
         speech_span = next(s for s in spans if s.name == "openai.realtime.speech")
-        assert speech_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
         assert speech_span.attributes["gen_ai.system"] == "openai"
         assert speech_span.attributes["gen_ai.operation.name"] == "speech"
         assert speech_span.status.status_code == StatusCode.OK
@@ -213,7 +211,6 @@ def test_transcription_span_start_creates_otel_span(self, tracer_provider_and_ex
         assert "openai.realtime.transcription" in span_names
 
         transcription_span = next(s for s in spans if s.name == "openai.realtime.transcription")
-        assert transcription_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
         assert transcription_span.attributes["gen_ai.system"] == "openai"
         assert transcription_span.attributes["gen_ai.operation.name"] == "transcription"
 
@@ -306,7 +303,6 @@ def test_speech_group_span_creates_otel_span(self, tracer_provider_and_exporter)
         assert "openai.realtime.speech_group" in span_names
 
         speech_group_span = next(s for s in spans if s.name == "openai.realtime.speech_group")
-        assert speech_group_span.attributes[SpanAttributes.LLM_REQUEST_TYPE] == "realtime"
         assert speech_group_span.attributes["gen_ai.system"] == "openai"
         assert speech_group_span.attributes["gen_ai.operation.name"] == "speech_group"
         assert speech_group_span.status.status_code == StatusCode.OK

diff --git a/packages/opentelemetry-instrumentation-openai-agents/tests/test_realtime_session.py b/packages/opentelemetry-instrumentation-openai-agents/tests/test_realtime_session.py
@@ -1,5 +1,6 @@
 """Tests for realtime session instrumentation via wrapper patching."""
 
+import json
 import pytest
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
@@ -199,10 +200,12 @@ def test_record_completion_creates_llm_span(self, tracer, tracer_provider):
         assert len(llm_spans) == 1
 
         llm_span = llm_spans[0]
-        assert llm_span.attributes.get("gen_ai.prompt.0.role") == "user"
-        assert llm_span.attributes.get("gen_ai.prompt.0.content") == "What is the weather?"
-        assert llm_span.attributes.get("gen_ai.completion.0.role") == "assistant"
-        assert llm_span.attributes.get("gen_ai.completion.0.content") == "The weather is sunny."
+        input_msgs = json.loads(llm_span.attributes.get("gen_ai.input.messages"))
+        assert input_msgs[0]["role"] == "user"
+        assert input_msgs[0]["content"] == "What is the weather?"
+        output_msgs = json.loads(llm_span.attributes.get("gen_ai.output.messages"))
+        assert output_msgs[0]["role"] == "assistant"
+        assert output_msgs[0]["content"] == "The weather is sunny."
 
     def test_multiple_llm_spans(self, tracer, tracer_provider):
         """Test that multiple prompt/completion pairs create multiple LLM spans."""
@@ -229,12 +232,12 @@ def test_multiple_llm_spans(self, tracer, tracer_provider):
         assert len(llm_spans) == 2
 
         # First span should have "Hello" and "Hi there!"
-        assert llm_spans[0].attributes.get("gen_ai.prompt.0.content") == "Hello"
-        assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Hi there!"
+        assert json.loads(llm_spans[0].attributes.get("gen_ai.input.messages"))[0]["content"] == "Hello"
+        assert json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))[0]["content"] == "Hi there!"
 
         # Second span should have "What is the weather?" and "It's sunny."
-        assert llm_spans[1].attributes.get("gen_ai.prompt.0.content") == "What is the weather?"
-        assert llm_spans[1].attributes.get("gen_ai.completion.0.content") == "It's sunny."
+        assert json.loads(llm_spans[1].attributes.get("gen_ai.input.messages"))[0]["content"] == "What is the weather?"
+        assert json.loads(llm_spans[1].attributes.get("gen_ai.output.messages"))[0]["content"] == "It's sunny."
 
     def test_cleanup_ends_all_spans(self, tracer, tracer_provider):
         """Test that cleanup ends all remaining spans."""
@@ -584,7 +587,7 @@ def __init__(self, role, content):
         spans = exporter.get_finished_spans()
         llm_spans = [s for s in spans if s.name == "openai.realtime"]
         assert len(llm_spans) == 1
-        assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Hi there!"
+        assert json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))[0]["content"] == "Hi there!"
 
     def test_response_done_dict_captures_usage_and_completion(self, tracer, tracer_provider):
         """Test that response.done with dict data captures usage and completions."""
@@ -646,7 +649,7 @@ def test_response_done_dict_captures_usage_and_completion(self, tracer, tracer_p
         llm_span = llm_spans[0]
         assert llm_span.attributes.get("gen_ai.usage.input_tokens") == 42
         assert llm_span.attributes.get("gen_ai.usage.output_tokens") == 18
-        assert llm_span.attributes.get("gen_ai.completion.0.content") == "It is sunny today."
+        assert json.loads(llm_span.attributes.get("gen_ai.output.messages"))[0]["content"] == "It is sunny today."
 
     def test_response_done_without_usage_still_captures_completion(self, tracer, tracer_provider):
         """Test that completions are captured even when usage is absent from response.done."""
@@ -694,5 +697,6 @@ def test_response_done_without_usage_still_captures_completion(self, tracer, tra
         spans = exporter.get_finished_spans()
         llm_spans = [s for s in spans if s.name == "openai.realtime"]
         assert len(llm_spans) == 1
-        assert llm_spans[0].attributes.get("gen_ai.completion.0.content") == "Why did the chicken cross the road?"
+        output = json.loads(llm_spans[0].attributes.get("gen_ai.output.messages"))
+        assert output[0]["content"] == "Why did the chicken cross the road?"
         assert llm_spans[0].attributes.get("gen_ai.usage.input_tokens") is None
diff --git a/packages/opentelemetry-instrumentation-openai-agents/tests/test_recipe_agents_hierarchy.py b/packages/opentelemetry-instrumentation-openai-agents/tests/test_recipe_agents_hierarchy.py
@@ -272,21 +272,21 @@ async def test_recipe_agents_hierarchy(exporter, recipe_agents):
 
     # Verify each response span has prompts, completions, and usage
     for i, response_span in enumerate(response_spans):
-        # Check for prompts
-        has_prompt = any(key.startswith("gen_ai.prompt.") for key in response_span.attributes.keys())
+        # Check for input messages (new JSON array format)
+        has_prompt = "gen_ai.input.messages" in response_span.attributes
         assert has_prompt, (
             f"Response span {i} should have prompt attributes, attributes: {dict(response_span.attributes)}"
         )
 
-        # Check for completions
-        has_completion = any(key.startswith("gen_ai.completion.") for key in response_span.attributes.keys())
+        # Check for output messages (new JSON array format)
+        has_completion = "gen_ai.output.messages" in response_span.attributes
         assert has_completion, (
             f"Response span {i} should have completion attributes, attributes: {dict(response_span.attributes)}"
         )
 
         # Check for usage
         has_usage = any(
-            key.startswith("gen_ai.usage.") or key.startswith("llm.usage.") for key in response_span.attributes.keys()
+            key.startswith("gen_ai.usage.") for key in response_span.attributes.keys()
         )
         assert has_usage, (
             f"Response span {i} should have usage attributes, attributes: {dict(response_span.attributes)}"

diff --git a/packages/opentelemetry-instrumentation-openai-agents/tests/test_semconv_compliance.py b/packages/opentelemetry-instrumentation-openai-agents/tests/test_semconv_compliance.py
@@ -0,0 +1,8 @@
+# ruff: noqa: F401, F403
+"""
+Semconv compliance tests re-used from opentelemetry-semantic-conventions-ai.
+
+They ensure that the installed semconv package has the expected constant values.
+To add more compliance checks, update _testing.py in that package — not here.
+"""
+from opentelemetry.semconv_ai._testing import *