(feat): address PR feedback — re-serialize span input at end_span and send it with the END event

levilentz · declan-scale · commit 31bb49f550a6 · 2026-04-21T10:00:13.000-04:00
diff --git a/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py b/src/agentex/lib/core/tracing/processors/sgp_tracing_processor.py
@@ -145,9 +145,6 @@ async def on_span_start(self, span: Span) -> None:
             items=[sgp_span.to_request_params()]
         )
 
-        # Input has been serialized and sent; clear it on the retained span to
-        # release memory.  on_span_end only needs output/metadata/end_time.
-        sgp_span.input = None  # type: ignore[assignment]
         self._spans[span.id] = sgp_span
 
     @override
@@ -158,6 +155,7 @@ async def on_span_end(self, span: Span) -> None:
             return
 
         self._add_source_to_span(span)
+        sgp_span.input = span.input  # type: ignore[assignment]
         sgp_span.output = span.output  # type: ignore[assignment]
         sgp_span.metadata = span.data  # type: ignore[assignment]
         sgp_span.end_time = span.end_time.isoformat()  # type: ignore[union-attr]
diff --git a/src/agentex/lib/core/tracing/trace.py b/src/agentex/lib/core/tracing/trace.py
@@ -109,7 +109,7 @@ def end_span(
         if span.end_time is None:
             span.end_time = datetime.now(UTC)
 
-        # input was already serialized at start_span; skip redundant re-serialization
+        span.input = recursive_model_dump(span.input) if span.input else None
         span.output = recursive_model_dump(span.output) if span.output else None
         span.data = recursive_model_dump(span.data) if span.data else None
 
@@ -252,17 +252,12 @@ async def end_span(
         if span.end_time is None:
             span.end_time = datetime.now(UTC)
 
-        # input was already serialized at start_span; skip redundant re-serialization
+        span.input = recursive_model_dump(span.input) if span.input else None
         span.output = recursive_model_dump(span.output) if span.output else None
         span.data = recursive_model_dump(span.data) if span.data else None
 
         if self.processors:
-            end_copy = span.model_copy(deep=True)
-            # input was already sent with the START event; drop it from the END
-            # copy to avoid retaining large payloads (system prompts, full
-            # conversation histories) in the async queue.
-            end_copy.input = None
-            self._span_queue.enqueue(SpanEventType.END, end_copy, self.processors)
+            self._span_queue.enqueue(SpanEventType.END, span.model_copy(deep=True), self.processors)
 
         return span
 
diff --git a/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py b/tests/lib/core/tracing/processors/test_sgp_tracing_processor.py
@@ -163,17 +163,28 @@ async def test_span_end_for_unknown_span_is_noop(self):
 
         assert len(processor._spans) == 0
 
-    async def test_sgp_span_input_cleared_after_start(self):
-        """After on_span_start sends the data, sgp_span.input should be None to release memory."""
+    async def test_sgp_span_input_updated_on_end(self):
+        """on_span_end should update sgp_span.input from the incoming span."""
         processor, _ = self._make_processor()
 
         with patch(f"{MODULE}.create_span", side_effect=lambda **kw: _make_mock_sgp_span()):
             span = _make_span()
-            span.input = {"system_prompt": "x" * 10_000}
+            span.input = {"messages": [{"role": "user", "content": "hello"}]}
             await processor.on_span_start(span)
 
         assert len(processor._spans) == 1
-        sgp_span = next(iter(processor._spans.values()))
-        assert sgp_span.input is None, (
-            "SGP span input should be cleared after upsert to release memory"
-        )
+
+        # Simulate modified input at end time
+        updated_input = {"messages": [
+            {"role": "user", "content": "hello"},
+            {"role": "assistant", "content": "hi"},
+        ]}
+        span.input = updated_input
+        span.output = {"response": "hi"}
+        span.end_time = datetime.now(UTC)
+        await processor.on_span_end(span)
+
+        # Span should be removed after end
+        assert len(processor._spans) == 0
+        # The end upsert should have been called
+        assert processor.sgp_async_client.spans.upsert_batch.call_count == 2  # start + end
diff --git a/tests/lib/core/tracing/test_span_queue.py b/tests/lib/core/tracing/test_span_queue.py
@@ -7,7 +7,7 @@
 from unittest.mock import AsyncMock, MagicMock, patch
 
 from agentex.types.span import Span
-from agentex.lib.core.tracing.span_queue import SpanEventType, AsyncSpanQueue, _SpanQueueItem
+from agentex.lib.core.tracing.span_queue import SpanEventType, AsyncSpanQueue
 
 
 def _make_span(span_id: str | None = None) -> Span:
@@ -260,8 +260,8 @@ async def record_end(span: Span) -> None:
         # Same span ID for both events
         assert call_log[0][1] == call_log[1][1]
 
-    async def test_end_event_drops_input(self):
-        """END event should NOT carry span.input — it was already sent at START."""
+    async def test_end_event_preserves_modified_input(self):
+        """END event should carry span.input so modifications after start are preserved."""
         start_spans: list[Span] = []
         end_spans: list[Span] = []
 
@@ -287,139 +287,26 @@ async def capture_end(span: Span) -> None:
             span_queue=queue,
         )
 
-        large_input = {"system_prompt": "x" * 10_000, "messages": [{"role": "user", "content": "hi"}]}
-        async with trace.span("llm-call", input=large_input) as span:
-            span.output = {"response": "hello"}
+        initial_input = {"messages": [{"role": "user", "content": "hello"}]}
+        async with trace.span("llm-call", input=initial_input) as span:
+            # Simulate modifying input after start (e.g. chatbot appending messages)
+            span.input["messages"].append({"role": "assistant", "content": "hi there"})
+            span.input["messages"].append({"role": "user", "content": "how are you?"})
+            span.output = {"response": "I'm good!"}
 
         await queue.shutdown()
 
         assert len(start_spans) == 1
         assert len(end_spans) == 1
 
-        # START should carry the full input
+        # START should carry the original input (serialized at start time)
         assert start_spans[0].input is not None
-        assert start_spans[0].input["system_prompt"] == "x" * 10_000
+        assert len(start_spans[0].input["messages"]) == 1  # only the original message
 
-        # END should have input=None (already sent at START)
-        assert end_spans[0].input is None
+        # END should carry the modified input (re-serialized at end time)
+        assert end_spans[0].input is not None
+        assert len(end_spans[0].input["messages"]) == 3  # all three messages
 
         # END should still carry output and end_time
         assert end_spans[0].output is not None
         assert end_spans[0].end_time is not None
-
-
-class TestMemoryUsage:
-    """Quantify that the fix actually reduces memory held by the tracing pipeline."""
-
-    async def test_end_events_use_less_memory_than_start_events(self):
-        """
-        Simulate N concurrent single-shot requests with large system prompts.
-        Collect what processors receive and measure serialized sizes.
-
-        Before the fix, START and END events were the same size (both carried
-        full input).  After the fix, END events should be dramatically smaller.
-        """
-        start_spans: list[Span] = []
-        end_spans: list[Span] = []
-
-        async def collect_start(span: Span) -> None:
-            start_spans.append(span)
-
-        async def collect_end(span: Span) -> None:
-            end_spans.append(span)
-
-        proc = _make_processor(
-            on_span_start=AsyncMock(side_effect=collect_start),
-            on_span_end=AsyncMock(side_effect=collect_end),
-        )
-        queue = AsyncSpanQueue()
-
-        from agentex.lib.core.tracing.trace import AsyncTrace
-
-        trace = AsyncTrace(
-            processors=[proc],
-            client=MagicMock(),
-            trace_id="test-trace",
-            span_queue=queue,
-        )
-
-        n_spans = 50
-        prompt_size = 100_000  # 100 KB system prompt per span
-        large_input = {"system_prompt": "x" * prompt_size}
-
-        for _ in range(n_spans):
-            span = await trace.start_span("llm-call", input=large_input)
-            span.output = {"response": "hello"}
-            await trace.end_span(span)
-
-        await queue.shutdown()
-
-        assert len(start_spans) == n_spans
-        assert len(end_spans) == n_spans
-
-        start_bytes = sum(len(s.model_dump_json()) for s in start_spans)
-        end_bytes = sum(len(s.model_dump_json()) for s in end_spans)
-
-        ratio = end_bytes / start_bytes
-        assert ratio < 0.05, (
-            f"END events used {ratio:.1%} of START event memory "
-            f"(start={start_bytes:,}B, end={end_bytes:,}B). "
-            f"Expected <5% because the ~{prompt_size:,}B input is dropped."
-        )
-
-    async def test_queue_payload_reduction_old_vs_new(self):
-        """
-        Directly compare data volume in the queue under old vs new behavior.
-
-        Simulates a backed-up queue (drain can't keep up with request rate)
-        holding N span lifecycles.  Old behavior: both START and END carry
-        full input.  New behavior: END events have input=None.
-
-        This mirrors what happens in K8s under concurrent load — items pile up
-        in the queue, and each one holds a serialized copy of the system prompt.
-        """
-        n_spans = 30
-        prompt_size = 200_000  # 200 KB system prompt
-
-        def _queue_payload_bytes(q: AsyncSpanQueue) -> int:
-            """Total serialized bytes of all spans sitting in the queue."""
-            return sum(len(item.span.model_dump_json()) for item in list(q._queue._queue))
-
-        large_input = {"system_prompt": "x" * prompt_size}
-
-        # --- OLD behavior: both START and END carry full input ---
-        old_queue = AsyncSpanQueue()
-        for _ in range(n_spans):
-            span = _make_span()
-            span.input = large_input
-            span.output = {"response": "ok"}
-            old_queue._queue.put_nowait(
-                _SpanQueueItem(SpanEventType.START, span.model_copy(deep=True), [])
-            )
-            old_queue._queue.put_nowait(
-                _SpanQueueItem(SpanEventType.END, span.model_copy(deep=True), [])
-            )
-
-        # --- NEW behavior: END events drop input ---
-        new_queue = AsyncSpanQueue()
-        for _ in range(n_spans):
-            span = _make_span()
-            span.input = large_input
-            span.output = {"response": "ok"}
-            new_queue._queue.put_nowait(
-                _SpanQueueItem(SpanEventType.START, span.model_copy(deep=True), [])
-            )
-            end_copy = span.model_copy(deep=True)
-            end_copy.input = None
-            new_queue._queue.put_nowait(
-                _SpanQueueItem(SpanEventType.END, end_copy, [])
-            )
-
-        old_bytes = _queue_payload_bytes(old_queue)
-        new_bytes = _queue_payload_bytes(new_queue)
-
-        savings_pct = 1.0 - (new_bytes / old_bytes)
-        assert savings_pct > 0.40, (
-            f"Expected >40% queue payload reduction, got {savings_pct:.0%} "
-            f"(old={old_bytes:,}B, new={new_bytes:,}B)"
-        )