2222_DEFAULT_MAX_SIZE = 0
2323# Total attempts per batch for a *transient* failure (1 == no retry).
2424_DEFAULT_MAX_RETRIES = 1
25+ # Max number of batch-export HTTP requests in flight at once. The export
26+ # backend (EGP) processes each upsert_batch in ~150ms but serves many requests
27+ # concurrently; issuing one batch at a time caps per-pod egress at ~1/latency.
28+ # Sending several concurrently lets a pod keep up with span production under
29+ # load. ``1`` restores the old strictly-serial behavior.
30+ _DEFAULT_CONCURRENCY = 8
2531# HTTP statuses worth retrying at the queue level. These are explicit
2632# backpressure / transient signals; everything else (esp. 401/403/4xx auth and
2733# validation errors) is a permanent failure that re-enqueuing cannot fix. Note
@@ -79,15 +85,23 @@ class AsyncSpanQueue:
7985 """Background FIFO queue for async span processing.
8086
8187 Span events are enqueued synchronously (non-blocking) and drained by a
82- background task. Items are processed in batches: all START events in a
83- batch are flushed concurrently, then all END events, so that per-span
84- start-before-end ordering is preserved while HTTP calls for independent
85- spans execute in parallel.
86-
87- Once the drain loop picks up the first item, it lingers up to
88- ``linger_ms`` waiting for more items to coalesce into the same batch.
89- Without the linger the drain almost always returned size-1 batches under
90- real agent workloads, because spans typically arrive a few ms apart.
88+ background task. The drain coalesces ready events into batches and
89+ *dispatches* each batch's export as its own task, so up to ``concurrency``
90+ batch requests can be in flight at once. This matters because each
91+ ``upsert_batch`` HTTP call takes tens-to-hundreds of ms server-side; issuing
92+ them one at a time caps a pod's egress at ~1/latency and lets a backlog
93+ build under load.
94+
95+ Ordering guarantee: a span's START export always completes before its END
96+ export is issued. END batches wait on the START batches that were in flight
97+ when they were formed; because a span's START is always enqueued before its
98+ END, that span's START send is either still in flight (and waited on) or
99+ already finished. Independent spans export fully concurrently.
100+
101+ Once the drain loop picks up the first item, it lingers up to ``linger_ms``
102+ waiting for more items to coalesce into the same batch. Without the linger
103+ the drain almost always returned size-1 batches under real agent workloads,
104+ because spans typically arrive a few ms apart.
91105
92106 Reliability:
93107 - ``max_size`` bounds the queue. When full, new events are dropped and
@@ -104,6 +118,7 @@ def __init__(
104118 linger_ms : int | None = None ,
105119 max_size : int | None = None ,
106120 max_retries : int | None = None ,
121+ concurrency : int | None = None ,
107122 ) -> None :
108123 resolved_max_size = (
109124 _read_int_env ("AGENTEX_SPAN_QUEUE_MAX_SIZE" , _DEFAULT_MAX_SIZE ) if max_size is None else max (0 , max_size )
@@ -118,6 +133,17 @@ def __init__(
118133 if max_retries is None
119134 else max (1 , max_retries )
120135 )
136+ self ._concurrency = (
137+ _read_int_env ("AGENTEX_SPAN_QUEUE_CONCURRENCY" , _DEFAULT_CONCURRENCY , minimum = 1 )
138+ if concurrency is None
139+ else max (1 , concurrency )
140+ )
141+ # Bounds concurrent export HTTP requests.
142+ self ._send_sema = asyncio .Semaphore (self ._concurrency )
143+ # Outstanding dispatched send tasks, and the subset that are START
144+ # sends (END sends wait on these to preserve per-span ordering).
145+ self ._inflight : set [asyncio .Task [None ]] = set ()
146+ self ._inflight_starts : set [asyncio .Task [None ]] = set ()
121147 # Total spans dropped for any reason (full queue, shutdown, permanent
122148 # failure, exhausted retries). Surfaced for metrics/observability so
123149 # span loss stops being silent.
@@ -186,6 +212,11 @@ def _ensure_drain_running(self) -> None:
186212
187213 async def _drain_loop (self ) -> None :
188214 while True :
215+ # Backpressure: cap the number of in-flight send tasks so the drain
216+ # does not run unboundedly ahead of the exporters.
217+ while len (self ._inflight ) >= self ._concurrency :
218+ await asyncio .wait (set (self ._inflight ), return_when = asyncio .FIRST_COMPLETED )
219+
189220 # Block until at least one item is available.
190221 first = await self ._queue .get ()
191222 batch : list [_SpanQueueItem ] = [first ]
@@ -213,39 +244,59 @@ async def _drain_loop(self) -> None:
213244 except asyncio .QueueEmpty :
214245 break
215246
216- try :
217- _metrics .record_batch_coalesced (
218- queue_depth = self ._queue .qsize () + len (batch ),
219- batch_items = batch ,
220- )
247+ _metrics .record_batch_coalesced (
248+ queue_depth = self ._queue .qsize () + len (batch ),
249+ batch_items = batch ,
250+ )
221251
222- # Separate START and END events. Processing all STARTs before
223- # ENDs ensures that on_span_start completes before on_span_end
224- # for any span whose both events land in the same batch.
225- starts = [i for i in batch if i .event_type == SpanEventType .START ]
226- ends = [i for i in batch if i .event_type == SpanEventType .END ]
227-
228- if starts :
229- phase_start = time .perf_counter ()
230- await self ._process_items (starts )
231- _metrics .record_batch_phase (
232- phase = "start" ,
233- size = len (starts ),
234- duration_ms = (time .perf_counter () - phase_start ) * 1000.0 ,
235- )
236- if ends :
237- phase_start = time .perf_counter ()
238- await self ._process_items (ends )
239- _metrics .record_batch_phase (
240- phase = "end" ,
241- size = len (ends ),
242- duration_ms = (time .perf_counter () - phase_start ) * 1000.0 ,
243- )
244- finally :
245- for _ in batch :
246- self ._queue .task_done ()
247- # Release span data for GC.
248- batch .clear ()
252+ # Separate START and END events and dispatch each as its own send
253+ # task. Dispatching STARTs first (so they are registered before the
254+ # END snapshot) guarantees an END never outruns a START of the same
255+ # span whose events land in this batch.
256+ starts = [i for i in batch if i .event_type == SpanEventType .START ]
257+ ends = [i for i in batch if i .event_type == SpanEventType .END ]
258+ if starts :
259+ self ._dispatch (starts , SpanEventType .START )
260+ if ends :
261+ # Re-check backpressure before the second dispatch so a batch
262+ # carrying both event types can't push _inflight past the cap.
263+ while len (self ._inflight ) >= self ._concurrency :
264+ await asyncio .wait (set (self ._inflight ), return_when = asyncio .FIRST_COMPLETED )
265+ self ._dispatch (ends , SpanEventType .END )
266+
def _dispatch(self, items: list[_SpanQueueItem], event_type: SpanEventType) -> None:
    """Start a background send task for ``items`` and track it as in flight.

    For an END batch, the START sends that are in flight at this moment are
    captured and handed to the send task as a barrier; the task waits for them
    before exporting, which keeps a span's START ahead of its END. START
    batches get an empty barrier.

    Args:
        items: Same-event-type queue items to export together.
        event_type: Whether ``items`` are START or END events.
    """
    # Snapshot the in-flight START sends before the new task is created.
    if event_type == SpanEventType.END:
        pending_starts = tuple(self._inflight_starts)
    else:
        pending_starts = ()
    send_task = asyncio.create_task(self._run_send(items, pending_starts))

    # Every send is tracked in ``_inflight``; START sends are additionally
    # tracked in ``_inflight_starts`` so later END batches can wait on them.
    registries = [self._inflight]
    if event_type == SpanEventType.START:
        registries.append(self._inflight_starts)
    for registry in registries:
        registry.add(send_task)
        # Drop the task from the registry as soon as it finishes.
        send_task.add_done_callback(registry.discard)
async def _run_send(self, items: list[_SpanQueueItem], barrier: tuple[asyncio.Task[None], ...]) -> None:
    """Export one dispatched batch, then mark its items done on the queue.

    Args:
        items: Same-event-type queue items, exported through a single
            ``_process_items`` call.
        barrier: Send tasks that must finish before this batch is issued.
            Empty when there is nothing to wait for.

    Unexpected exceptions are logged rather than propagated: this coroutine
    runs as a fire-and-forget task, so an escaping exception would otherwise
    only surface as asyncio's "Task exception was never retrieved" warning.
    Cancellation still propagates to the caller.
    """
    try:
        if barrier:
            # Wait for the sends this batch depends on. Only completion
            # matters, not their outcome. ``asyncio.wait`` is used instead of
            # ``asyncio.gather`` because cancelling a pending ``gather`` also
            # cancels the tasks it is waiting on, and cancelling this send
            # must not abort the sends it is merely ordered after.
            await asyncio.wait(barrier)
        # Timing starts after the barrier, so the reported duration excludes
        # the time spent waiting on earlier sends.
        phase_start = time.perf_counter()
        await self._process_items(items)
        if items:
            _metrics.record_batch_phase(
                phase=items[0].event_type.value,
                size=len(items),
                duration_ms=(time.perf_counter() - phase_start) * 1000.0,
            )
    except Exception:
        # Task boundary: nothing awaits this task's result, so log here.
        logger.exception("Unexpected error while exporting a batch of %d span events", len(items))
    finally:
        # Mark every item done so shutdown's queue.join() can complete only
        # once all sends (and their retries) have finished.
        for _ in items:
            self._queue.task_done()
249300
250301 async def _process_items (self , items : list [_SpanQueueItem ]) -> None :
251302 """Dispatch a batch of same-event-type items to each processor in one call.
@@ -277,10 +328,12 @@ async def _handle(
277328 ) -> None :
278329 spans = [item .span for item in items ]
279330 try :
280- if event_type == SpanEventType .START :
281- await p .on_spans_start (spans )
282- else :
283- await p .on_spans_end (spans )
331+ # Hold a concurrency slot only for the duration of the HTTP call.
332+ async with self ._send_sema :
333+ if event_type == SpanEventType .START :
334+ await p .on_spans_start (spans )
335+ else :
336+ await p .on_spans_end (spans )
284337 except Exception as exc :
285338 self ._handle_failure (p , items , event_type , exc )
286339
@@ -334,7 +387,14 @@ def _handle_failure(
334387
335388 def _reenqueue (self , item : _SpanQueueItem , p : AsyncTracingProcessor ) -> None :
336389 """Put a single failed item back on the queue, scoped to the processor
337- that failed, with an incremented attempt count."""
390+ that failed, with an incremented attempt count.
391+
392+ NOTE: a re-enqueued START goes to the *back* of the queue. If an END
393+ for the same span is dispatched concurrently before this START is picked
394+ up again, the END's barrier snapshot won't contain it, breaking the
395+ START-before-END guarantee for that span. This is benign at the default
396+ ``max_retries=1`` (retries disabled) but must be addressed before
397+ enabling retries by default."""
338398 try :
339399 self ._queue .put_nowait (
340400 _SpanQueueItem (
@@ -354,11 +414,17 @@ def _reenqueue(self, item: _SpanQueueItem, p: AsyncTracingProcessor) -> None:
354414
355415 async def shutdown (self , timeout : float = 30.0 ) -> None :
356416 self ._stopping = True
357- if self ._queue .empty () and (self ._drain_task is None or self ._drain_task .done ()):
417+ drain_idle = self ._drain_task is None or self ._drain_task .done ()
418+ if self ._queue .empty () and drain_idle and not self ._inflight :
358419 return
420+
421+ timed_out = False
359422 try :
423+ # join() returns once every enqueued (and re-enqueued) item has been
424+ # marked done by its send task.
360425 await asyncio .wait_for (self ._queue .join (), timeout = timeout )
361426 except asyncio .TimeoutError :
427+ timed_out = True
362428 remaining = self ._queue .qsize ()
363429 logger .warning (
364430 "Span queue shutdown timed out after %.1fs with %d items remaining" , timeout , remaining
@@ -371,6 +437,15 @@ async def shutdown(self, timeout: float = 30.0) -> None:
371437 except asyncio .CancelledError :
372438 pass
373439
440+ # Clean up any in-flight send tasks. On a clean shutdown these are
441+ # already finishing; on timeout, cancel the stragglers so we don't hang.
442+ inflight = list (self ._inflight )
443+ if inflight :
444+ if timed_out :
445+ for task in inflight :
446+ task .cancel ()
447+ await asyncio .gather (* inflight , return_exceptions = True )
448+
374449
375450_default_span_queue : AsyncSpanQueue | None = None
376451
0 commit comments