Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 27 additions & 2 deletions src/agents/realtime/_util.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,44 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import Any

from .config import RealtimeAudioFormat

# Sample rate, in Hz, for PCM16 audio (the accompanying test computes PCM
# durations as bytes / (24_000 * 2)).
PCM16_SAMPLE_RATE_HZ = 24_000
# Bytes per PCM16 sample; a byte count is divided by this to obtain a sample count.
PCM16_SAMPLE_WIDTH_BYTES = 2
# Sample rate, in Hz, for G.711 audio; the length calculation divides the byte
# count by this value directly, i.e. it treats each byte as one sample.
G711_SAMPLE_RATE_HZ = 8_000


def _normalize_format_to_str(format: RealtimeAudioFormat | None) -> str | None:
    """Return a lower-cased format identifier for any ``RealtimeAudioFormat`` shape.

    ``RealtimeAudioFormat`` may be a string, a Mapping with a ``type`` key, or one
    of the typed ``AudioPCM`` / ``AudioPCMU`` / ``AudioPCMA`` models. Resolution
    order:

    1. ``None`` yields ``None``.
    2. A string is returned lower-cased.
    3. A Mapping's ``"type"`` entry, or any other object's ``type`` attribute, is
       returned lower-cased when it is a string.
    4. Otherwise the object's class name (including base classes) is matched
       against the known typed models, so that a model built without an explicit
       ``type`` (e.g. ``AudioPCMU()``) is still classified instead of silently
       being treated as an unknown format.

    Args:
        format: The audio format in any of the accepted shapes, or ``None``.

    Returns:
        The lower-cased identifier (for example ``"g711_ulaw"`` or
        ``"audio/pcmu"``), or ``None`` when no identifier can be determined.
    """
    if format is None:
        return None
    if isinstance(format, str):
        return format.lower()

    declared_type: Any
    if isinstance(format, Mapping):
        declared_type = format.get("type")
    else:
        declared_type = getattr(format, "type", None)
    if isinstance(declared_type, str):
        return declared_type.lower()

    # The typed models may leave ``type`` unset, in which case the class itself
    # is the only remaining signal of which codec the caller meant.
    identifier_by_class_name = {
        "AudioPCM": "audio/pcm",
        "AudioPCMU": "audio/pcmu",
        "AudioPCMA": "audio/pcma",
    }
    # Walk the MRO so subclasses of the typed models are classified as well.
    for klass in type(format).__mro__:
        identifier = identifier_by_class_name.get(klass.__name__)
        if identifier is not None:
            return identifier
    return None
Comment on lines +29 to +30
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Classify typed G.711 models even when type is omitted

AudioPCMU/AudioPCMA are accepted typed formats, but their generated type field is optional and defaults to None. Passing AudioPCMU() or AudioPCMA() still falls through to PCM math here because only the attribute value is inspected, so typed G.711 models are not reliably fixed unless callers redundantly set type.

Useful? React with 👍 / 👎.



def calculate_audio_length_ms(format: RealtimeAudioFormat | None, audio_bytes: bytes) -> float:
if not audio_bytes:
return 0.0

normalized_format = format.lower() if isinstance(format, str) else None
normalized_format = _normalize_format_to_str(format)

if normalized_format and normalized_format.startswith("g711"):
if normalized_format and (
normalized_format.startswith("g711") or normalized_format in ("audio/pcmu", "audio/pcma")
):
return (len(audio_bytes) / G711_SAMPLE_RATE_HZ) * 1000

samples = len(audio_bytes) / PCM16_SAMPLE_WIDTH_BYTES
Expand Down
51 changes: 51 additions & 0 deletions tests/realtime/test_playback_tracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,3 +178,54 @@ def test_audio_length_calculation_with_different_formats(self):
# Test None format (defaults to PCM)
none_length = calculate_audio_length_ms(None, pcm_bytes)
assert none_length == pytest.approx(expected_pcm, rel=0, abs=1e-6)

def test_audio_length_calculation_handles_typed_and_mapping_g711_formats(self):
    """Check that every g711 format shape is measured at the g711 sample rate.

    The audio format handed to ``RealtimePlaybackTracker.set_audio_format`` and
    ``ModelAudioTracker.set_audio_format`` is a ``RealtimeAudioFormat``: a string,
    a Mapping, or a typed ``AudioPCM`` / ``AudioPCMU`` / ``AudioPCMA`` model. This
    test feeds the same payload through each shape and verifies that the g711
    variants (typed models, Mappings, and ``audio/pcmu`` / ``audio/pcma``
    strings) produce the 8kHz duration, while typed PCM keeps the 24kHz PCM16
    duration. Falling back to PCM math for g711 audio gives a ~6x wrong duration
    and wrong truncation offsets on interrupt for SIP/Twilio sessions.
    """
    from openai.types.realtime.realtime_audio_formats import (
        AudioPCM,
        AudioPCMA,
        AudioPCMU,
    )

    from agents.realtime._util import calculate_audio_length_ms

    payload = b"x" * 80  # 10ms of audio at the 8kHz g711 rate
    byte_count = len(payload)
    g711_ms = (byte_count / 8_000) * 1000
    pcm_ms = (byte_count / (24_000 * 2)) * 1000

    cases = [
        # Typed g711 models resolve to the g711 sample rate.
        (AudioPCMU(type="audio/pcmu"), g711_ms),
        (AudioPCMA(type="audio/pcma"), g711_ms),
        # Typed PCM stays on the PCM path.
        (AudioPCM(type="audio/pcm", rate=24000), pcm_ms),
        # Mapping forms, as accepted by RealtimeAudioFormat.
        ({"type": "audio/pcmu"}, g711_ms),
        ({"type": "audio/pcma"}, g711_ms),
        # API-style ``audio/pcm*`` strings.
        ("audio/pcmu", g711_ms),
        ("audio/pcma", g711_ms),
    ]

    for audio_format, expected_ms in cases:
        measured_ms = calculate_audio_length_ms(audio_format, payload)
        assert measured_ms == pytest.approx(expected_ms, rel=0, abs=1e-6)
Loading