Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions livekit-agents/livekit/agents/stt/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ class SpeechData:
@dataclass
class RecognitionUsage:
    """Usage figures attached to a speech event as its ``recognition_usage``."""

    audio_duration: float
    """Incremental audio duration/usage in seconds"""


@dataclass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ async def process_stream(
],
) -> None:
has_started = False
last_usage_event_time: float = 0.0
async for resp in stream:
if resp.speech_event_type == (
cloud_speech_v2.StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
Expand Down Expand Up @@ -672,6 +673,16 @@ async def process_stream(
)
has_started = False

if (audio_duration := _get_audio_duration(resp, last_usage_event_time)) > 0:
self._event_ch.send_nowait(
stt.SpeechEvent(
type=stt.SpeechEventType.RECOGNITION_USAGE,
request_id=_get_request_id(resp),
recognition_usage=stt.RecognitionUsage(audio_duration=audio_duration),
)
)
last_usage_event_time += audio_duration

while True:
audio_pushed = False
try:
Expand Down Expand Up @@ -853,3 +864,32 @@ def _streaming_recognize_response_to_speech_data(
)

return data


def _get_audio_duration(
    resp: cloud_speech_v2.StreamingRecognizeResponse | cloud_speech_v1.StreamingRecognizeResponse,
    last_usage_event_time: float,
) -> float:
    """Return the audio duration not yet covered by ``last_usage_event_time``.

    The response's cumulative time is converted with ``_duration_to_seconds``
    and ``last_usage_event_time`` is subtracted from it. The result is zero or
    negative whenever that cumulative time does not exceed
    ``last_usage_event_time``; no clamping is applied here.

    Args:
        resp: A v1 or v2 streaming recognize response.
        last_usage_event_time: Duration already accounted for by the caller.

    Returns:
        Cumulative time reported by ``resp`` minus ``last_usage_event_time``.

    References:
    - https://docs.cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v1.types.StreamingRecognizeResponse
    - https://docs.cloud.google.com/speech-to-text/docs/reference/rest/v2/StreamingRecognitionResult
    """
    # The billed total is only set "if this is the last response in the stream";
    # until then, fall back to the speech event time/offset of the response.
    if isinstance(resp, cloud_speech_v2.StreamingRecognizeResponse):
        billed = resp.metadata.total_billed_duration
        cumulative = billed if billed else resp.speech_event_offset
    else:
        billed = resp.total_billed_time
        cumulative = billed if billed else resp.speech_event_time
    return _duration_to_seconds(cumulative) - last_usage_event_time


def _get_request_id(
    resp: cloud_speech_v2.StreamingRecognizeResponse | cloud_speech_v1.StreamingRecognizeResponse,
) -> str:
    """Return the request id carried by a v1 or v2 streaming response.

    v2 responses expose the id as ``resp.metadata.request_id``, which is
    returned as-is; any other response is treated as v1 and its top-level
    ``request_id`` is converted with ``str``.
    """
    is_v2 = isinstance(resp, cloud_speech_v2.StreamingRecognizeResponse)
    if not is_v2:
        return str(resp.request_id)
    return resp.metadata.request_id