Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 164 additions & 1 deletion src/agent/tools/telegram/media/__tests__/send-voice.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { existsSync, mkdtempSync, readdirSync, rmSync, writeFileSync } from "fs";
import { tmpdir } from "os";
import { join } from "path";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";

// Shared mock functions referenced by the vi.mock factories in this file.
// They are created through vi.hoisted so that they already exist when those
// factories run (vitest moves vi.mock calls above the regular imports).
const mocks = vi.hoisted(() => ({
generateSpeech: vi.fn(),
wavToOggOpus: vi.fn(),
validateReadPath: vi.fn(),
}));

vi.mock("../../../../../services/tts.js", () => ({
Expand All @@ -10,6 +15,18 @@ vi.mock("../../../../../services/tts.js", () => ({
generateSpeech: mocks.generateSpeech,
}));

// Replace the audio utility module with a stub exposing only wavToOggOpus,
// so each test decides what the WAV -> OGG/Opus transcoder returns or throws.
vi.mock("../../../../../utils/audio.js", () => ({
wavToOggOpus: mocks.wavToOggOpus,
}));

// Partial mock of the workspace module: every real export is kept and only
// validateReadPath is swapped for the test-controlled mock.
vi.mock("../../../../../workspace/index.js", async (importOriginal) => {
const actual = (await importOriginal()) as Record<string, unknown>;
return {
...actual,
validateReadPath: mocks.validateReadPath,
};
});

vi.mock("telegram", () => ({
Api: {
DocumentAttributeAudio: class {
Expand All @@ -27,14 +44,68 @@ vi.mock("telegram", () => ({

import { telegramSendVoiceExecutor } from "../send-voice.js";

const TTS_TEMP_DIR = join(tmpdir(), "teleton-tts");

/**
 * Produce an in-memory WAV file: a 44-byte RIFF/WAVE header describing
 * 16-bit mono PCM at 48 kHz, followed by `payload` as the `data` chunk.
 *
 * @param payload Raw bytes for the `data` chunk; defaults to four zero bytes.
 * @returns A buffer of `44 + payload.length` bytes.
 */
function buildWavBuffer(payload: Buffer = Buffer.from([0, 0, 0, 0])): Buffer {
  const headerLength = 44;
  const header = Buffer.alloc(headerLength);

  // RIFF container: the size field counts every byte after the first eight.
  header.write("RIFF", 0, "ascii");
  header.writeUInt32LE(headerLength - 8 + payload.length, 4);
  header.write("WAVE", 8, "ascii");

  // "fmt " chunk with a 16-byte body.
  header.write("fmt ", 12, "ascii");
  header.writeUInt32LE(16, 16);
  header.writeUInt16LE(1, 20); // PCM
  header.writeUInt16LE(1, 22); // mono
  header.writeUInt32LE(48000, 24); // sample rate
  header.writeUInt32LE(96000, 28); // byte rate
  header.writeUInt16LE(2, 32); // block align
  header.writeUInt16LE(16, 34); // bits per sample

  // "data" chunk header; the payload bytes follow immediately.
  header.write("data", 36, "ascii");
  header.writeUInt32LE(payload.length, 40);

  return Buffer.concat([header, payload]);
}

describe("telegramSendVoiceExecutor", () => {
let scratchDir: string;
let filesBeforeTest: Set<string>;

// Per-test setup: reset mock call history, create a private scratch directory,
// snapshot the shared TTS temp directory, and install default mock behaviour.
beforeEach(() => {
  vi.clearAllMocks();

  scratchDir = mkdtempSync(join(tmpdir(), "send-voice-test-"));

  // Record what is already in the shared temp dir so afterEach removes only
  // files that appeared during the test.
  const preexisting = existsSync(TTS_TEMP_DIR) ? readdirSync(TTS_TEMP_DIR) : [];
  filesBeforeTest = new Set(preexisting);

  mocks.generateSpeech.mockResolvedValue({
    filePath: "/tmp/teleton-test-voice.ogg",
    provider: "groq",
    voice: "diana",
  });
  mocks.wavToOggOpus.mockReturnValue(Buffer.from("ogg-bytes"));

  // Accept every path as-is and describe it as an existing regular file.
  mocks.validateReadPath.mockImplementation((p: string) => {
    const extension = p.endsWith(".wav") ? ".wav" : ".ogg";
    const filename = p.split("/").pop() ?? p;
    return {
      absolutePath: p,
      relativePath: p,
      exists: true,
      isDirectory: false,
      extension,
      filename,
    };
  });
});

// Per-test teardown: drop the scratch directory, then delete any file in the
// shared TTS temp directory that was not present before the test started.
afterEach(() => {
  rmSync(scratchDir, { recursive: true, force: true });

  if (existsSync(TTS_TEMP_DIR)) {
    const createdDuringTest = readdirSync(TTS_TEMP_DIR).filter(
      (entry) => !filesBeforeTest.has(entry)
    );

    createdDuringTest.forEach((entry) => {
      try {
        rmSync(join(TTS_TEMP_DIR, entry), { force: true });
      } catch {
        // Ignore cleanup errors from files already removed by the implementation.
      }
    });
  }
});

it("sends generated speech as an explicit Telegram voice note", async () => {
Expand Down Expand Up @@ -73,4 +144,96 @@ describe("telegramSendVoiceExecutor", () => {
})
);
});

it("auto-converts a WAV file passed via voicePath into OGG/Opus before sending", async () => {
  // Arrange: a real WAV file on disk and a stubbed Telegram client.
  const wavPath = join(scratchDir, "groq-output.wav");
  writeFileSync(wavPath, buildWavBuffer());
  const sendFile = vi.fn().mockResolvedValue({ id: 7, date: 1 });
  const params = { chatId: "chat2", voicePath: wavPath };
  const context = {
    config: { agent: { provider: "anthropic" } },
    bridge: {
      getClient: () => ({
        getClient: () => ({ sendFile }),
      }),
    },
  } as any;

  // Act
  const result = await telegramSendVoiceExecutor(params, context);

  // Assert: the transcoder ran exactly once on the file contents.
  expect(result.success).toBe(true);
  expect(mocks.wavToOggOpus).toHaveBeenCalledTimes(1);
  expect(mocks.wavToOggOpus).toHaveBeenCalledWith(expect.any(Buffer));

  // Assert: what reached Telegram is a different, .ogg-suffixed voice note.
  const [targetChat, sendOpts] = sendFile.mock.calls[0];
  expect(targetChat).toBe("chat2");
  expect(sendOpts.voiceNote).toBe(true);
  expect(sendOpts.file).toMatch(/\.ogg$/);
  expect(sendOpts.file).not.toBe(wavPath);
  // The transcoded temp file should be cleaned up after sending.
  expect(existsSync(sendOpts.file)).toBe(false);
});

it("leaves an existing OGG voicePath untouched (no double conversion)", async () => {
  // Arrange: a file that starts with the OGG magic, not a RIFF/WAVE header.
  const oggPath = join(scratchDir, "voice.ogg");
  writeFileSync(oggPath, Buffer.from("OggS\0\0\0\0..."));
  const sendFile = vi.fn().mockResolvedValue({ id: 8, date: 2 });
  const params = { chatId: "chat3", voicePath: oggPath };
  const context = {
    config: { agent: { provider: "anthropic" } },
    bridge: {
      getClient: () => ({
        getClient: () => ({ sendFile }),
      }),
    },
  } as any;

  // Act
  const result = await telegramSendVoiceExecutor(params, context);

  // Assert: no transcoding happened and the original path was sent as a voice note.
  expect(result.success).toBe(true);
  expect(mocks.wavToOggOpus).not.toHaveBeenCalled();
  const expectedOptions = expect.objectContaining({ file: oggPath, voiceNote: true });
  expect(sendFile).toHaveBeenCalledWith("chat3", expectedOptions);
});

it("surfaces a clear error when WAV transcoding fails", async () => {
  // Arrange: a valid WAV on disk, but the transcoder throws on its next call.
  const wavPath = join(scratchDir, "bad.wav");
  writeFileSync(wavPath, buildWavBuffer());
  mocks.wavToOggOpus.mockImplementationOnce(() => {
    throw new Error("opus encoder boom");
  });
  const sendFile = vi.fn();
  const params = { chatId: "chat4", voicePath: wavPath };
  const context = {
    config: { agent: { provider: "anthropic" } },
    bridge: {
      getClient: () => ({
        getClient: () => ({ sendFile }),
      }),
    },
  } as any;

  // Act
  const result = await telegramSendVoiceExecutor(params, context);

  // Assert: the failure is reported with the encoder's message, and nothing was sent.
  const expectedError = /Failed to convert WAV voice file to OGG\/Opus.*opus encoder boom/i;
  expect(result.success).toBe(false);
  expect(result.error).toMatch(expectedError);
  expect(sendFile).not.toHaveBeenCalled();
});
});
44 changes: 40 additions & 4 deletions src/agent/tools/telegram/media/send-voice.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
* telegram_send_voice - Send voice messages with optional TTS
*
* Two modes:
* 1. voicePath: Send existing audio file
* 1. voicePath: Send existing audio file (WAV is auto-converted to OGG/Opus)
* 2. text: Generate speech using TTS, then send
*/

import { Type } from "@sinclair/typebox";
import { Api } from "telegram";
import { unlinkSync } from "fs";
import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { randomUUID } from "crypto";
import type { Tool, ToolExecutor, ToolResult } from "../../types.js";
import {
generateSpeech,
Expand All @@ -17,6 +20,7 @@ import {
type TTSProvider,
} from "../../../../services/tts.js";
import { GROQ_TTS_VOICES } from "../../../../providers/groq/GroqTTSProvider.js";
import { wavToOggOpus } from "../../../../utils/audio.js";
import { validateReadPath, WorkspaceSecurityError } from "../../../../workspace/index.js";
import { getErrorMessage } from "../../../../utils/errors.js";
import { createLogger } from "../../../../utils/logger.js";
Expand All @@ -39,15 +43,16 @@ interface SendVoiceParams {
export const telegramSendVoiceTool: Tool = {
name: "telegram_send_voice",
description:
"Send a voice message. Either provide voicePath for an existing file, or text for TTS generation. Uses the configured TTS provider and voice from settings. Available providers: piper, edge, openai, elevenlabs, groq.",
"Send a voice message. Either provide voicePath for an existing file, or text for TTS generation. WAV inputs (e.g. raw Groq TTS output) are automatically transcoded to OGG/Opus so Telegram renders them as proper voice notes. Uses the configured TTS provider and voice from settings. Available providers: piper, edge, openai, elevenlabs, groq.",

parameters: Type.Object({
chatId: Type.String({
description: "The chat ID to send the voice message to",
}),
voicePath: Type.Optional(
Type.String({
description: "Local file path to voice/audio file (OGG, MP3). Use this OR text.",
description:
"Local file path to voice/audio file (OGG/Opus, WAV, MP3). WAV is auto-converted to OGG/Opus before sending (Telegram voice notes require OGG/Opus). Use this OR text.",
})
),
text: Type.Optional(
Expand Down Expand Up @@ -96,6 +101,15 @@ export const telegramSendVoiceTool: Tool = {
}),
};

/**
 * Detect a RIFF/WAVE container by its magic bytes: "RIFF" at offset 0 and
 * "WAVE" at offset 8. Only the container signature is checked; the audio
 * encoding inside the file is not inspected.
 *
 * The bytes are decoded as latin1 rather than ascii because Node's "ascii"
 * decoding clears the high bit of every byte, which would let bytes such as
 * 0xD2 0xC9 0xC6 0xC6 pass as "RIFF". latin1 maps each byte to exactly one
 * code point, so the comparison is an exact byte match.
 *
 * @param buffer File contents; only the first 12 bytes are examined.
 * @returns `true` when both markers are present, `false` otherwise
 *          (including for buffers shorter than 12 bytes).
 */
function isWavBuffer(buffer: Buffer): boolean {
  if (buffer.length < 12) return false;

  return (
    buffer.toString("latin1", 0, 4) === "RIFF" &&
    buffer.toString("latin1", 8, 12) === "WAVE"
  );
}

export const telegramSendVoiceExecutor: ToolExecutor<SendVoiceParams> = async (
params,
context
Expand Down Expand Up @@ -142,6 +156,28 @@ export const telegramSendVoiceExecutor: ToolExecutor<SendVoiceParams> = async (
}
throw error;
}

// Telegram only renders WAV as a generic document. If the caller passes
// a WAV file (e.g. raw Groq TTS output saved to disk), transcode it to
// OGG/Opus so the message becomes a proper voice note. OGG/Opus and
// other formats (MP3, M4A, …) are sent as-is.
try {
const buffer = readFileSync(audioPath);
if (isWavBuffer(buffer)) {
const tempDir = join(tmpdir(), "teleton-tts");
if (!existsSync(tempDir)) mkdirSync(tempDir, { recursive: true });
const oggPath = join(tempDir, `${randomUUID()}.ogg`);
const oggBuffer = wavToOggOpus(buffer);
writeFileSync(oggPath, oggBuffer);
audioPath = oggPath;
generatedFile = oggPath; // Mark transcoded file for cleanup
}
} catch (error) {
return {
success: false,
error: `Failed to convert WAV voice file to OGG/Opus: ${getErrorMessage(error)}`,
};
}
}

// TTS mode: generate speech from text
Expand Down
Loading