xlabtg · xlabtg · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/src/agent/tools/telegram/media/__tests__/send-voice.test.ts b/src/agent/tools/telegram/media/__tests__/send-voice.test.ts
@@ -1,7 +1,12 @@
-import { beforeEach, describe, expect, it, vi } from "vitest";
+import { existsSync, mkdtempSync, readdirSync, rmSync, writeFileSync } from "fs";
+import { tmpdir } from "os";
+import { join } from "path";
+import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
 
 const mocks = vi.hoisted(() => ({
   generateSpeech: vi.fn(),
+  wavToOggOpus: vi.fn(), // stub wired into the utils/audio.js mock
+  validateReadPath: vi.fn(), // stub wired into the workspace/index.js mock
 }));
 
 vi.mock("../../../../../services/tts.js", () => ({
@@ -10,6 +15,18 @@ vi.mock("../../../../../services/tts.js", () => ({
   generateSpeech: mocks.generateSpeech,
 }));
 
+vi.mock("../../../../../utils/audio.js", () => ({
+  wavToOggOpus: mocks.wavToOggOpus, // the executor's encoder import resolves to this stub
+}));
+
+vi.mock("../../../../../workspace/index.js", async (importOriginal) => {
+  const actual = (await importOriginal()) as Record<string, unknown>;
+  return {
+    ...actual, // keep the real exports (send-voice.ts also imports WorkspaceSecurityError)
+    validateReadPath: mocks.validateReadPath, // only the path check is replaced
+  };
+});
+
 vi.mock("telegram", () => ({
   Api: {
     DocumentAttributeAudio: class {
@@ -27,14 +44,68 @@ vi.mock("telegram", () => ({
 
 import { telegramSendVoiceExecutor } from "../send-voice.js";
 
+const TTS_TEMP_DIR = join(tmpdir(), "teleton-tts"); // send-voice.ts writes transcoded files here
+
+/** Wrap `payload` in a canonical 44-byte RIFF/WAVE header (PCM, mono, 48 kHz, 16-bit). */
+function buildWavBuffer(payload: Buffer = Buffer.from([0, 0, 0, 0])): Buffer {
+  const HEADER_BYTES = 44;
+  const header = Buffer.alloc(HEADER_BYTES);
+  // RIFF descriptor: the size field counts every byte that follows it.
+  header.write("RIFF", 0, "ascii");
+  header.writeUInt32LE(HEADER_BYTES - 8 + payload.length, 4);
+  header.write("WAVE", 8, "ascii");
+  // "fmt " chunk: a 16-byte body describing the sample layout.
+  header.write("fmt ", 12, "ascii");
+  header.writeUInt32LE(16, 16);
+  header.writeUInt16LE(1, 20); // audio format: PCM
+  header.writeUInt16LE(1, 22); // channels: mono
+  header.writeUInt32LE(48000, 24); // sample rate (Hz)
+  header.writeUInt32LE(96000, 28); // byte rate = sample rate * block align
+  header.writeUInt16LE(2, 32); // block align (bytes per frame)
+  header.writeUInt16LE(16, 34); // bits per sample
+  // "data" chunk header; the samples themselves follow the header.
+  header.write("data", 36, "ascii");
+  header.writeUInt32LE(payload.length, 40);
+  return Buffer.concat([header, payload]);
+}
+
 describe("telegramSendVoiceExecutor", () => {
+  let scratchDir: string;
+  let filesBeforeTest: Set<string>;
+
   beforeEach(() => {
     vi.clearAllMocks();
+    scratchDir = mkdtempSync(join(tmpdir(), "send-voice-test-")); // per-test input directory
+    filesBeforeTest = new Set(existsSync(TTS_TEMP_DIR) ? readdirSync(TTS_TEMP_DIR) : []);
     mocks.generateSpeech.mockResolvedValue({
       filePath: "/tmp/teleton-test-voice.ogg",
       provider: "groq",
       voice: "diana",
     });
+    mocks.wavToOggOpus.mockReturnValue(Buffer.from("ogg-bytes")); // placeholder encoder output
+    mocks.validateReadPath.mockImplementation((p: string) => ({
+      absolutePath: p, // every path is accepted and passed through unchanged
+      relativePath: p,
+      exists: true,
+      isDirectory: false,
+      extension: p.endsWith(".wav") ? ".wav" : ".ogg",
+      filename: p.split("/").pop() ?? p,
+    }));
+  });
+
+  afterEach(() => {
+    rmSync(scratchDir, { recursive: true, force: true });
+    if (!existsSync(TTS_TEMP_DIR)) return;
+    // Delete only the TTS temp entries that were not present in the pre-test snapshot.
+    for (const filename of readdirSync(TTS_TEMP_DIR)) {
+      if (filesBeforeTest.has(filename)) continue;
+      // Best-effort removal of an entry that appeared while this test ran.
+      try {
+        rmSync(join(TTS_TEMP_DIR, filename), { force: true });
+      } catch {
+        // force:true already tolerates missing files; swallow any other cleanup error too.
+      }
+    }
   });
 
   it("sends generated speech as an explicit Telegram voice note", async () => {
@@ -73,4 +144,96 @@ describe("telegramSendVoiceExecutor", () => {
       })
     );
   });
+
+  it("auto-converts a WAV file passed via voicePath into OGG/Opus before sending", async () => {
+    // Arrange: a real RIFF/WAVE file on disk and a stubbed Telegram client.
+    const inputWav = join(scratchDir, "groq-output.wav");
+    writeFileSync(inputWav, buildWavBuffer());
+    const sendFile = vi.fn().mockResolvedValue({ id: 7, date: 1 });
+    const toolContext = {
+      config: { agent: { provider: "anthropic" } },
+      bridge: {
+        getClient: () => ({
+          getClient: () => ({ sendFile }),
+        }),
+      },
+    } as any;
+
+    // Act
+    const outcome = await telegramSendVoiceExecutor(
+      { chatId: "chat2", voicePath: inputWav },
+      toolContext
+    );
+
+    // Assert: the encoder ran exactly once and received a Buffer...
+    expect(outcome.success).toBe(true);
+    expect(mocks.wavToOggOpus).toHaveBeenCalledTimes(1);
+    expect(mocks.wavToOggOpus).toHaveBeenCalledWith(expect.any(Buffer));
+    // ...and the voice note was sent from a transcoded .ogg, not from the original WAV.
+    const [targetChat, sentOptions] = sendFile.mock.calls[0];
+    expect(targetChat).toBe("chat2");
+    expect(sentOptions.voiceNote).toBe(true);
+    expect(sentOptions.file).toMatch(/\.ogg$/);
+    expect(sentOptions.file).not.toBe(inputWav);
+    // The transcoded temp file should be cleaned up after sending.
+    expect(existsSync(sentOptions.file)).toBe(false);
+  });
+
+  it("leaves an existing OGG voicePath untouched (no double conversion)", async () => {
+    // Arrange: the file starts with "OggS", so it does not carry the RIFF/WAVE magic.
+    const existingOgg = join(scratchDir, "voice.ogg");
+    writeFileSync(existingOgg, Buffer.from("OggS\0\0\0\0..."));
+
+    const sendFile = vi.fn().mockResolvedValue({ id: 8, date: 2 });
+    const toolContext = {
+      config: { agent: { provider: "anthropic" } },
+      bridge: {
+        getClient: () => ({
+          getClient: () => ({ sendFile }),
+        }),
+      },
+    } as any;
+
+    // Act
+    const outcome = await telegramSendVoiceExecutor(
+      { chatId: "chat3", voicePath: existingOgg },
+      toolContext
+    );
+
+    // Assert: no transcoding happened and the very same path was handed to sendFile.
+    expect(outcome.success).toBe(true);
+    expect(mocks.wavToOggOpus).not.toHaveBeenCalled();
+    const expectedOptions = expect.objectContaining({ file: existingOgg, voiceNote: true });
+    expect(sendFile).toHaveBeenCalledWith("chat3", expectedOptions);
+  });
+
+  it("surfaces a clear error when WAV transcoding fails", async () => {
+    // Arrange: a valid WAV on disk, but the encoder stub throws on its next call.
+    const inputWav = join(scratchDir, "bad.wav");
+    writeFileSync(inputWav, buildWavBuffer());
+    mocks.wavToOggOpus.mockImplementationOnce(() => {
+      throw new Error("opus encoder boom");
+    });
+    const sendFile = vi.fn();
+    const toolContext = {
+      config: { agent: { provider: "anthropic" } },
+      bridge: {
+        getClient: () => ({
+          getClient: () => ({ sendFile }),
+        }),
+      },
+    } as any;
+
+    // Act
+    const outcome = await telegramSendVoiceExecutor(
+      { chatId: "chat4", voicePath: inputWav },
+      toolContext
+    );
+
+    // Assert: a failed result naming the conversion step and its cause; nothing is sent.
+    expect(outcome.success).toBe(false);
+    const expectedError = /Failed to convert WAV voice file to OGG\/Opus.*opus encoder boom/i;
+    expect(outcome.error).toMatch(expectedError);
+    expect(sendFile).not.toHaveBeenCalled();
+  });
 });
diff --git a/src/agent/tools/telegram/media/send-voice.ts b/src/agent/tools/telegram/media/send-voice.ts
@@ -2,13 +2,16 @@
  * telegram_send_voice - Send voice messages with optional TTS
  *
  * Two modes:
- * 1. voicePath: Send existing audio file
+ * 1. voicePath: Send existing audio file (WAV is auto-converted to OGG/Opus)
  * 2. text: Generate speech using TTS, then send
  */
 
 import { Type } from "@sinclair/typebox";
 import { Api } from "telegram";
-import { unlinkSync } from "fs";
+import { existsSync, mkdirSync, readFileSync, unlinkSync, writeFileSync } from "fs";
+import { join } from "path";
+import { tmpdir } from "os";
+import { randomUUID } from "crypto";
 import type { Tool, ToolExecutor, ToolResult } from "../../types.js";
 import {
   generateSpeech,
@@ -17,6 +20,7 @@ import {
   type TTSProvider,
 } from "../../../../services/tts.js";
 import { GROQ_TTS_VOICES } from "../../../../providers/groq/GroqTTSProvider.js";
+import { wavToOggOpus } from "../../../../utils/audio.js";
 import { validateReadPath, WorkspaceSecurityError } from "../../../../workspace/index.js";
 import { getErrorMessage } from "../../../../utils/errors.js";
 import { createLogger } from "../../../../utils/logger.js";
@@ -39,15 +43,16 @@ interface SendVoiceParams {
 export const telegramSendVoiceTool: Tool = {
   name: "telegram_send_voice",
   description:
-    "Send a voice message. Either provide voicePath for an existing file, or text for TTS generation. Uses the configured TTS provider and voice from settings. Available providers: piper, edge, openai, elevenlabs, groq.",
+    "Send a voice message. Either provide voicePath for an existing file, or text for TTS generation. WAV inputs (e.g. raw Groq TTS output) are automatically transcoded to OGG/Opus so Telegram renders them as proper voice notes. Uses the configured TTS provider and voice from settings. Available providers: piper, edge, openai, elevenlabs, groq.",
 
   parameters: Type.Object({
     chatId: Type.String({
       description: "The chat ID to send the voice message to",
     }),
     voicePath: Type.Optional(
       Type.String({
-        description: "Local file path to voice/audio file (OGG, MP3). Use this OR text.",
+        description:
+          "Local file path to voice/audio file (OGG/Opus, WAV, MP3). WAV is auto-converted to OGG/Opus before sending (Telegram voice notes require OGG/Opus). Use this OR text.",
       })
     ),
     text: Type.Optional(
@@ -96,6 +101,15 @@ export const telegramSendVoiceTool: Tool = {
   }),
 };
 
+/** Detect a WAV container by its RIFF/WAVE magic bytes (offsets 0-3 and 8-11). */
+function isWavBuffer(buffer: Buffer): boolean {
+  if (buffer.length < 12) return false; // too short to hold the 12-byte RIFF descriptor
+  // Decode as latin1, not ascii: Node's "ascii" decoder clears the high bit of each byte,
+  // so unrelated bytes such as D2 C9 C6 C6 would otherwise be accepted as "RIFF".
+  const magic = buffer.toString("latin1", 0, 12);
+  return magic.startsWith("RIFF") && magic.endsWith("WAVE");
+}
+
 export const telegramSendVoiceExecutor: ToolExecutor<SendVoiceParams> = async (
   params,
   context
@@ -142,6 +156,28 @@ export const telegramSendVoiceExecutor: ToolExecutor<SendVoiceParams> = async (
         }
         throw error;
       }
+
+      // Telegram renders a WAV upload only as a generic document, not as a voice note.
+      // If the caller passes a WAV file (e.g. raw Groq TTS output saved to disk),
+      // transcode it to OGG/Opus so the message becomes a proper voice note.
+      // OGG/Opus and other formats (MP3, M4A, …) are sent as-is.
+      try {
+        const buffer = readFileSync(audioPath);
+        if (isWavBuffer(buffer)) {
+          const tempDir = join(tmpdir(), "teleton-tts");
+          if (!existsSync(tempDir)) mkdirSync(tempDir, { recursive: true });
+          const oggPath = join(tempDir, `${randomUUID()}.ogg`);
+          const oggBuffer = wavToOggOpus(buffer);
+          writeFileSync(oggPath, oggBuffer);
+          audioPath = oggPath;
+          generatedFile = oggPath; // Mark transcoded file for cleanup
+        }
+      } catch (error) {
+        return {
+          success: false,
+          error: `Failed to convert WAV voice file to OGG/Opus: ${getErrorMessage(error)}`,
+        };
+      }
     }
 
     // TTS mode: generate speech from text