AACTools · OwenMcGirr · Apr 19, 2026 · Apr 19, 2026
diff --git a/README.md b/README.md
@@ -44,6 +44,7 @@ A JavaScript/TypeScript library that provides a unified API for working with mul
 |--------------|------------|-------------|----------|-------------|
 | `azure` | `AzureTTSClient` | Both | Microsoft Azure Cognitive Services | `@azure/cognitiveservices-speechservices`, `microsoft-cognitiveservices-speech-sdk` |
 | `google` | `GoogleTTSClient` | Both | Google Cloud Text-to-Speech | `@google-cloud/text-to-speech` |
+| `gemini` | `GeminiTTSClient` | Both | Gemini Flash TTS | None (uses fetch API) |
 | `elevenlabs` | `ElevenLabsTTSClient` | Both | ElevenLabs | `node-fetch@2` (Node.js only) |
 | `watson` | `WatsonTTSClient` | Both | IBM Watson | None (uses fetch API) |
 | `openai` | `OpenAITTSClient` | Both | OpenAI | `openai` |
@@ -271,7 +272,7 @@ async function runExample() {
 runExample().catch(console.error);
 ```
 
-The factory supports all engines: `'azure'`, `'google'`, `'polly'`, `'elevenlabs'`, `'openai'`, `'modelslab'`, `'playht'`, `'watson'`, `'witai'`, `'sherpaonnx'`, `'sherpaonnx-wasm'`, `'espeak'`, `'espeak-wasm'`, `'sapi'`, `'cartesia'`, `'deepgram'`, `'hume'`, `'xai'`, `'fishaudio'`, `'mistral'`, `'murf'`, `'unrealspeech'`, `'resemble'`, etc.
+The factory supports all engines: `'azure'`, `'google'`, `'gemini'`, `'polly'`, `'elevenlabs'`, `'openai'`, `'modelslab'`, `'playht'`, `'watson'`, `'witai'`, `'sherpaonnx'`, `'sherpaonnx-wasm'`, `'espeak'`, `'espeak-wasm'`, `'sapi'`, `'cartesia'`, `'deepgram'`, `'hume'`, `'xai'`, `'fishaudio'`, `'mistral'`, `'murf'`, `'unrealspeech'`, `'resemble'`, etc.
 
 ## Core Functionality
 
@@ -492,6 +493,7 @@ The following engines **automatically strip SSML tags** and convert to plain tex
 - **Cartesia** - SSML tags removed; audio tags (`[laugh]`, `[sigh]`, etc.) mapped to `<emotion>` for sonic-3, stripped for others
 - **Deepgram** - SSML tags are removed, plain text is synthesized
 - **Hume** - SSML tags are removed, plain text is synthesized
+- **Gemini** - SSML tags are removed; Gemini audio tags are passed natively
 - **xAI** - SSML tags are removed; audio tags passed natively for grok-tts
 - **Fish Audio** - SSML tags removed; audio tags passed natively for s2-pro
 - **Mistral** - SSML tags are removed, plain text is synthesized
@@ -697,6 +699,7 @@ When disabled, js-tts-wrapper falls back to the lightweight built-in converter (
 | Cartesia | ✅ Converted | → SSML → Plain text |
 | Deepgram | ✅ Converted | → SSML → Plain text |
 | Hume | ✅ Converted | → SSML → Plain text |
+| Gemini | ✅ Converted | → SSML → Plain text |
 | xAI | ✅ Converted | → SSML → Plain text |
 | Fish Audio | ✅ Converted | → SSML → Plain text |
 | Mistral | ✅ Converted | → SSML → Plain text |
@@ -842,6 +845,44 @@ Notes:
 - For true timings, use service account credentials (Node) where the beta client can be used.
 - Environment variable supported by examples/tests: `GOOGLECLOUDTTS_API_KEY`.
 
+### Gemini Flash TTS
+
+Gemini Flash TTS uses the Gemini API, not Google Cloud Text-to-Speech. Supply the key through the `GEMINI_API_KEY` environment variable or pass `apiKey` directly.
+
+Enable the **Gemini API** (`generativelanguage.googleapis.com`) in your Google Cloud project. Google Cloud Text-to-Speech (`texttospeech.googleapis.com`) is not used by this engine.
+
+#### ESM
+```javascript
+import { GeminiTTSClient } from 'js-tts-wrapper';
+
+const tts = new GeminiTTSClient({
+  apiKey: process.env.GEMINI_API_KEY,
+  model: 'gemini-3.1-flash-tts-preview',
+  voice: 'Kore'
+});
+
+const audio = await tts.synthToBytes('Say cheerfully: Have a wonderful day!');
+```
+
+#### Factory
+```javascript
+import { createTTSClient } from 'js-tts-wrapper';
+
+const tts = createTTSClient('gemini', {
+  apiKey: process.env.GEMINI_API_KEY,
+  voice: 'Puck'
+});
+
+await tts.speak('[excitedly] Hello from Gemini Flash TTS!');
+```
+
+Notes:
+- Supported models: `gemini-3.1-flash-tts-preview` (default) and `gemini-2.5-flash-preview-tts`.
+- Supported voices: Zephyr, Puck, Charon, Kore, Fenrir, Leda, Orus, Aoede, Callirrhoe, Autonoe, Enceladus, Iapetus, Umbriel, Algieba, Despina, Erinome, Algenib, Rasalgethi, Laomedeia, Achernar, Alnilam, Schedar, Gacrux, Pulcherrima, Achird, Zubenelgenubi, Vindemiatrix, Sadachbia, Sadaltager, Sulafat.
+- Gemini TTS does not support SSML; SSML tags are stripped before synthesis.
+- Gemini TTS does not provide true streaming; `synthToBytestream()` wraps the completed audio bytes in a stream.
+- Output is WAV by default. Pass `{ format: 'pcm' }` as a synthesis option (for example `tts.synthToBytes(text, { format: 'pcm' })`) to return raw PCM.
+- Gemini audio tags can be included directly in text, such as `[whispers]`, `[laughs]`, or `[excitedly]`.
 
 ### AWS Polly
 
@@ -1441,6 +1482,7 @@ cd your-project
 # Install specific engine dependencies
 npx js-tts-wrapper@latest run install:azure
 npx js-tts-wrapper@latest run install:google
+npx js-tts-wrapper@latest run install:gemini  # no additional dependencies
 npx js-tts-wrapper@latest run install:polly
 npx js-tts-wrapper@latest run install:openai
 npx js-tts-wrapper@latest run install:sherpaonnx

diff --git a/bin/cli.js b/bin/cli.js
@@ -61,6 +61,7 @@ async function installEngine(engine) {
   const engineDeps = {
     azure: ["microsoft-cognitiveservices-speech-sdk"],
     google: ["@google-cloud/text-to-speech"],
+    gemini: [],
     elevenlabs: ["node-fetch@2"],
     playht: ["node-fetch@2"],
     polly: ["@aws-sdk/client-polly"],
@@ -136,6 +137,7 @@ Commands:
 Available engines:
   azure               Microsoft Azure TTS
   google              Google Cloud TTS
+  gemini              Gemini Flash TTS (direct REST, no dependencies)
   elevenlabs          ElevenLabs TTS
   playht              PlayHT TTS
   polly               AWS Polly TTS

diff --git a/package.json b/package.json
@@ -60,6 +60,7 @@
     "test:azure": "node run-tts-tests.cjs azure",
     "test:elevenlabs": "node run-tts-tests.cjs elevenlabs",
     "test:google": "node run-tts-tests.cjs google",
+    "test:gemini": "node run-tts-tests.cjs gemini",
     "test:polly": "node run-tts-tests.cjs polly",
     "test:openai": "node run-tts-tests.cjs openai",
     "test:playht": "node run-tts-tests.cjs playht",
@@ -99,6 +100,7 @@
     "install:deps": "echo 'Use npm install js-tts-wrapper[engine] instead. For example: npm install js-tts-wrapper[azure]'",
     "install:azure": "npm install microsoft-cognitiveservices-speech-sdk",
     "install:google": "npm install @google-cloud/text-to-speech",
+    "install:gemini": "echo 'Gemini TTS uses direct REST API calls; no additional dependencies required.'",
     "install:polly": "npm install @aws-sdk/client-polly",
     "install:openai": "npm install openai",
     "install:elevenlabs": "npm install @elevenlabs/elevenlabs-js",
@@ -118,6 +120,7 @@
     "text-to-speech",
     "azure",
     "google",
+    "gemini",
     "polly",
     "elevenlabs",
     "ibm",
@@ -247,6 +250,7 @@
     "google": {
       "@google-cloud/text-to-speech": "^6.4.0"
     },
+    "gemini": {},
     "elevenlabs": {
       "@elevenlabs/elevenlabs-js": "^2.32.0"
     },

diff --git a/run-tts-tests.cjs b/run-tts-tests.cjs
@@ -20,14 +20,15 @@ const engineName = process.argv[2];
 
 if (!engineName) {
   console.error('Usage: node run-tts-tests.cjs <engine-name>');
-  console.error('Available engines: azure, google, polly, openai, elevenlabs, playht, upliftai, sherpaonnx, sherpaonnx-wasm, sapi, espeak, system');
+  console.error('Available engines: azure, google, gemini, polly, openai, elevenlabs, playht, upliftai, sherpaonnx, sherpaonnx-wasm, sapi, espeak, system');
   process.exit(1);
 }
 
 // Map engine names to test patterns
 const engineTestPatterns = {
   'azure': 'azure',
   'google': 'google',
+  'gemini': 'gemini',
   'polly': 'polly',
   'openai': 'openai',
   'elevenlabs': 'elevenlabs',

diff --git a/src/__tests__/gemini.test.ts b/src/__tests__/gemini.test.ts
@@ -0,0 +1,232 @@
+import { afterEach, beforeEach, describe, expect, it, jest } from "@jest/globals";
+import { GeminiTTSClient } from "../engines/gemini";
+import { createBrowserTTSClient } from "../factory-browser";
+import { createTTSClient } from "../factory";
+
+const originalFetch = globalThis.fetch;
+
+// Minimal stand-in for a fetch Response: json()/text() resolve from `body`, arrayBuffer() is always empty.
+function response(body: any, init: { ok?: boolean; status?: number; statusText?: string } = {}) {
+  const ok = init.ok ?? true;
+  const status = init.status ?? 200;
+  const statusText = init.statusText ?? "OK";
+  const json = async () => body;
+  const text = async () => (typeof body === "string" ? body : JSON.stringify(body));
+  const arrayBuffer = async () => new ArrayBuffer(0);
+  // headers and body are inert placeholders; the spread keeps the original key order.
+  const inert = { headers: {} as Headers, body: null as any };
+  return { ok, status, statusText, ...inert, json, text, arrayBuffer };
+}
+
+/**
+ * Builds a stubbed reply carrying a single audio payload.
+ *
+ * The body nests the base64 string at
+ * `candidates[0].content.parts[0].inlineData.data`.
+ *
+ * @param base64Audio base64 text placed in `inlineData.data`
+ * @returns the stub produced by `response()` for that body, with default init
+ */
+function audioResponse(base64Audio: string) {
+  const inlineData = { data: base64Audio };
+  const audioPart = { inlineData };
+  const content = { parts: [audioPart] };
+  const candidate = { content };
+  const payload = { candidates: [candidate] };
+  return response(payload);
+}
+
+function b64(bytes: number[]): string {
+  return Buffer.from(Uint8Array.from(bytes)).toString("base64"); // byte values -> base64 text
+}
+
+describe("GeminiTTSClient", () => { // suite for GeminiTTSClient; several tests replace globalThis.fetch with a jest mock
+  let client: GeminiTTSClient; // re-created before every test
+
+  beforeEach(() => {
+    client = new GeminiTTSClient({ apiKey: "test-api-key" });
+  });
+
+  afterEach(() => { // undo any fetch replacement made by the test that just ran
+    globalThis.fetch = originalFetch;
+    jest.restoreAllMocks();
+  });
+
+  it("initializes with default values", () => {
+    expect(client.getProperty("model")).toBe("gemini-3.1-flash-tts-preview");
+    expect(client.getProperty("voice")).toBe("Kore");
+  });
+
+  it("initializes with custom model and voice", () => {
+    const c = new GeminiTTSClient({
+      apiKey: "test",
+      model: "gemini-2.5-flash-preview-tts",
+      voice: "Puck",
+    });
+
+    expect(c.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
+    expect(c.getProperty("voice")).toBe("Puck");
+  });
+
+  it("initializes with properties object", () => {
+    const c = new GeminiTTSClient({
+      apiKey: "test",
+      properties: { model: "gemini-2.5-flash-preview-tts", voice: "Zephyr" },
+    });
+
+    expect(c.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
+    expect(c.getProperty("voice")).toBe("Zephyr");
+  });
+
+  it("initializes with propertiesJson string", () => {
+    const c = new GeminiTTSClient({
+      apiKey: "test",
+      propertiesJson: JSON.stringify({ voice: "Sulafat" }),
+    });
+
+    expect(c.getProperty("voice")).toBe("Sulafat");
+  });
+
+  it("sets and gets model, voice, and baseURL", () => {
+    client.setProperty("model", "gemini-2.5-flash-preview-tts");
+    client.setProperty("voice", "Puck");
+    client.setProperty("baseURL", "https://example.test/v1beta");
+
+    expect(client.getProperty("model")).toBe("gemini-2.5-flash-preview-tts");
+    expect(client.getProperty("voice")).toBe("Puck");
+    expect(client.getProperty("baseURL")).toBe("https://example.test/v1beta");
+  });
+
+  it("requires apiKey credential", () => {
+    expect((client as any).getRequiredCredentials()).toEqual(["apiKey"]); // any-cast: presumably the method is not public — confirm
+  });
+
+  it("returns false for checkCredentials without api key", async () => {
+    expect(await new GeminiTTSClient({}).checkCredentials()).toBe(false); // NOTE(review): fetch is not stubbed here; assumes no key is picked up from the environment — confirm
+  });
+
+  it("checks credentials against model list", async () => {
+    globalThis.fetch = jest.fn(async () =>
+      response({
+        models: [{ name: "models/gemini-3.1-flash-tts-preview" }],
+      })
+    ) as any;
+
+    expect(await client.checkCredentials()).toBe(true);
+  });
+
+  it("gets static voices", async () => { // runs without installing a fetch mock
+    const voices = await client.getVoices();
+
+    expect(voices).toHaveLength(30);
+    expect(voices[0]).toHaveProperty("id", "Zephyr");
+    expect(voices[0]).toHaveProperty("provider", "gemini");
+  });
+
+  it("filters voices by supported languages", async () => {
+    expect((await client.getVoicesByLanguage("en")).length).toBeGreaterThan(0);
+    expect((await client.getVoicesByLanguage("fr")).length).toBeGreaterThan(0);
+  });
+
+  it("creates via node and browser factories", () => {
+    expect(createTTSClient("gemini", { apiKey: "test" })).toBeInstanceOf(GeminiTTSClient);
+    expect(createBrowserTTSClient("gemini", { apiKey: "test" })).toBeInstanceOf(GeminiTTSClient);
+  });
+
+  it("strips SSML while preserving Gemini audio tags", async () => { // calls prepareText directly through an any-cast
+    const result = await (client as any).prepareText(
+      "<speak>Hello <break time=\"500ms\"/> [laughs] world</speak>"
+    );
+
+    expect(result).not.toContain("<speak>");
+    expect(result).not.toContain("<break");
+    expect(result).toContain("[laughs]");
+  });
+
+  it("returns WAV bytes by default and sends the Gemini request shape", async () => {
+    const pcm = b64([0, 0, 1, 0]); // four fake audio bytes, base64-encoded
+    globalThis.fetch = jest.fn(async () => audioResponse(pcm)) as any;
+
+    const bytes = await client.synthToBytes("Say cheerfully: Hello", { voice: "Puck" });
+    const request = JSON.parse(((globalThis.fetch as any).mock.calls[0][1] as any).body); // second argument of the first fetch call, body parsed back to an object
+
+    expect(String.fromCharCode(bytes[0], bytes[1], bytes[2], bytes[3])).toBe("RIFF"); // first four bytes must spell the RIFF tag
+    expect((globalThis.fetch as any).mock.calls[0][0]).toContain(
+      "/models/gemini-3.1-flash-tts-preview:generateContent"
+    );
+    expect(request.generationConfig.responseModalities).toEqual(["AUDIO"]);
+    expect(
+      request.generationConfig.speechConfig.voiceConfig.prebuiltVoiceConfig.voiceName
+    ).toBe("Puck");
+  });
+
+  it("returns raw PCM when requested", async () => {
+    globalThis.fetch = jest.fn(async () => audioResponse(b64([0, 0, 1, 0]))) as any;
+
+    const bytes = await client.synthToBytes("Hello", { format: "pcm" });
+
+    expect(Array.from(bytes)).toEqual([0, 0, 1, 0]); // same four bytes the mock supplied, nothing prepended
+  });
+
+  it("uses selected model in request URL", async () => {
+    globalThis.fetch = jest.fn(async () => audioResponse(b64([0, 0]))) as any;
+
+    await client.synthToBytes("Hello", { model: "gemini-2.5-flash-preview-tts" });
+
+    expect((globalThis.fetch as any).mock.calls[0][0]).toContain(
+      "/models/gemini-2.5-flash-preview-tts:generateContent"
+    );
+  });
+
+  it("throws useful error for HTTP failures", async () => {
+    globalThis.fetch = jest.fn(async () =>
+      response("bad request", { ok: false, status: 400, statusText: "Bad Request" })
+    ) as any;
+
+    await expect(client.synthToBytes("Hello")).rejects.toThrow(
+      "Gemini TTS API error: 400 Bad Request"
+    );
+  });
+
+  it("throws useful error for missing audio data", async () => {
+    globalThis.fetch = jest.fn(async () =>
+      response({
+        candidates: [
+          {
+            finishReason: "STOP",
+            content: { parts: [{ text: "not audio" }] }, // only a text part, no inlineData
+          },
+        ],
+      })
+    ) as any;
+
+    await expect(client.synthToBytes("Hello")).rejects.toThrow(
+      "Gemini TTS response did not include audio data"
+    );
+  });
+
+  it("wraps synthesized bytes in a stream and returns estimated word boundaries", async () => {
+    globalThis.fetch = jest.fn(async () => audioResponse(b64([0, 0, 1, 0]))) as any;
+
+    const result = await client.synthToBytestream("Hello world", { useWordBoundary: true });
+    const reader = result.audioStream.getReader();
+    const chunk = await reader.read(); // only the first chunk is inspected
+
+    expect(chunk.done).toBe(false);
+    expect(chunk.value?.length).toBeGreaterThan(0);
+    expect(result.wordBoundaries).toHaveLength(2); // presumably one entry per word of "Hello world" — confirm
+  });
+
+  it("provides credential status", async () => {
+    globalThis.fetch = jest.fn(async () =>
+      response({
+        models: [{ name: "models/gemini-3.1-flash-tts-preview" }],
+      })
+    ) as any;
+
+    const status = await client.getCredentialStatus();
+
+    expect(status.engine).toBe("gemini");
+    expect(status.requiresCredentials).toBe(true);
+  });
+});
diff --git a/src/browser.ts b/src/browser.ts
@@ -13,6 +13,7 @@ export { DeepgramTTSClient } from "./engines/deepgram";
 export { ElevenLabsTTSClient } from "./engines/elevenlabs";
 export { EspeakBrowserTTSClient } from "./engines/espeak-wasm";
 export { FishAudioTTSClient } from "./engines/fishaudio";
+export { GeminiTTSClient } from "./engines/gemini";
 export { GoogleTTSClient } from "./engines/google";
 export { HumeTTSClient } from "./engines/hume";
 export { MistralTTSClient } from "./engines/mistral";