software-mansion · IgorSwat · Jan 22, 2026 · Jan 16, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
@@ -88,6 +88,7 @@ kokoro
 phonemizer
 phonemizers
 phonemis
+phonemizing
 Español
 Français
 Português

diff --git a/apps/speech/screens/Quiz.tsx b/apps/speech/screens/Quiz.tsx
@@ -18,8 +18,8 @@ import Animated, {
 } from 'react-native-reanimated';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
-  KOKORO_EN,
-  KOKORO_VOICE_AF_HEART,
+  KOKORO_MEDIUM,
+  KOKORO_VOICE_AM_SANTA,
   useTextToSpeech,
 } from 'react-native-executorch';
 import {
@@ -61,8 +61,8 @@ const createAudioBufferFromVector = (
 export const Quiz = ({ onBack }: { onBack: () => void }) => {
   // --- Hooks & State ---
   const model = useTextToSpeech({
-    model: KOKORO_EN,
-    voice: KOKORO_VOICE_AF_HEART,
+    model: KOKORO_MEDIUM,
+    voice: KOKORO_VOICE_AM_SANTA,
   });
 
   const [shuffledQuestions] = useState(() => shuffleArray(QUESTIONS));
@@ -153,7 +153,7 @@ export const Quiz = ({ onBack }: { onBack: () => void }) => {
           });
         };
 
-        await model.stream({ text, onNext, onEnd: async () => {} });
+        await model.stream({ text, speed: 0.9, onNext, onEnd: async () => {} });
       } catch (e) {
         console.error(e);
       } finally {

diff --git a/apps/speech/screens/TextToSpeechScreen.tsx b/apps/speech/screens/TextToSpeechScreen.tsx
@@ -10,7 +10,7 @@ import {
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
-  KOKORO_EN,
+  KOKORO_MEDIUM,
   KOKORO_VOICE_AF_HEART,
   useTextToSpeech,
 } from 'react-native-executorch';
@@ -49,14 +49,8 @@ const createAudioBufferFromVector = (
 
 export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
   const model = useTextToSpeech({
-    model: KOKORO_EN,
+    model: KOKORO_MEDIUM,
     voice: KOKORO_VOICE_AF_HEART,
-    options: {
-      // This allows to minimize the memory usage by utilizing only one of the models.
-      // However, it either increases the latency (in case of the largest model) or
-      // decreases the quality of the results (in case of the smaller models).
-      // fixedModel: "large"
-    },
   });
 
   const [inputText, setInputText] = useState('');

diff --git a/docs/docs/02-benchmarks/inference-time.md b/docs/docs/02-benchmarks/inference-time.md
@@ -66,6 +66,8 @@ The values below represent the averages across all runs for the benchmark image.
 
 ❌ - Insufficient RAM.
 
+## Speech to Text
+
 ### Encoding
 
 Average time for encoding audio of given length over 10 runs. For `Whisper` model we only list 30 sec audio chunks since `Whisper` does not accept other lengths (for shorter audio the audio needs to be padded to 30sec with silence).
@@ -82,6 +84,15 @@ Average time for decoding one token in sequence of approximately 100 tokens, wit
 | ------------------ | :--------------------------: | :--------------------------: | :------------------------: | :-------------------------------: | :-----------------------: |
 | Whisper-tiny (30s) |              23              |              25              |            121             |                92                 |            115            |
 
+## Text to Speech
+
+Average time to synthesize speech from an input text of approximately 60 tokens, resulting in 2 to 5 seconds of audio depending on the input and selected voice.
+
+| Model         | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |
+| ------------- | :--------------------------: | :-----------------------: |
+| Kokoro-small  |             2051             |           1548            |
+| Kokoro-medium |             2124             |           1625            |
+
 ## Text Embeddings
 
 | Model                      | iPhone 17 Pro (XNNPACK) [ms] | OnePlus 12 (XNNPACK) [ms] |

diff --git a/docs/docs/02-benchmarks/memory-usage.md b/docs/docs/02-benchmarks/memory-usage.md
@@ -56,6 +56,17 @@ All the below benchmarks were performed on iPhone 17 Pro (iOS) and OnePlus 12 (A
 | ------------ | :--------------------: | :----------------: |
 | WHISPER_TINY |          410           |        375         |
 
+## Text to Speech
+
+| Model         | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |
+| ------------- | :--------------------: | :----------------: |
+| KOKORO_SMALL  |          820           |        820         |
+| KOKORO_MEDIUM |          1140          |        1100        |
+
+:::info
+The reported memory usage values include the memory footprint of the Phonemis package, which is used for phonemizing input text. Currently, this can range from 100 to 150 MB depending on the device.
+:::
+
 ## Text Embeddings
 
 | Model                      | Android (XNNPACK) [MB] | iOS (XNNPACK) [MB] |

diff --git a/docs/docs/02-benchmarks/model-size.md b/docs/docs/02-benchmarks/model-size.md
@@ -63,6 +63,13 @@ title: Model Size
 | WHISPER_SMALL_EN |     968      |
 | WHISPER_SMALL    |     968      |
 
+## Text to Speech
+
+| Model         | XNNPACK [MB] |
+| ------------- | :----------: |
+| KOKORO_SMALL  |    329.6     |
+| KOKORO_MEDIUM |    334.4     |
+
 ## Text Embeddings
 
 | Model                      | XNNPACK [MB] |

diff --git a/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md b/docs/docs/03-hooks/01-natural-language-processing/useTextToSpeech.md
@@ -0,0 +1,217 @@
+---
+title: useTextToSpeech
+keywords: [
+    text to speech,
+    tts,
+    voice synthesizer,
+    transcription,
+    kokoro,
+    react native,
+    executorch,
+    ai,
+    machine learning,
+    on-device,
+    mobile ai,
+  ]
+description: "Learn how to use text-to-speech models in your React Native applications with React Native ExecuTorch's useTextToSpeech hook."
+---
+
+Text to speech is a task that transforms written text into spoken language. It is commonly used to implement features such as voice assistants, accessibility tools, or audiobooks.
+
+:::warning
+It is recommended to use models provided by us, which are available at our [Hugging Face repository](https://huggingface.co/software-mansion/react-native-executorch-kokoro). You can also use [constants](https://github.com/software-mansion/react-native-executorch/blob/main/packages/react-native-executorch/src/constants/modelUrls.ts) shipped with our library.
+:::
+
+## Reference
+
+You can play the generated waveform in whatever way suits you best; in the snippet below, we use the `react-native-audio-api` library to play the synthesized speech.
+
+```typescript
+import {
+  useTextToSpeech,
+  KOKORO_MEDIUM,
+  KOKORO_VOICE_AF_HEART,
+} from 'react-native-executorch';
+import { AudioContext } from 'react-native-audio-api';
+
+const model = useTextToSpeech({
+  model: KOKORO_MEDIUM,
+  voice: KOKORO_VOICE_AF_HEART,
+});
+
+const audioContext = new AudioContext({ sampleRate: 24000 });
+
+const handleSpeech = async (text: string) => {
+  const speed = 1.0;
+  const waveform = await model.forward(text, speed);
+
+  const audioBuffer = audioContext.createBuffer(1, waveform.length, 24000);
+  audioBuffer.getChannelData(0).set(waveform);
+
+  const source = audioContext.createBufferSource();
+  source.buffer = audioBuffer;
+  source.connect(audioContext.destination);
+  source.start();
+};
+```
+
+### Arguments
+
+**`model`** (`KokoroConfig`) - Object specifying the source files for the Kokoro TTS model (duration predictor, synthesizer).
+
+**`voice`** (`VoiceConfig`) - Object specifying the voice data and phonemizer assets (tagger and lexicon).
+
+**`preventLoad?`** (`boolean`) - Boolean that can prevent automatic model loading after running the hook.
+
+For more information on loading resources, take a look at the [loading models](../../01-fundamentals/02-loading-models.md) page.
+
+### Returns
+
+| Field              | Type                                                      | Description                                                                                                                                                                          |
+| ------------------ | --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `forward`          | `(text: string, speed?: number) => Promise<Float32Array>` | Synthesizes a full text into speech. Returns a promise resolving to the full audio waveform as a `Float32Array`.                                                                     |
+| `stream`           | `(input: TextToSpeechStreamingInput) => Promise<void>`    | Starts a streaming synthesis session. Takes text input and callbacks to handle audio chunks as they are generated. Ideal for reducing the "time to first audio" for long sentences.  |
+| `streamStop`       | `() => void`                                              | Stops the streaming process if there is any ongoing.                                                                                                                                 |
+| `error`            | `RnExecutorchError \| null`                               | Contains the error message if the model failed to load or synthesis failed.                                                                                                          |
+| `isGenerating`     | `boolean`                                                 | Indicates whether the model is currently processing a synthesis.                                                                                                                     |
+| `isReady`          | `boolean`                                                 | Indicates whether the model has successfully loaded and is ready for synthesis.                                                                                                      |
+| `downloadProgress` | `number`                                                  | Tracks the progress of the model and voice assets download process.                                                                                                                  |
+
+<details>
+<summary>Type definitions</summary>
+
+```typescript
+interface TextToSpeechStreamingInput {
+  text: string;
+  speed?: number;
+  onBegin?: () => void | Promise<void>;
+  onNext?: (chunk: Float32Array) => Promise<void> | void;
+  onEnd?: () => Promise<void> | void;
+}
+
+interface KokoroConfig {
+  durationSource: ResourceSource;
+  synthesizerSource: ResourceSource;
+}
+
+interface VoiceConfig {
+  voiceSource: ResourceSource;
+  extra: {
+    taggerSource: ResourceSource;
+    lexiconSource: ResourceSource;
+  };
+}
+```
+
+</details>
+
+## Running the model
+
+The module provides two ways to generate speech:
+
+1.  **`forward(text, speed)`**: Generates the complete audio waveform at once. Returns a promise resolving to a `Float32Array`.
+
+:::note
+Since it processes the entire text at once, it might take a significant amount of time to produce audio for long text inputs.
+:::
+
+2.  **`stream({ text, speed })`**: Synthesizes the audio in chunks and passes each chunk to the `onNext` callback as soon as it is computed. The returned promise resolves once streaming has finished.
+    This is ideal for reducing the "time to first audio" for long sentences.
+
+## Example
+
+### Speech Synthesis
+
+```tsx
+import React from 'react';
+import { Button, View } from 'react-native';
+import {
+  useTextToSpeech,
+  KOKORO_MEDIUM,
+  KOKORO_VOICE_AF_HEART,
+} from 'react-native-executorch';
+import { AudioContext } from 'react-native-audio-api';
+
+export default function App() {
+  const tts = useTextToSpeech({
+    model: KOKORO_MEDIUM,
+    voice: KOKORO_VOICE_AF_HEART,
+  });
+
+  const generateAudio = async () => {
+    const audioData = await tts.forward({
+      text: 'Hello world! This is a sample text.',
+    });
+
+    // Playback example
+    const ctx = new AudioContext({ sampleRate: 24000 });
+    const buffer = ctx.createBuffer(1, audioData.length, 24000);
+    buffer.getChannelData(0).set(audioData);
+
+    const source = ctx.createBufferSource();
+    source.buffer = buffer;
+    source.connect(ctx.destination);
+    source.start();
+  };
+
+  return (
+    <View style={{ flex: 1, justifyContent: 'center', alignItems: 'center' }}>
+      <Button title="Speak" onPress={generateAudio} disabled={!tts.isReady} />
+    </View>
+  );
+}
+```
+
+### Streaming Synthesis
+
+```tsx
+import React, { useRef } from 'react';
+import { Button, View } from 'react-native';
+import {
+  useTextToSpeech,
+  KOKORO_MEDIUM,
+  KOKORO_VOICE_AF_HEART,
+} from 'react-native-executorch';
+import { AudioContext } from 'react-native-audio-api';
+
+export default function App() {
+  const tts = useTextToSpeech({
+    model: KOKORO_MEDIUM,
+    voice: KOKORO_VOICE_AF_HEART,
+  });
+
+  const contextRef = useRef(new AudioContext({ sampleRate: 24000 }));
+
+  const generateStream = async () => {
+    const ctx = contextRef.current;
+
+    await tts.stream({
+      text: "This is a longer text, which is being streamed chunk by chunk. Let's see how it works!",
+      onNext: async (chunk) => {
+        return new Promise((resolve) => {
+          const buffer = ctx.createBuffer(1, chunk.length, 24000);
+          buffer.getChannelData(0).set(chunk);
+
+          const source = ctx.createBufferSource();
+          source.buffer = buffer;
+          source.connect(ctx.destination);
+          source.onEnded = () => resolve();
+          source.start();
+        });
+      },
+    });
+  };
+
+  return (
+    <View style={{ flex: 1, justifyContent: 'center', alignItems: 'center' }}>
+      <Button title="Stream" onPress={generateStream} disabled={!tts.isReady} />
+    </View>
+  );
+}
+```
+
+## Supported models
+
+| Model                                                                            | Language |
+| -------------------------------------------------------------------------------- | :------: |
+| [Kokoro](https://huggingface.co/software-mansion/react-native-executorch-kokoro) | English  |