fishaudio · Kilerd · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/developer-guide/core-features/fine-grained-control.mdx b/developer-guide/core-features/fine-grained-control.mdx
@@ -1,72 +1,124 @@
 ---
-title: 'Fine-grained Control'
-description: 'Advanced control over speech generation'
+title: "Fine-grained Control"
+description: "Advanced control over speech generation"
 icon: "sliders"
 iconType: "solid"
 ---
-import { AudioTranscript } from '/snippets/audio-transcript.jsx';
+
+import { AudioTranscript } from "/snippets/audio-transcript.jsx";
 
 {/* speak-mintlify-hash: 4a46ae86b04c62730f1554051768c47b306f2378a3624545d53de421d5d19acd */}
+
 <Visibility for="humans">
   <AudioTranscript page="core-features-fine-grained-control" />
 </Visibility>
 
-
 ## Getting Started
 
 To use fine-grained control, you can use either our SDK, API, or Playground.
 
-SDK/API: We recommend disabling normalization by setting `"normalize": false` in the request body. This ensures that the API doesn't alter the intonation of control tags.
+SDK/API: Phoneme tags are preserved by text normalization, so you can keep the default normalization behavior for pronunciation control. Set `"normalize": false` only when you want to prevent normalization from rewriting the surrounding text, such as numbers, dates, or URLs.
 
 Playground: You can use V1.6 Control Model, without setting any other options.
 
 <Note>
-Disabling normalization may reduce the stability of reading numbers, dates, and URLs. You'll need to handle these cases manually for best results.
+  Disabling normalization may reduce the stability of reading numbers, dates,
+  and URLs. You'll need to handle these cases manually for best results.
 </Note>
 
 ## Phoneme Control
 
-Phoneme control allows you to specify exact pronunciations for words or characters. Currently, we support:
+Phoneme control allows you to specify exact pronunciations for words, characters, or short phrases. Wrap the desired pronunciation in `<|phoneme_start|>` and `<|phoneme_end|>` tags.
+
+The replacement scope depends on the language:
+
+- English: replace one word with CMU Arpabet.
+- Chinese: replace one character or syllable with tone-number pinyin.
+- Japanese: replace a short Japanese word or phrase with OpenJTalk-style romaji and pitch accent markers.
+
+<CardGroup cols={3}>
+  <Card
+    title="English"
+    icon="language"
+    href="/developer-guide/core-features/fine-grained-control/english"
+  >
+    CMU Arpabet examples for names, homographs, acronyms, and technical terms.
+  </Card>
+
+  <Card
+    title="Chinese"
+    icon="language"
+    href="/developer-guide/core-features/fine-grained-control/chinese"
+  >
+    Tone-number pinyin examples for multi-character words, tones, and polyphonic
+    characters.
+  </Card>
 
-- CMU Arpabet (for English)
-- Pinyin (for Chinese)
+  <Card
+    title="Japanese"
+    icon="language"
+    href="/developer-guide/core-features/fine-grained-control/japanese"
+  >
+    OpenJTalk romaji phonemes with pitch accent digits or rising/falling edge
+    markers.
+  </Card>
+</CardGroup>
 
-To use phoneme control, wrap the desired pronunciation in `<|phoneme_start|>` and `<|phoneme_end|>` tags. Each tag should contain a single word or character.
+### Quick Examples
 
-### English Example
+English:
 
-Standard: "I am an engineer."
-With phoneme control: "I am an `<|phoneme_start|>EH N JH AH N IH R<|phoneme_end|>`."
+```text
+I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>.
+```
 
-### Chinese Example
+Chinese:
 
-Standard: "我是一个工程师。"
-With phoneme control: "我是一个`<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>`。"
+```text
+我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。
+```
+
+Japanese:
+
+```text
+<|phoneme_start|>ha0shi1ga0<|phoneme_end|>見えます。
+```
 
 ## Paralanguage
 
 Paralanguage controls allow you to add natural speech elements and pauses to make the generated speech sound more human-like. There are two main types of controls:
 
 ### Pause Words
+
 You can use common pause words like "um", "uh", "嗯", "啊" to control the rhythm of the speech.
 
 ### Special Effects
+
 The following special effects can be added using parentheses:
 
-| Effect | Description | First Available | Stage |
-|--------|-------------|-----------------|-------|
-| `(break)` | Short pause | V1.6 | Experimental |
-| `(long-break)` | Extended pause | V1.6 | Experimental |
-| `(breath)` | Breathing sound | V1.6 | Experimental |
-| `(laugh)` | Laughter sound | V1.6 | Experimental |
-| `(cough)` | Coughing sound | V1.6 | Experimental |
-| `(lip-smacking)` | Lip smacking sound | V1.6 | Experimental |
-| `(sigh)` | Sighing sound | V1.6 | Experimental |
+| Effect           | Description        | First Available | Stage        |
+| ---------------- | ------------------ | --------------- | ------------ |
+| `(break)`        | Short pause        | V1.6            | Experimental |
+| `(long-break)`   | Extended pause     | V1.6            | Experimental |
+| `(breath)`       | Breathing sound    | V1.6            | Experimental |
+| `(laugh)`        | Laughter sound     | V1.6            | Experimental |
+| `(cough)`        | Coughing sound     | V1.6            | Experimental |
+| `(lip-smacking)` | Lip smacking sound | V1.6            | Experimental |
+| `(sigh)`         | Sighing sound      | V1.6            | Experimental |
 
 <Warning>
-The effects `(laugh)`, `(cough)`, `(lip-smacking)`, and `(sigh)` are developing. You may need to repeat them multiple times for better results.
+  The effects `(laugh)`, `(cough)`, `(lip-smacking)`, and `(sigh)` are still
+  in development. You may need to repeat them multiple times for better results.
 </Warning>
 
 Example:
-Standard: "I am an engineer."
-With paralanguage: "I am, um, an (break) engineer."
+
+```text
+I am, um, an (break) engineer.
+```
+
+You can combine paralanguage and phoneme control in the same text:
+
+```text
+I am, um, an (break) <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>.
+```
diff --git a/developer-guide/core-features/fine-grained-control/chinese.mdx b/developer-guide/core-features/fine-grained-control/chinese.mdx
@@ -0,0 +1,98 @@
+---
+title: "Chinese Phoneme Control"
+description: "Control Chinese pronunciation with tone-number pinyin"
+icon: "language"
+---
+
+## Overview
+
+Chinese phoneme control uses pinyin with tone numbers, also known as tone3 pinyin. Wrap each syllable in its own pair of `<|phoneme_start|>` and `<|phoneme_end|>` tags.
+
+```text
+我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。
+```
+
+This format is especially useful for polyphonic characters, names, and domain-specific terms where the default reading may be ambiguous.
+
+## Tone Numbers
+
+Put the tone number at the end of each pinyin syllable:
+
+| Tone | Example | Description |
+| ---- | ------- | ----------- |
+| 1    | `ma1`   | High level  |
+| 2    | `ma2`   | Rising      |
+| 3    | `ma3`   | Dipping     |
+| 4    | `ma4`   | Falling     |
+| 5    | `ma5`   | Neutral     |
+
+Use lowercase pinyin and keep punctuation outside the phoneme tag.
+
+## Multi-character Words
+
+For a multi-character word, place adjacent phoneme tags in the same order as the original characters:
+
+```text
+Standard: 我是一个工程师。
+With phoneme control: 我是一个<|phoneme_start|>gong1<|phoneme_end|><|phoneme_start|>cheng2<|phoneme_end|><|phoneme_start|>shi1<|phoneme_end|>。
+```
+
+You can also tag only the ambiguous character and leave the rest of the sentence unchanged:
+
+```text
+请把这个字读作<|phoneme_start|>hang2<|phoneme_end|>。
+```
+
+## Polyphonic Characters
+
+For polyphonic characters, choose the pinyin that matches the phrase meaning:
+
+```text
+重庆: <|phoneme_start|>chong2<|phoneme_end|><|phoneme_start|>qing4<|phoneme_end|>
+重要: <|phoneme_start|>zhong4<|phoneme_end|><|phoneme_start|>yao4<|phoneme_end|>
+```
+
+```text
+银行: <|phoneme_start|>yin2<|phoneme_end|><|phoneme_start|>hang2<|phoneme_end|>
+行走: <|phoneme_start|>xing2<|phoneme_end|><|phoneme_start|>zou3<|phoneme_end|>
+```
+
+```text
+音乐: <|phoneme_start|>yin1<|phoneme_end|><|phoneme_start|>yue4<|phoneme_end|>
+快乐: <|phoneme_start|>kuai4<|phoneme_end|><|phoneme_start|>le4<|phoneme_end|>
+```
+
+## Generate Pinyin
+
+The training pipeline uses the `pypinyin` dictionary and converts entries to tone3 pinyin. The helper below mirrors that behavior for single characters:
+
+```bash
+pip install pypinyin
+```
+
+```python
+from pypinyin.contrib.tone_convert import to_tone3
+from pypinyin.pinyin_dict import pinyin_dict
+
+
+def chinese_char_to_pinyin(char: str) -> str | None:
+    pinyin = pinyin_dict.get(ord(char))
+    if pinyin is None:
+        return None
+    if "," in pinyin:
+        raise ValueError(f"{char} has multiple readings; choose one manually")
+    return to_tone3(pinyin)
+
+
+print(chinese_char_to_pinyin("工"))
+# gong1
+```
+
+Characters whose reading depends on the word they appear in need a phrase dictionary or manual selection; the helper above raises `ValueError` for them. For example, `重` should be `chong2` in `重庆` but `zhong4` in `重要`.
+
+## Practical Tips
+
+- Use one phoneme tag per Chinese character or syllable.
+- Keep Chinese punctuation, brackets, and spaces outside the tag.
+- Choose readings manually for names and polyphonic characters.
+- Use `ma5`-style tone 5 when you need to mark a neutral tone explicitly.
diff --git a/developer-guide/core-features/fine-grained-control/english.mdx b/developer-guide/core-features/fine-grained-control/english.mdx
@@ -0,0 +1,100 @@
+---
+title: "English Phoneme Control"
+description: "Control English pronunciation with CMU Arpabet"
+icon: "language"
+---
+
+## Overview
+
+English phoneme control uses CMU Arpabet, the pronunciation format used by CMUdict.
+
+Wrap the pronunciation for one word in `<|phoneme_start|>` and `<|phoneme_end|>`, and keep surrounding punctuation outside the tag.
+
+```text
+I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>.
+```
+
+<Note>
+  IPA is not supported for English phoneme tags. Convert IPA pronunciations to
+  CMU Arpabet before using phoneme control.
+</Note>
+
+## CMU Arpabet
+
+CMU Arpabet is written as space-separated uppercase symbols. Vowels can include stress digits:
+
+- `0` for unstressed vowels.
+- `1` for primary stress.
+- `2` for secondary stress.
+
+For the full symbol inventory, see the CMUdict [`cmudict.symbols`](https://github.com/cmusphinx/cmudict/blob/master/cmudict.symbols) list. You can also look up words on the [CMU Pronouncing Dictionary](http://www.speech.cs.cmu.edu/cgi-bin/cmudict) page.
+
+Example:
+
+```text
+Standard: I am an engineer.
+With phoneme control: I am an <|phoneme_start|>EH1 N JH AH0 N IH1 R<|phoneme_end|>.
+```
+
+You can omit stress digits when you only need a rough pronunciation, but CMUdict-style output with stress digits usually gives the model the clearest signal.
+
+## Common Examples
+
+Use phoneme control when spelling alone is ambiguous:
+
+```text
+The <|phoneme_start|>R IY1 D<|phoneme_end|> endpoint returns the current state.
+The book was <|phoneme_start|>R EH1 D<|phoneme_end|> yesterday.
+```
+
+```text
+The <|phoneme_start|>B EY1 S<|phoneme_end|> line is too loud.
+The <|phoneme_start|>B AE1 S<|phoneme_end|> swam upstream.
+```
+
+```text
+The <|phoneme_start|>P OW1 L IH0 SH<|phoneme_end|> team joined the call.
+Please <|phoneme_start|>P AA1 L IH0 SH<|phoneme_end|> the final mix.
+```
+
+Use it for product names, acronyms, and technical terms:
+
+```text
+Deploy with <|phoneme_start|>K UW2 B ER0 N EH1 T IY0 Z<|phoneme_end|>.
+The query uses <|phoneme_start|>EH1 S K Y UW1 EH1 L<|phoneme_end|>.
+```
+
+## Generate CMU Arpabet
+
+The training pipeline uses CMUdict-style pronunciations. You can generate the same format with the `cmudict` package:
+
+```bash
+pip install cmudict
+```
+
+```python
+import cmudict
+
+
+entries = cmudict.dict()
+
+
+def cmu_pronunciation(word: str) -> str | None:
+    phones = entries.get(word.lower())
+    if not phones:
+        return None
+    return " ".join(phones[0])
+
+
+print(cmu_pronunciation("engineer"))
+# EH1 N JH AH0 N IH1 R
+```
+
+CMUdict may contain multiple pronunciations for the same word, and the helper above returns only the first one. Listen to the result and, if needed, choose another variant that matches your intended accent or context.
+
+## Practical Tips
+
+- Replace only the word whose pronunciation needs control.
+- Strip punctuation from the word before dictionary lookup, then put the punctuation back outside the tag.
+- Use CMU Arpabet for English phoneme tags.
+- For names and brands, write the pronunciation you want the listener to hear, even when it differs from what the spelling suggests.