-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathchatterbox_cli.cpp
More file actions
2206 lines (2059 loc) · 117 KB
/
chatterbox_cli.cpp
File metadata and controls
2206 lines (2059 loc) · 117 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// CLI entry point + CLI-specific helpers for the tts-cpp executable.
//
// This file is part of the tts-cpp static library build, but none of its
// symbols are referenced by the Engine API in
// `include/tts-cpp/chatterbox/engine.h`. Consumers that link libtts-cpp.a
// purely for `tts_cpp::chatterbox::Engine` pay nothing for the CLI code:
// the object file produced from this translation unit is left out of the
// final link by the linker's standard static-archive dead-code rule.
//
// Split out of src/main.cpp so the Engine-only TUs
// (chatterbox_engine.cpp + the T3 helpers still in main.cpp) stay lean
// and don't drag in the CLI's argv parser, signal handlers, live-input
// reader, save-voice dumper, multi-segment crossfade logic, etc.
#include "gpt2_bpe.h"
#include "mtl_tokenizer.h"
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "gguf.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#include <algorithm>
#include <atomic>
#include <cctype>
#include <cerrno>
#include <chrono>
#include <cmath>
#include <csignal>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <future>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>
#include "tts-cpp/tts-cpp.h"
#include "tts-cpp/chatterbox/s3gen_pipeline.h"
#include "chatterbox_t3_internal.h"
#include "t3_mtl.h"
#include "npy.h"
#include "voice_features.h"
#include "voice_encoder.h"
#include "campplus.h"
#include "s3tokenizer.h"
#include "gguf.h"
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/select.h>
#include <fcntl.h>
#include <unistd.h>
using namespace tts_cpp::chatterbox::detail;
// Report whether `path` names an existing regular file. Returns false when
// stat() fails (missing path, permission error, ...) and when the path
// exists but is something other than a regular file, e.g. a directory.
static bool file_exists(const std::string & path) {
    struct stat info;
    if (::stat(path.c_str(), &info) != 0) {
        return false;
    }
    return S_ISREG(info.st_mode);
}
// Sanity-check a --reference-audio file before we kick off the full voice-
// cloning pipeline. The Python reference asserts `len(ref) / sr > 5.0` and
// fails hard otherwise; we silently accept any length, but produce undersized
// conditioning tensors (prompt_token=125 instead of 250, etc.) which falls
// back on whatever is in the built-in voice slots. That's misleading — give
// a clear error instead. Recommended length is 10–15 seconds.
// Minimal 16-bit PCM WAV writer; matches the one in chatterbox_tts.cpp / mel2wav.cpp.
// Used by the streaming synthesis path to write the final concatenated wav.
//
//   path  destination file (truncated / created).
//   wav   mono float samples; values are scaled by 32767 and saturated to
//         the int16 range. NaN samples are written as silence (0).
//   sr    sample rate in Hz, stored in the header.
//
// Emits a 44-byte RIFF/WAVE header (PCM, 1 channel, 16 bits) followed by the
// samples. Header fields and samples are written in host byte order, so the
// output is a valid WAV only on a little-endian host.
// On open or write failure an error is printed to stderr and the function
// returns; no exception is thrown.
static void stream_write_wav(const std::string & path, const std::vector<float> & wav, int sr) {
    std::ofstream f(path, std::ios::binary);
    if (!f) { fprintf(stderr, "error: cannot write %s\n", path.c_str()); return; }

    auto w32 = [&](uint32_t v) { f.write((const char *)&v, 4); };
    auto w16 = [&](uint16_t v) { f.write((const char *)&v, 2); };

    // Saturate in the float domain *before* rounding. Rounding first and
    // clamping the resulting int afterwards wraps for large magnitudes
    // (e.g. 1e6 would come out as -32768) and is unspecified for NaN / inf.
    auto to_s16 = [](float v) -> int16_t {
        if (std::isnan(v)) return 0;
        const float scaled = std::max(-32768.0f, std::min(32767.0f, v * 32767.0f));
        return (int16_t) std::lround(scaled);
    };

    const uint32_t data_bytes = (uint32_t)(wav.size() * sizeof(int16_t));

    // RIFF chunk descriptor.
    f.write("RIFF", 4); w32(36 + data_bytes); f.write("WAVE", 4);
    // fmt chunk: size 16, PCM (1), mono, sample rate, byte rate (sr * 2),
    // block align 2, 16 bits per sample.
    f.write("fmt ", 4); w32(16); w16(1); w16(1); w32(sr); w32(sr * 2); w16(2); w16(16);
    // data chunk header.
    f.write("data", 4); w32(data_bytes);

    // Convert everything up front so the payload goes out in a single write.
    std::vector<int16_t> pcm;
    pcm.reserve(wav.size());
    for (float v : wav) {
        pcm.push_back(to_s16(v));
    }
    if (!pcm.empty()) {
        f.write((const char *)pcm.data(), (std::streamsize)(pcm.size() * sizeof(int16_t)));
    }

    f.flush();
    if (!f) {
        fprintf(stderr, "error: failed while writing %s\n", path.c_str());
    }
}
// Emit a chunk of float samples to stdout as raw 16-bit PCM and flush so
// downstream players hear it immediately (stdio buffers would otherwise hold
// up to 4-8 KB, stalling real-time playback at chunk boundaries). Used by
// `--out -` streaming mode; callers pipe into e.g.
// `ffplay -f s16le -ar 24000 -ac 1 -nodisp -autoexit -`.
//
// Samples are scaled by 32767 and saturated to the int16 range; NaN samples
// are emitted as silence (0). Bytes are written in host order, so the stream
// is s16le only on a little-endian host.
static void stream_emit_pcm_stdout(const std::vector<float> & wav) {
    // Saturate in the float domain *before* rounding. Rounding first and
    // clamping the resulting int afterwards wraps for large magnitudes
    // (e.g. 1e6 would come out as -32768) and is unspecified for NaN / inf.
    auto to_s16 = [](float v) -> int16_t {
        if (std::isnan(v)) return 0;
        const float scaled = std::max(-32768.0f, std::min(32767.0f, v * 32767.0f));
        return (int16_t) std::lround(scaled);
    };

    // Convert the whole chunk first, then hand it to stdio in one call
    // instead of one fwrite per sample.
    std::vector<int16_t> pcm;
    pcm.reserve(wav.size());
    for (float v : wav) {
        pcm.push_back(to_s16(v));
    }
    if (!pcm.empty()) {
        std::fwrite(pcm.data(), sizeof(int16_t), pcm.size(), stdout);
    }
    std::fflush(stdout);
}
// Split `text` into TTS-friendly segments of at most `max_chars` bytes.
//
// Motivation: Chatterbox Turbo's T3 was trained on utterances of 5–15 s and
// degrades (prosody drift, hallucinated phonemes, timbre wandering) on much
// longer autoregressive outputs. Reproducible on every backend (ggml / ONNX
// / upstream Python). The only reliable fix is sentence-level segmentation
// above the model.
//
// The splitter does three passes:
//   1. Break at `. ? !` followed by whitespace / EOF.
//   2. For any sentence longer than `max_chars`, break further at `, : ;`
//      (only once the running piece is past max_chars/2, so we don't
//      fragment into unpronounceable stubs). Last resort: a hard break once
//      the piece reaches max_chars, at whitespace in its tail quarter when
//      there is some.
//   3. Greedily merge consecutive short fragments forward while their
//      combined length stays <= max_chars, so very short sentences ride
//      with their neighbours rather than stand alone.
//
// Lengths are measured in bytes of `text`. Hard breaks are nudged onto a
// UTF-8 code-point boundary so a multi-byte character is never cut in half;
// for that reason a piece may exceed max_chars by up to 3 bytes when
// max_chars is smaller than a single encoded character.
//
// Abbreviations like "e.g." are not treated specially; in practice the
// greedy merge pass absorbs false splits on them back into the next segment.
//
// Returns the segments with trailing whitespace stripped and empty segments
// removed. If `text` is empty or `max_chars <= 0`, or nothing survives the
// stripping, the result is a single element holding `text` unchanged.
static std::vector<std::string> split_text_for_tts(const std::string & text, int max_chars) {
    std::vector<std::string> out;
    if (text.empty() || max_chars <= 0) { out.push_back(text); return out; }

    // Work in size_t from here on: avoids int narrowing of string sizes and
    // the signed overflow `max_chars * 3` would hit for very large limits.
    const size_t limit        = (size_t) max_chars;
    const size_t half_limit   = limit / 2;
    // floor(3 * limit / 4) without forming 3 * limit.
    const size_t tail_quarter = (limit / 4) * 3 + ((limit % 4) * 3) / 4;

    auto is_ws   = [](unsigned char c) { return std::isspace(c) != 0; };
    // UTF-8 continuation bytes have the bit pattern 10xxxxxx.
    auto is_cont = [](unsigned char c) { return (c & 0xC0) == 0x80; };

    // Pass 1: sentence split. Whitespace following the terminator stays
    // attached to the sentence it ends.
    std::vector<std::string> sentences;
    {
        std::string cur;
        size_t i = 0;
        while (i < text.size()) {
            const char c = text[i];
            cur += c;
            const bool at_end = (i + 1 == text.size());
            const bool nx_ws  = !at_end && is_ws((unsigned char)text[i + 1]);
            if ((c == '.' || c == '?' || c == '!') && (at_end || nx_ws)) {
                size_t j = i + 1;
                while (j < text.size() && is_ws((unsigned char)text[j])) { cur += text[j]; ++j; }
                sentences.push_back(cur);
                cur.clear();
                i = j;
            } else {
                ++i;
            }
        }
        if (!cur.empty()) sentences.push_back(cur);
    }

    // Pass 2: refine any sentence longer than the limit.
    std::vector<std::string> refined;
    refined.reserve(sentences.size());
    for (auto & s : sentences) {
        if (s.size() <= limit) { refined.push_back(std::move(s)); continue; }

        std::string acc;
        size_t k = 0;
        while (k < s.size()) {
            const char c = s[k];
            acc += c;

            const bool nx_ws = (k + 1 < s.size()) && is_ws((unsigned char)s[k + 1]);
            const bool soft_break = (c == ',' || c == ':' || c == ';') && nx_ws &&
                                    acc.size() > half_limit;
            if (soft_break) {
                size_t j = k + 1;
                while (j < s.size() && is_ws((unsigned char)s[j])) { acc += s[j]; ++j; }
                refined.push_back(acc);
                acc.clear();
                k = j;
                continue;
            }

            if (acc.size() >= limit) {
                // Last-resort hard break: walk back through the tail quarter
                // looking for whitespace; the walk stops at the 3/4 mark.
                size_t back = acc.size();
                while (back > tail_quarter && !is_ws((unsigned char)acc[back - 1])) --back;
                if (back <= half_limit) back = acc.size();

                // True when cutting `acc` at `pos` would separate a UTF-8
                // lead byte from one of its continuation bytes. For a cut at
                // the very end of `acc` the byte that decides is the next
                // one still waiting in `s`.
                auto splits_code_point = [&](size_t pos) {
                    if (pos < acc.size()) return is_cont((unsigned char)acc[pos]);
                    return k + 1 < s.size() && is_cont((unsigned char)s[k + 1]);
                };

                // Prefer moving the cut backwards onto a boundary; if that
                // would leave an empty piece, move it forwards instead.
                size_t cut = back;
                while (cut > 0 && splits_code_point(cut)) --cut;
                if (cut == 0) {
                    cut = back;
                    while (cut < acc.size() && splits_code_point(cut)) ++cut;
                }

                // Still mid-character: the code point straddles the end of
                // `acc`. Wait for its remaining bytes (at most 3 in valid
                // UTF-8); past that the input is malformed, so cut anyway.
                const bool defer = splits_code_point(cut) && acc.size() < limit + 3;
                if (!defer) {
                    refined.push_back(acc.substr(0, cut));
                    acc.erase(0, cut);
                }
            }
            ++k;
        }
        if (!acc.empty()) refined.push_back(acc);
    }

    // Pass 3: greedy forward merge of short fragments.
    for (auto & s : refined) {
        if (!out.empty() && out.back().size() + s.size() <= limit) {
            out.back() += s;
        } else {
            out.push_back(std::move(s));
        }
    }

    // Strip trailing whitespace per segment.
    for (auto & s : out) {
        while (!s.empty() && is_ws((unsigned char)s.back())) s.pop_back();
    }

    // Drop empty segments (paranoia).
    out.erase(std::remove_if(out.begin(), out.end(),
                             [](const std::string & s) { return s.empty(); }),
              out.end());
    if (out.empty()) out.push_back(text);
    return out;
}
// Append `src` PCM to `dst`. When both buffers are non-empty and `fade_ms` is
// positive, the final `fade_ms` worth of `dst` is blended with the first
// `fade_ms` worth of `src` using a raised-cosine weight, and only the rest of
// `src` is appended; this removes clicks at segment seams in auto-split mode.
//
// The overlap is sr * fade_ms / 1000 samples, capped at the length of either
// buffer. If it works out to zero or less, `src` is simply concatenated.
static void append_pcm_crossfade(std::vector<float> & dst, const std::vector<float> & src,
                                 int sr, int fade_ms) {
    if (src.empty()) return;

    // Number of samples that overlap; stays 0 when no blending is requested
    // or there is nothing in `dst` to blend with.
    int overlap = 0;
    if (!dst.empty() && fade_ms > 0) {
        overlap = sr * fade_ms / 1000;
        overlap = std::min(overlap, (int)dst.size());
        overlap = std::min(overlap, (int)src.size());
    }

    if (overlap <= 0) {
        dst.insert(dst.end(), src.begin(), src.end());
        return;
    }

    // Blend in place over the tail of `dst`. The weight for `src` rises from
    // just above 0 to just below 1; the endpoints themselves are excluded by
    // dividing by (overlap + 1).
    float * tail = dst.data() + (dst.size() - overlap);
    for (int n = 0; n < overlap; ++n) {
        const float phase = (float)(n + 1) / (float)(overlap + 1);
        const float gain  = 0.5f * (1.0f - std::cos((float)M_PI * phase));
        tail[n] = tail[n] * (1.0f - gain) + src[n] * gain;
    }

    // Everything in `src` past the blended region is appended verbatim.
    dst.insert(dst.end(), src.begin() + overlap, src.end());
}
// Save the five voice-conditioning tensors to a directory as .npy so later
// runs can reuse them via --ref-dir (no --reference-audio needed), skipping
// VoiceEncoder / CAMPPlus / S3TokenizerV2 / mel-extract entirely.
//
// `dir` is created (mode 0755, single level) when it does not exist; if it
// exists but is not a directory, or cannot be created, an error is printed
// and nothing is written.
//
// Any of the five buffers may be empty; missing ones are silently skipped
// (we emit whatever we have and let the reuse path fall back to built-in
// or error cleanly if a required tensor is absent). `prompt_feat` is written
// with shape (prompt_feat_rows, 80) and is also skipped when
// `prompt_feat_rows` is not positive. A summary line with the number of
// files written goes to stderr.
static void save_voice_profile(const std::string & dir,
                               const std::vector<float> & speaker_emb,
                               const std::vector<int32_t> & cond_prompt_speech_tokens,
                               const std::vector<float> & embedding,
                               const std::vector<int32_t> & prompt_token,
                               const std::vector<float> & prompt_feat,
                               int prompt_feat_rows /* = pf_data.size() / 80 */)
{
    // Make sure the destination is a usable directory before writing.
    struct stat info;
    const bool already_there = (::stat(dir.c_str(), &info) == 0);
    if (already_there) {
        if (!S_ISDIR(info.st_mode)) {
            fprintf(stderr, "save_voice_profile: %s exists but is not a directory\n", dir.c_str());
            return;
        }
    } else if (::mkdir(dir.c_str(), 0755) != 0) {
        fprintf(stderr, "save_voice_profile: cannot create %s\n", dir.c_str());
        return;
    }

    int written = 0;

    // T3-side conditioning.
    if (!speaker_emb.empty()) {
        npy_save_f32(dir + "/speaker_emb.npy",
                     {(int64_t)speaker_emb.size()}, speaker_emb.data());
        written += 1;
    }
    if (!cond_prompt_speech_tokens.empty()) {
        npy_save_i32(dir + "/cond_prompt_speech_tokens.npy",
                     {(int64_t)cond_prompt_speech_tokens.size()},
                     cond_prompt_speech_tokens.data());
        written += 1;
    }

    // Remaining three tensors.
    if (!embedding.empty()) {
        npy_save_f32(dir + "/embedding.npy",
                     {(int64_t)embedding.size()}, embedding.data());
        written += 1;
    }
    if (!prompt_token.empty()) {
        npy_save_i32(dir + "/prompt_token.npy",
                     {(int64_t)prompt_token.size()}, prompt_token.data());
        written += 1;
    }
    const bool have_feat = !prompt_feat.empty() && prompt_feat_rows > 0;
    if (have_feat) {
        npy_save_f32(dir + "/prompt_feat.npy",
                     {(int64_t)prompt_feat_rows, 80}, prompt_feat.data());
        written += 1;
    }

    fprintf(stderr, "save_voice_profile: wrote %d .npy files into %s\n", written, dir.c_str());
}
// --------------------------------------------------------------------------
// CLI
// --------------------------------------------------------------------------
// All command-line options of the CLI, with their defaults. Filled in by
// parse_args(); the help text for each flag lives in print_usage().
struct cli_params {
    std::string model;        // T3 GGUF (required unless --tokens-file + --s3gen-gguf)
    std::string tokens_file;  // optional pre-tokenized speech tokens (skips T3)
    std::string text;         // input text for T3
    std::string output;       // legacy: speech-tokens output file (if set, write tokens)

    // S3Gen + HiFT vocoder:
    std::string s3gen_gguf;       // enables full text → wav pipeline
    std::string out_wav;          // wav output path (requires --s3gen-gguf)
    std::string ref_dir;          // override built-in voice with .npy reference dump
    std::string reference_audio;  // wav file; computes prompt_feat natively in C++
    std::string save_voice_dir;   // if set, dump the 5 conditioning tensors here for reuse

    bool debug            = false;  // --debug: load Python-dumped intermediates for validation
    bool verbose          = false;  // --verbose: per-stage profile timings (human-readable)
    bool dump_tokens_only = false;  // --dump-tokens-only (requires --text)

    int32_t seed = 0;
    // Default thread count: the detected hardware concurrency, capped at 4.
    // std::thread::hardware_concurrency() is allowed to return 0 when the
    // value cannot be determined, so clamp to at least one thread instead of
    // defaulting to a zero-thread configuration.
    int32_t n_threads    = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t n_predict    = 1000;  // matches Python's default-ish output budget for paragraph-length text
    int32_t n_ctx        = 0;     // --context: KV context length override
    int32_t n_gpu_layers = 0;     // --n-gpu-layers: GPU backend when > 0

    // Sampling defaults matched to ChatterboxTurboTTS.generate() in tts_turbo.py:
    //   temperature=0.8, top_k=1000, top_p=0.95, repetition_penalty=1.2
    // The previous greedy defaults (top_k=1) collapse into silence-token
    // repetition loops on any non-trivial text.
    int32_t top_k          = 1000;
    float   top_p          = 0.95f;
    float   temp           = 0.8f;
    float   repeat_penalty = 1.2f;

    // Experimental: route CFM flash-attn through the F32 Q + F16 K/V path
    // so backends with `flash_attn_f32_f16` (Adreno OpenCL) dispatch the
    // mixed-precision kernel. Opt-in mobile latency knob. See PROGRESS.md
    // "OpenCL / Adreno bring-up".
    bool cfm_f16_kv_attn = false;

    // Multilingual-only knobs. Python ChatterboxMultilingualTTS.generate()
    // defaults: cfg_weight=0.5, temperature=0.8, repetition_penalty=2.0,
    // min_p=0.05, top_p=1.0 (top_k unused).
    float       cfg_weight   = 0.5f;   // classifier-free guidance strength
    float       min_p        = 0.05f;  // minimum-probability warp (0 = off)
    std::string language;              // tier-1 lang code when variant = t3_mtl
    float       exaggeration = 0.5f;   // emotion_adv scalar (0..1)

    // Streaming synthesis (PROGRESS.md B1). When > 0, speech tokens from
    // T3 are fed to S3Gen+HiFT in chunks of this size, with `cache_source`
    // carried across chunks for phase continuity and `trim_fade` only on
    // chunk 0. Chunks are concatenated in memory and written to --out when
    // the loop finishes, or piped to stdout as soon as each chunk finishes
    // when --out is "-". No per-chunk files are ever written.
    int32_t stream_chunk_tokens = 0;

    // Optional: override first-chunk size (typically smaller than
    // stream_chunk_tokens so first-audio-out is fast, then the pipeline
    // switches to larger chunks to amortise the fixed per-chunk overhead).
    // 0 → same as stream_chunk_tokens.
    int32_t stream_first_chunk_tokens = 0;

    // Optional: override CFM Euler step count for streaming chunks. Defaults
    // to 2 (matches Python's meanflow); setting 1 halves CFM cost at the
    // price of a bit of extra high-frequency noise.
    int32_t stream_cfm_steps = 0;

    // Override CFM Euler step count for non-streaming synthesis. Defaults
    // to 0 (= use the GGUF's `n_timesteps`: 10 for Multilingual standard
    // CFM, 2 for Turbo's meanflow). Lowering N (e.g. 7-8 on Multilingual)
    // reduces S3Gen wall-clock proportionally; the §3.21 sweep documents
    // the audio-cosine knee. Streaming uses --stream-cfm-steps instead.
    int32_t cfm_steps = 0;

    // Auto-split the input text into sentences before running the pipeline.
    // Chatterbox Turbo's T3 degrades badly on autoregressive outputs longer
    // than ~15 s (well outside its training distribution), so anything over
    // a few sentences comes out as garbled prosody, hallucinated phonemes
    // or drifting timbre — regardless of backend (reproduced on Python and
    // ONNX too). Splitting at sentence boundaries keeps each T3 call
    // in-distribution. Segments are concatenated with a short raised-cosine
    // crossfade at the seams.
    //
    //   max_sentence_chars   Target length per segment in characters.
    //                        When a sentence exceeds this, we split
    //                        further at `, : ;`. Set to 0 to disable
    //                        auto-split entirely (single-shot, old
    //                        behaviour; matches --no-auto-split).
    //                        Default 180 ≈ 5–8 s of audio.
    //
    //   crossfade_ms         Raised-cosine crossfade length at segment
    //                        seams, in ms. Default 30.
    int32_t max_sentence_chars = 180;
    int32_t crossfade_ms       = 30;

    // Incremental streaming input. When --input-file PATH is set, the binary
    // opens PATH for reading and follows it with tail -f semantics: as soon as
    // a complete sentence (ending in . ! ? or \n) has been read, it's
    // tokenised, fed to T3, and the resulting speech tokens are streamed
    // through S3Gen + HiFT to stdout. Intended for pairing with an upstream
    // process (a streaming LLM, a live transcription, a human typing, …) that
    // writes text to the file while we synthesise it.
    //
    // Requires --s3gen-gguf, --stream-chunk-tokens > 0, --out -.
    // Exclusive with --text / --tokens-file.
    std::string input_file;
    std::string input_eof_marker;       // optional; stops reading when seen
    bool        input_by_line = false;  // one request per \n; don't split
                                        // on . ! ? within a line
};
static int32_t sample_next_token(
const std::vector<float> & logits,
const std::vector<int32_t> & generated,
const cli_params & params,
std::mt19937 & rng) {
chatterbox_sampling_params sp;
sp.top_k = params.top_k;
sp.top_p = params.top_p;
sp.temp = params.temp;
sp.repeat_penalty = params.repeat_penalty;
return sample_next_token_ex(logits, generated, sp, rng);
}
// Print the command-line help text to stderr.
//
//   argv0  program name, substituted into the "usage:" line.
//
// Everything is written with fprintf(stderr, ...); nothing goes to stdout.
// The option descriptions and the defaults quoted in them are literal text
// and have to be kept in sync by hand with the cli_params initialisers and
// the flags handled in parse_args.
static void print_usage(const char * argv0) {
fprintf(stderr, "usage: %s --model MODEL.gguf [--text TEXT | --tokens-file tokens.txt] [options]\n", argv0);
fprintf(stderr, "\noptions:\n");
// Input selection and token-only output.
fprintf(stderr, " --model PATH GGUF model produced by convert-t3-turbo-to-gguf.py\n");
fprintf(stderr, " (must embed tokenizer.ggml.* metadata; produced by the\n");
fprintf(stderr, " current converter)\n");
fprintf(stderr, " --text TEXT Input text (uses the GPT-2 BPE tokenizer embedded in GGUF)\n");
fprintf(stderr, " --tokens-file PATH Pre-tokenized text token ids (alternative to --text).\n");
fprintf(stderr, " With --s3gen-gguf this is interpreted as *speech* tokens\n");
fprintf(stderr, " and the T3 step is skipped.\n");
fprintf(stderr, " --output PATH Write generated speech tokens to PATH (text mode).\n");
fprintf(stderr, "\n");
// Full text -> wav pipeline, voice selection, diagnostics and sampling.
fprintf(stderr, " --s3gen-gguf PATH Enables the full text -> wav pipeline (S3Gen + HiFT).\n");
fprintf(stderr, " --out PATH Output wav file when --s3gen-gguf is set.\n");
fprintf(stderr, " Use `--out -` together with --stream-chunk-tokens to\n");
fprintf(stderr, " pipe raw s16le mono @ 24 kHz to stdout as each chunk\n");
fprintf(stderr, " is ready (for live playback, e.g. `| ffplay -f s16le`).\n");
fprintf(stderr, " --ref-dir DIR Override built-in voice with embedding.npy /\n");
fprintf(stderr, " prompt_token.npy / prompt_feat.npy from DIR, plus\n");
fprintf(stderr, " T3 speaker_emb.npy / cond_prompt_speech_tokens.npy.\n");
fprintf(stderr, " --reference-audio PATH Reference .wav; all five voice-conditioning tensors\n");
fprintf(stderr, " (speaker_emb, cond_prompt_speech_tokens, embedding,\n");
fprintf(stderr, " prompt_token, prompt_feat) are computed in C++.\n");
fprintf(stderr, " --save-voice DIR Dump the 5 computed conditioning tensors as .npy into\n");
fprintf(stderr, " DIR (created if missing). Use --ref-dir DIR on later\n");
fprintf(stderr, " runs to reuse the voice without --reference-audio —\n");
fprintf(stderr, " skips VoiceEncoder/CAMPPlus/S3TokenizerV2 entirely.\n");
fprintf(stderr, " --debug Load reference intermediates from --ref-dir for\n");
fprintf(stderr, " bit-exact numerical validation (requires --ref-dir).\n");
fprintf(stderr, " --verbose Print per-stage wall-time breakdown for T3, S3Gen,\n");
fprintf(stderr, " HiFT and (when --reference-audio is used) the voice-\n");
fprintf(stderr, " cloning preprocessing pipeline.\n");
fprintf(stderr, " --seed N RNG seed (default: 0)\n");
// The thread default is computed here with the same min(4, hardware_concurrency)
// expression rather than read from a cli_params instance.
fprintf(stderr, " --threads N CPU threads (default: %d)\n", std::min(4, (int32_t) std::thread::hardware_concurrency()));
fprintf(stderr, " --n-predict N Max speech tokens (default: 1000)\n");
fprintf(stderr, " --context N Override KV context length\n");
fprintf(stderr, " --n-gpu-layers N GPU backend when N > 0\n");
fprintf(stderr, " --top-k N (default: 1000, matches Python; use 1 for greedy)\n");
fprintf(stderr, " --top-p P (default: 0.95)\n");
fprintf(stderr, " --temp T (default: 0.8)\n");
fprintf(stderr, " --repeat-penalty R (default: 1.2)\n");
fprintf(stderr, " --min-p P Minimum-probability warp (default: 0.05; t3_mtl only)\n");
fprintf(stderr, " --cfg-weight W Classifier-free guidance strength (default: 0.5;\n");
fprintf(stderr, " t3_mtl only)\n");
fprintf(stderr, " --exaggeration X Emotion-adv scalar in [0,1] (default: 0.5; t3_mtl only)\n");
fprintf(stderr, "\n");
fprintf(stderr, "multilingual (variant=t3_mtl) options:\n");
fprintf(stderr, " --language CODE Required for t3_mtl GGUFs. Tier-1: en, es, fr, de, it,\n");
fprintf(stderr, " pt, nl, pl, tr, sv, da, fi, no, el, ms, sw, ar, ko.\n");
fprintf(stderr, "\n");
// Streaming synthesis and CFM step-count knobs.
fprintf(stderr, " --stream-chunk-tokens N Synthesize the wav in streaming chunks of N speech\n");
fprintf(stderr, " tokens each (~1 s audio per 25-token chunk). With\n");
fprintf(stderr, " --out PATH.wav, the concatenated wav is written at the\n");
fprintf(stderr, " end; with --out -, each chunk's PCM is piped to stdout\n");
fprintf(stderr, " as soon as it's produced. No per-chunk files are\n");
fprintf(stderr, " written. Requires --s3gen-gguf. (default: 0 = batch)\n");
fprintf(stderr, " --stream-first-chunk-tokens N Override first-chunk size to minimise first-audio\n");
fprintf(stderr, " latency. Typical value: 10-15. (default: 0 = same\n");
fprintf(stderr, " as --stream-chunk-tokens)\n");
fprintf(stderr, " --stream-cfm-steps N CFM Euler step count per chunk. Python uses 2 for\n");
fprintf(stderr, " meanflow; 1 halves CFM cost. (default: 0 = 2)\n");
fprintf(stderr, " --cfm-steps N Non-streaming CFM Euler step count. Multilingual's\n");
fprintf(stderr, " standard CFM ships at 10 steps; lower (e.g. 7-8)\n");
fprintf(stderr, " trades small audio quality for proportional S3Gen\n");
fprintf(stderr, " speedup. Turbo's meanflow defaults to 2 steps.\n");
fprintf(stderr, " See PROGRESS.md §3.21 for the quality knee sweep.\n");
fprintf(stderr, " (default: 0 = GGUF's n_timesteps)\n");
fprintf(stderr, " --cfm-f16-kv-attn Experimental: CFM flash-attn uses F32 Q + F16 K/V so\n");
fprintf(stderr, " OpenCL/Adreno can dispatch flash_attn_f32_f16.\n");
fprintf(stderr, "\n");
// Live / incremental text input.
fprintf(stderr, " --input-file PATH Stream text from PATH as another process writes to it.\n");
fprintf(stderr, " tail -f semantics: each complete sentence (ending in\n");
fprintf(stderr, " . ! ? or newline) is tokenised and synthesised the\n");
fprintf(stderr, " moment it arrives; audio streams chunk-by-chunk to\n");
fprintf(stderr, " stdout as raw s16le @ 24 kHz mono. Use '-' to read\n");
fprintf(stderr, " from stdin instead of a file; on a TTY this gives an\n");
fprintf(stderr, " interactive prompt where each Enter-terminated line\n");
fprintf(stderr, " is spoken immediately (Ctrl-D exits). Runs until\n");
fprintf(stderr, " SIGINT or stdin EOF / --input-eof-marker. Requires\n");
fprintf(stderr, " --s3gen-gguf, --stream-chunk-tokens > 0, --out -.\n");
fprintf(stderr, " Exclusive with --text / --tokens-file.\n");
fprintf(stderr, " --input-eof-marker STR When this string is seen in the input, flush any\n");
fprintf(stderr, " preceding text, synthesise it, and exit cleanly.\n");
fprintf(stderr, " (default: none = run until SIGINT)\n");
fprintf(stderr, " --input-by-line Treat one newline-terminated line as one request.\n");
fprintf(stderr, " . ! ? inside a line no longer split it into multiple\n");
fprintf(stderr, " synthesis runs (and the 150 ms gap that goes with them);\n");
fprintf(stderr, " the full line is sent to T3 as a single utterance.\n");
fprintf(stderr, " Ideal when each upstream message is one 'request' and\n");
fprintf(stderr, " internal punctuation is meant as prosody, not as a\n");
fprintf(stderr, " hard boundary.\n");
fprintf(stderr, "\n");
// Auto-split of long --text input.
fprintf(stderr, " --max-sentence-chars N Split --text into segments of at most N chars, running\n");
fprintf(stderr, " T3+S3Gen+HiFT per segment and concatenating the PCM with\n");
fprintf(stderr, " a raised-cosine crossfade. Works around Chatterbox Turbo's\n");
fprintf(stderr, " degradation on > ~15 s outputs. (default: 180)\n");
fprintf(stderr, " --no-auto-split Disable the above (single-shot T3 over the full text).\n");
fprintf(stderr, " --crossfade-ms N Crossfade length between segments in ms. (default: 30)\n");
fprintf(stderr, " -h, --help\n");
}
static bool parse_args(int argc, char ** argv, cli_params & params) {
for (int i = 1; i < argc; ++i) {
const std::string arg = argv[i];
auto next = [&](const char * flag) -> const char * {
if (i + 1 >= argc) { fprintf(stderr, "error: %s requires an argument\n", flag); return nullptr; }
return argv[++i];
};
// Safe numeric parsers: turn std::stoi / std::stof "no conversion"
// exceptions into a user-friendly error + clean exit. Catches the
// common mistake of a flag being followed by *another flag* because
// the intended value was forgotten (e.g. `--n-gpu-layers --out ...`).
auto parse_int = [&](const char * flag, int32_t & out) -> bool {
auto v = next(flag);
if (!v) return false;
try {
size_t pos = 0;
long long n = std::stoll(v, &pos);
if (pos != std::strlen(v)) throw std::invalid_argument("trailing garbage");
out = (int32_t) n;
return true;
} catch (const std::exception & e) {
fprintf(stderr, "error: %s expects an integer value, got '%s' (%s)\n", flag, v, e.what());
return false;
}
};
auto parse_float = [&](const char * flag, float & out) -> bool {
auto v = next(flag);
if (!v) return false;
try {
size_t pos = 0;
float f = std::stof(v, &pos);
if (pos != std::strlen(v)) throw std::invalid_argument("trailing garbage");
out = f;
return true;
} catch (const std::exception & e) {
fprintf(stderr, "error: %s expects a number, got '%s' (%s)\n", flag, v, e.what());
return false;
}
};
if (arg == "--model") { auto v = next("--model"); if (!v) return false; params.model = v; }
else if (arg == "--text") { auto v = next("--text"); if (!v) return false; params.text = v; }
else if (arg == "--tokens-file") { auto v = next("--tokens-file"); if (!v) return false; params.tokens_file = v; }
else if (arg == "--output") { auto v = next("--output"); if (!v) return false; params.output = v; }
else if (arg == "--s3gen-gguf") { auto v = next("--s3gen-gguf"); if (!v) return false; params.s3gen_gguf = v; }
else if (arg == "--out") { auto v = next("--out"); if (!v) return false; params.out_wav = v; }
else if (arg == "--ref-dir") { auto v = next("--ref-dir"); if (!v) return false; params.ref_dir = v; }
else if (arg == "--reference-audio"){ auto v = next("--reference-audio");if (!v) return false; params.reference_audio = v; }
else if (arg == "--save-voice") { auto v = next("--save-voice"); if (!v) return false; params.save_voice_dir = v; }
else if (arg == "--debug") { params.debug = true; }
else if (arg == "--verbose" || arg == "-v") { params.verbose = true; }
else if (arg == "--seed") { if (!parse_int ("--seed", params.seed)) return false; }
else if (arg == "--threads") { if (!parse_int ("--threads", params.n_threads)) return false; }
else if (arg == "--n-predict") { if (!parse_int ("--n-predict", params.n_predict)) return false; }
else if (arg == "--context") { if (!parse_int ("--context", params.n_ctx)) return false; }
else if (arg == "--n-gpu-layers") { if (!parse_int ("--n-gpu-layers", params.n_gpu_layers)) return false; }
else if (arg == "--top-k") { if (!parse_int ("--top-k", params.top_k)) return false; }
else if (arg == "--top-p") { if (!parse_float("--top-p", params.top_p)) return false; }
else if (arg == "--temp") { if (!parse_float("--temp", params.temp)) return false; }
else if (arg == "--repeat-penalty") { if (!parse_float("--repeat-penalty", params.repeat_penalty)) return false; }
else if (arg == "--min-p") {
if (!parse_float("--min-p", params.min_p)) return false;
if (params.min_p < 0.0f || params.min_p > 1.0f) {
fprintf(stderr, "error: --min-p must be in [0, 1] (got %g)\n", (double) params.min_p);
return false;
}
}
else if (arg == "--cfg-weight") {
if (!parse_float("--cfg-weight", params.cfg_weight)) return false;
if (params.cfg_weight < 0.0f) {
fprintf(stderr, "error: --cfg-weight must be >= 0 (got %g)\n", (double) params.cfg_weight);
return false;
}
}
else if (arg == "--exaggeration") {
if (!parse_float("--exaggeration", params.exaggeration)) return false;
if (params.exaggeration < 0.0f || params.exaggeration > 1.0f) {
fprintf(stderr, "error: --exaggeration must be in [0, 1] (got %g)\n", (double) params.exaggeration);
return false;
}
}
else if (arg == "--language") { auto v = next("--language"); if (!v) return false; params.language = v; }
else if (arg == "--cfm-f16-kv-attn") { params.cfm_f16_kv_attn = true; }
else if (arg == "--max-sentence-chars") { if (!parse_int("--max-sentence-chars", params.max_sentence_chars)) return false; }
else if (arg == "--no-auto-split") { params.max_sentence_chars = 0; }
else if (arg == "--crossfade-ms") { if (!parse_int("--crossfade-ms", params.crossfade_ms)) return false; }
else if (arg == "--stream-chunk-tokens") { if (!parse_int("--stream-chunk-tokens", params.stream_chunk_tokens)) return false; }
else if (arg == "--stream-first-chunk-tokens") { if (!parse_int("--stream-first-chunk-tokens", params.stream_first_chunk_tokens)) return false; }
else if (arg == "--stream-cfm-steps") { if (!parse_int("--stream-cfm-steps", params.stream_cfm_steps)) return false; }
else if (arg == "--cfm-steps") { if (!parse_int("--cfm-steps", params.cfm_steps)) return false; }
else if (arg == "--input-file") { auto v = next("--input-file"); if (!v) return false; params.input_file = v; }
else if (arg == "--input-eof-marker") { auto v = next("--input-eof-marker"); if (!v) return false; params.input_eof_marker = v; }
else if (arg == "--input-by-line") { params.input_by_line = true; }
else if (arg == "--dump-tokens-only") { params.dump_tokens_only = true; }
else if (arg == "-h" || arg == "--help") { print_usage(argv[0]); std::exit(0); }
else {
// Surface two common shell typos that would otherwise produce
// cryptic messages: (a) an argument that's entirely whitespace
// — symptom of `\<space>` at end of a continuation line, the
// backslash escapes the space instead of the newline; (b) a
// leading backslash on the arg itself, symptom of the same
// thing on the previous line.
bool all_ws = !arg.empty();
for (char c : arg) if (!std::isspace((unsigned char)c)) { all_ws = false; break; }
if (all_ws) {
fprintf(stderr, "error: empty / whitespace-only argument at position %d. "
"This usually means you have a trailing space after '\\' at the "
"end of a continuation line — remove it so the shell treats the "
"next newline as the line break.\n", i);
} else if (!arg.empty() && arg[0] == '\\') {
fprintf(stderr, "error: argument starts with a backslash: %s\n "
"You probably have a trailing space after '\\' on the *previous* "
"line, which escaped the space instead of the newline. Remove "
"the trailing space so the next line is treated as a continuation.\n",
arg.c_str());
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
}
return false;
}
}
if (params.dump_tokens_only) {
if (params.text.empty()) {
fprintf(stderr, "error: --dump-tokens-only requires --text\n");
return false;
}
return true;
}
// Bake-only mode: just save the 5 voice tensors and exit.
const bool bake_only = !params.save_voice_dir.empty()
&& !params.reference_audio.empty()
&& params.text.empty()
&& params.tokens_file.empty();
// If we're only doing the S3Gen+HiFT back half (user already has speech tokens),
// --model (T3) is optional; otherwise it's required.
const bool skip_t3 = !params.s3gen_gguf.empty() && !params.tokens_file.empty() && params.text.empty();
if (!skip_t3 && !bake_only && params.model.empty()) {
fprintf(stderr, "error: --model is required (pass --s3gen-gguf + --tokens-file to skip T3, "
"or --save-voice + --reference-audio to bake only)\n");
return false;
}
if (!bake_only && params.text.empty() && params.tokens_file.empty() && params.input_file.empty()) {
fprintf(stderr, "error: one of --text / --tokens-file / --input-file is required "
"(or --save-voice + --reference-audio to bake a voice profile without synthesising)\n");
return false;
}
if (!params.input_file.empty()) {
if (params.s3gen_gguf.empty()) {
fprintf(stderr, "error: --input-file requires --s3gen-gguf\n"); return false;
}
if (params.stream_chunk_tokens <= 0) {
fprintf(stderr, "error: --input-file requires --stream-chunk-tokens > 0\n"); return false;
}
if (params.out_wav != "-") {
fprintf(stderr, "error: --input-file requires --out - (stream raw PCM to stdout)\n"); return false;
}
if (!params.text.empty() || !params.tokens_file.empty()) {
fprintf(stderr, "error: --input-file is mutually exclusive with --text / --tokens-file\n"); return false;
}
}
if (!params.s3gen_gguf.empty() && !bake_only && params.out_wav.empty()) {
fprintf(stderr, "error: --s3gen-gguf requires --out PATH.wav\n"); return false;
}
if (params.debug && params.ref_dir.empty()) {
fprintf(stderr, "error: --debug requires --ref-dir\n"); return false;
}
return true;
}
// --------------------------------------------------------------------------
// I/O helpers
// --------------------------------------------------------------------------
// Read a list of speech-token ids from the text file at `path`.
//
// Entries are decimal integers separated by commas and/or whitespace (the
// layout emitted by write_token_file). A file with no entries yields an
// empty vector; the caller decides whether that is an error.
//
// Throws std::runtime_error when the file cannot be opened, or when any
// entry is not a complete decimal integer that fits in int32_t. Rejecting
// bad entries matters: plain stream extraction would stop at the first
// malformed value and hand back a silently truncated token list.
static std::vector<int32_t> read_token_file(const std::string & path) {
    std::ifstream fin(path);
    if (!fin) throw std::runtime_error("failed to open token file: " + path);
    std::string raw((std::istreambuf_iterator<char>(fin)), std::istreambuf_iterator<char>());
    // Treat commas as plain separators so the whitespace tokeniser below
    // handles "1,2,3", "1, 2, 3" and one-token-per-line files alike.
    for (char & ch : raw) if (ch == ',') ch = ' ';
    std::vector<int32_t> tokens;
    std::stringstream ss(raw);
    std::string word;
    while (ss >> word) {
        size_t used = 0;
        long long value = 0;
        try {
            value = std::stoll(word, &used);
        } catch (const std::exception &) {
            // Not a number at all, or too large even for long long;
            // reported through the uniform check below.
            used = 0;
        }
        if (used != word.size() || value < INT32_MIN || value > INT32_MAX) {
            throw std::runtime_error("malformed token '" + word + "' in token file: " + path);
        }
        tokens.push_back(static_cast<int32_t>(value));
    }
    return tokens;
}
// Write `tokens` to the text file at `path` as one comma-separated line
// terminated by '\n' (an empty vector produces just the newline). The file
// is created or truncated.
//
// Throws std::runtime_error when the file cannot be opened or when writing
// fails.
static void write_token_file(const std::string & path, const std::vector<int32_t> & tokens) {
    std::ofstream fout(path);
    if (!fout) throw std::runtime_error("failed to open output file: " + path);
    for (size_t i = 0; i < tokens.size(); ++i) {
        if (i) fout << ',';
        fout << tokens[i];
    }
    fout << '\n';
    // Flush explicitly: output is buffered, and the destructor discards any
    // error hit while draining the buffer (disk full, I/O error), which would
    // leave a truncated token file behind without any diagnostic.
    fout.flush();
    if (!fout) throw std::runtime_error("failed to write output file: " + path);
}
// --------------------------------------------------------------------------
// CLI entry point
// --------------------------------------------------------------------------
int tts_cpp_cli_main(int argc, char ** argv) {
ggml_time_init();
cli_params params;
if (!parse_args(argc, argv, params)) {
// Don't dump the full usage here — parse_args already printed the
// specific error (missing / malformed value, unknown flag). Dumping
// ~90 lines of option descriptions below it just pushes the actual
// message off-screen. Point users at --help if they want it.
fprintf(stderr, "Run `%s --help` for the full list of options.\n", argv[0]);
return 1;
}
// Apply the log filter BEFORE any ggml_backend_*_init() runs, otherwise
// Metal / Vulkan device-init messages leak out.
g_log_verbose = params.verbose ? 1 : 0;
ggml_log_set(chatterbox_log_cb, nullptr);
try {
// Early preflight: if the user supplied --reference-audio, make sure
// it's long enough for real voice cloning. Bail out now with a clear
// message instead of silently falling back on the built-in voice when
// the conditioning tensors come out undersized.
if (!params.reference_audio.empty()) {
if (!validate_reference_audio(params.reference_audio)) return 1;
}
// Bake-only mode: user passed --reference-audio + --save-voice but no
// text to synthesise. Compute the five voice tensors, dump them, and
// exit. Later runs can reuse with --ref-dir DIR (no preprocessing).
if (!params.save_voice_dir.empty()
&& !params.reference_audio.empty()
&& params.text.empty()
&& params.tokens_file.empty()) {
if (params.model.empty() || params.s3gen_gguf.empty()) {
fprintf(stderr, "error: --save-voice needs both --model and --s3gen-gguf\n");
return 1;
}
// Peek cond_prompt_len out of the T3 GGUF metadata (no weight load).
int cond_prompt_len = 375; // Turbo default
{
gguf_init_params gp = { /*.no_alloc=*/ true, /*.ctx=*/ nullptr };
gguf_context * g = gguf_init_from_file(params.model.c_str(), gp);
if (g) {
int64_t id = gguf_find_key(g, KEY_COND_PROMPT_LEN);
if (id >= 0) cond_prompt_len = (int)gguf_get_val_u32(g, id);
gguf_free(g);
}
}
// Voice-cloning preprocessing shares a backend: on Mac we pick
// Metal, on Linux + NVIDIA we pick CUDA / Vulkan. Falls back to
// the ggml-cpu NEON/AVX kernels when n_gpu_layers == 0.
ggml_backend_t vc_backend = init_backend(params.n_gpu_layers);
// (1) speaker_emb via VoiceEncoder (3-layer LSTM + proj + L2-norm
// on the chosen backend).
std::vector<float> se_bake;
{
const int64_t _t0 = ggml_time_us();
voice_encoder_weights vew;
if (voice_encoder_load(params.model, vew)) {
std::vector<float> wav; int sr = 0;
if (!wav_load(params.reference_audio, wav, sr))
throw std::runtime_error("failed to load --reference-audio");
normalise_lufs(wav, sr, -27.0);
if (sr != 16000) wav = resample_sinc(wav, sr, 16000);
if (!voice_encoder_embed(wav, vew, vc_backend, se_bake))
throw std::runtime_error("VoiceEncoder forward failed");
}
fprintf(stderr, "BENCH: VC_STAGE_speaker_emb_ms=%lld\n", (long long)((ggml_time_us() - _t0)/1000));
}
// (2 + 4) cond_prompt_speech_tokens + prompt_token via S3TokenizerV2.
std::vector<int32_t> pt_bake, ct_bake;
{
const int64_t _t0 = ggml_time_us();
(void)compute_speech_tokens_native(params.reference_audio, params.s3gen_gguf,
cond_prompt_len, pt_bake, ct_bake,
params.n_threads, vc_backend,
params.verbose);
fprintf(stderr, "BENCH: VC_STAGE_s3tokenizer_ms=%lld\n", (long long)((ggml_time_us() - _t0)/1000));
}
// (3) embedding via CAMPPlus.
std::vector<float> emb_bake;
{
const int64_t _t0 = ggml_time_us();
(void)compute_embedding_native(params.reference_audio, params.s3gen_gguf,
emb_bake, vc_backend, params.verbose);
fprintf(stderr, "BENCH: VC_STAGE_campplus_ms=%lld\n", (long long)((ggml_time_us() - _t0)/1000));
}
// (5) prompt_feat via mel_extract_24k_80.
std::vector<float> pf_bake;
int pf_rows = 0;
{
const int64_t _t0 = ggml_time_us();
(void)compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
pf_bake, pf_rows, params.verbose);
fprintf(stderr, "BENCH: VC_STAGE_prompt_feat_ms=%lld\n", (long long)((ggml_time_us() - _t0)/1000));
}
save_voice_profile(params.save_voice_dir,
se_bake, ct_bake, emb_bake, pt_bake, pf_bake, pf_rows);
fprintf(stderr,
"done: voice profile written to %s. Reuse it with "
"--ref-dir %s (no --reference-audio needed).\n",
params.save_voice_dir.c_str(), params.save_voice_dir.c_str());
ggml_backend_free(vc_backend);
return 0;
}
// Short-circuit: user gave us speech tokens directly + --s3gen-gguf. Skip T3 entirely.
if (params.model.empty() && !params.s3gen_gguf.empty() && !params.tokens_file.empty()) {
std::vector<int32_t> speech_tokens = read_token_file(params.tokens_file);
if (speech_tokens.empty()) throw std::runtime_error("empty speech tokens file");
s3gen_synthesize_opts opts;
opts.s3gen_gguf_path = params.s3gen_gguf;
opts.out_wav_path = params.out_wav;
opts.ref_dir = params.ref_dir;
opts.seed = params.seed;
opts.n_threads = params.n_threads;
opts.debug = params.debug;
opts.verbose = params.verbose;
opts.n_gpu_layers = params.n_gpu_layers;
opts.cfm_steps = params.cfm_steps;
opts.cfm_f16_kv_attn = params.cfm_f16_kv_attn;
if (!params.reference_audio.empty()) {
if (!compute_prompt_feat_native(params.reference_audio, params.s3gen_gguf,
opts.prompt_feat_override,
opts.prompt_feat_rows_override,
params.verbose))
throw std::runtime_error("failed to compute prompt_feat from --reference-audio");
// Best-effort: try to compute the S3Gen `embedding` natively too.
// Falls through to ref_dir/embedding.npy if the s3gen GGUF is pre-A1-2d-a.
(void)compute_embedding_native(params.reference_audio, params.s3gen_gguf,
opts.embedding_override,
/*backend=*/nullptr, params.verbose);
// And the S3Gen-side prompt_token via S3TokenizerV2 (Phase 2e).
// No backend available in this path yet (we haven't loaded T3);
// fall back to ggml-cpu. Callers going through the bake path
// above or the main T3 path below pass the real backend.
std::vector<int32_t> dummy_cond;
(void)compute_speech_tokens_native(params.reference_audio, params.s3gen_gguf,
/*max_cond_tokens=*/-1,
opts.prompt_token_override, dummy_cond,
params.n_threads, /*backend=*/nullptr,
params.verbose);
}
return s3gen_synthesize_to_wav(speech_tokens, opts);
}
// Load model first so we can use the GGUF-embedded tokenizer (if any).
chatterbox_model model;
const int64_t _t3_load_t0 = ggml_time_us();
if (!load_model_gguf(params.model, model, params.n_ctx, params.n_gpu_layers)) return 1;
const int64_t _t3_load_ms = (ggml_time_us() - _t3_load_t0) / 1000;
fprintf(stderr, "BENCH: T3_LOAD_MS=%lld\n", (long long)_t3_load_ms);
// Warm the S3Gen GGUF cache in the background while T3 inference
// runs. This cuts first-audio-out latency by ~700 ms in streaming
// mode — by the time T3 emits its first chunk of tokens, S3Gen is
// already in RAM with its tensors allocated on the right backend.
std::thread s3gen_preload_thread;
if (!params.s3gen_gguf.empty()) {
s3gen_preload_thread = std::thread([path = params.s3gen_gguf,
ngpu = params.n_gpu_layers]() {
s3gen_preload(path, ngpu);
});
}
// Voice-profile override on the T3 side. We resolve two tensors
// independently:
//
// speaker_emb — take from ref_dir/speaker_emb.npy if
// available, otherwise compute in C++ from
// --reference-audio via VoiceEncoder.
// cond_prompt_tokens — take from ref_dir/cond_prompt_speech_tokens.npy
// if available, otherwise compute in C++ from
// --reference-audio via S3TokenizerV2 (needs
// --s3gen-gguf).
//
// The S3Gen side is overridden later inside s3gen_synthesize_to_wav.
bool have_se = false, have_ct = false;
std::vector<float> se_data;
std::vector<int32_t> ct_data;
if (!params.ref_dir.empty()) {
const std::string se_path = params.ref_dir + "/speaker_emb.npy";
const std::string ct_path = params.ref_dir + "/cond_prompt_speech_tokens.npy";
if (file_exists(se_path)) {
npy_array se = npy_load(se_path);
se_data.assign((const float *)se.data.data(),
(const float *)se.data.data() + se.n_elements());
have_se = true;
}
if (file_exists(ct_path)) {
npy_array ct = npy_load(ct_path);
ct_data.assign((const int32_t *)ct.data.data(),
(const int32_t *)ct.data.data() + ct.n_elements());
have_ct = true;
}
}
if (!have_se && !params.reference_audio.empty()) {
voice_encoder_weights vew;
if (voice_encoder_load(params.model, vew)) {
if (params.verbose) fprintf(stderr, "voice_encoder: computing speaker_emb from %s\n",
params.reference_audio.c_str());
std::vector<float> wav;
int sr = 0;
if (!wav_load(params.reference_audio, wav, sr))
throw std::runtime_error("failed to load --reference-audio for VoiceEncoder");
normalise_lufs(wav, sr, -27.0);
if (sr != 16000) wav = resample_sinc(wav, sr, 16000);
// Reuse the T3 backend — already loaded & sitting on the GPU
// at this point in the flow.
if (!voice_encoder_embed(wav, vew, model.backend, se_data))
throw std::runtime_error("VoiceEncoder forward failed");
have_se = true;
} else {
fprintf(stderr,
"voice_encoder: T3 GGUF has no VE weights; cannot synthesise speaker_emb natively "
"(re-run scripts/convert-t3-turbo-to-gguf.py)\n");
}
}
// Speech-token overrides: compute both cond_prompt_speech_tokens
// (T3 side) and prompt_token (S3Gen side, stashed for later) via
// the C++ S3TokenizerV2 port if --reference-audio is given and the
// s3gen GGUF has the tokenizer weights (Phase 2e).
std::vector<int32_t> prompt_token_from_ref;
bool ct_from_cpp = false;
if (!have_ct && !params.reference_audio.empty() && !params.s3gen_gguf.empty()) {
std::vector<int32_t> cond_tokens;
if (compute_speech_tokens_native(params.reference_audio, params.s3gen_gguf,
/*max_cond_tokens=*/model.hparams.cond_prompt_len,
prompt_token_from_ref, cond_tokens,
params.n_threads,
/*backend=*/model.backend,
params.verbose)) {
ct_data = std::move(cond_tokens);
have_ct = true;
ct_from_cpp = true;
}
}
if (have_se) {
if ((int64_t)se_data.size() != ggml_nelements(model.builtin_speaker_emb)) {
fprintf(stderr,
"error: speaker_emb has %zu elements but builtin_speaker_emb expects %lld\n",
se_data.size(), (long long)ggml_nelements(model.builtin_speaker_emb));
return 1;
}
ggml_backend_tensor_set(model.builtin_speaker_emb,
se_data.data(), 0, ggml_nbytes(model.builtin_speaker_emb));
}
if (have_ct) {
if ((int64_t)ct_data.size() == ggml_nelements(model.builtin_cond_prompt_tokens)) {
ggml_backend_tensor_set(model.builtin_cond_prompt_tokens,
ct_data.data(), 0,
ggml_nbytes(model.builtin_cond_prompt_tokens));
} else {
ggml_init_params op = { ggml_tensor_overhead() * 2, nullptr, true };
model.ctx_override = ggml_init(op);
if (!model.ctx_override) throw std::runtime_error("ggml_init(ctx_override) failed");
ggml_tensor * new_ct = ggml_new_tensor_1d(model.ctx_override, GGML_TYPE_I32, (int64_t)ct_data.size());
ggml_set_name(new_ct, "chatterbox/builtin/cond_prompt_speech_tokens_override");
model.buffer_override = ggml_backend_alloc_ctx_tensors(model.ctx_override, model.backend);
if (!model.buffer_override) throw std::runtime_error("alloc override buffer failed");
ggml_backend_tensor_set(new_ct, ct_data.data(), 0, ct_data.size() * sizeof(int32_t));
model.builtin_cond_prompt_tokens = new_ct;
model.hparams.cond_prompt_len = (int32_t)ct_data.size();
}
}
if (have_se || have_ct) {
if (params.verbose) {
fprintf(stderr,
"%s: T3 voice override — speaker_emb=%s, cond_prompt_tokens=%s\n",
__func__,
have_se ? (params.reference_audio.empty() ? "ref_dir" : "C++ VoiceEncoder") : "built-in",
have_ct ? (ct_from_cpp ? "C++ S3TokenizerV2" : "ref_dir") : "built-in");
}
} else if (!params.ref_dir.empty() || !params.reference_audio.empty()) {