Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions src/acoustic_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,4 +351,58 @@ struct ggml_tensor* decoder_forward(struct ggml_context* ctx,
return y;
}

namespace {

// Cache-backed (streaming) counterpart of convtr_step: applies one causal
// transposed-conv upsampling step, forwarding the kernel, bias and stride
// held in `w` and letting the op keep its left context in `cache` under
// `layer_id`.
struct ggml_tensor* convtr_step_streaming(struct ggml_context* ctx,
                                          struct ggml_tensor* x,
                                          const StridedConvWeights& w,
                                          StreamingCache& cache,
                                          const std::string& layer_id) {
    struct ggml_tensor* upsampled = sconv_transpose1d_causal_streaming(
        ctx, x, w.kernel, w.bias, w.stride, cache, layer_id);
    return upsampled;
}

}  // namespace

// Builds the streaming decoder graph for one latent chunk `z` and returns the
// node produced by the head conv. Every streaming op is given a cache key:
//   "dec.stem", "dec.stage_<i>_block_<j>", "dec.up_<i>", "dec.head".
struct ggml_tensor* decoder_forward_streaming(struct ggml_context* ctx,
                                              struct ggml_tensor* z,
                                              const DecoderWeights& w,
                                              const AcousticConfig& cfg,
                                              StreamingCache& cache) {
    // Scratch buffer for the per-layer cache keys.
    char layer_name[80];

    // Runs all blocks of decoder stage `stage` over `t`, in order.
    const auto run_stage = [&](size_t stage, struct ggml_tensor* t) {
        const auto& blocks = w.stages[stage];
        for (size_t blk = 0; blk < blocks.size(); ++blk) {
            std::snprintf(layer_name, sizeof(layer_name),
                          "dec.stage_%zu_block_%zu", stage, blk);
            t = block1d_forward_streaming(ctx, t, blocks[blk], cfg.eps, cache, layer_name);
        }
        return t;
    };

    // Stem conv followed by the stage-0 blocks.
    struct ggml_tensor* h = sconv1d_causal_streaming(
        ctx, z, w.stem.kernel, w.stem.bias, w.stem.stride,
        /*dilation=*/1, /*groups=*/1, cache, "dec.stem");
    h = run_stage(0, h);

    // Each later stage is preceded by its transposed-conv upsampler.
    const size_t n_stages = cfg.depths.size();
    for (size_t stage = 1; stage < n_stages; ++stage) {
        std::snprintf(layer_name, sizeof(layer_name), "dec.up_%zu", stage);
        h = convtr_step_streaming(ctx, h, w.ups[stage - 1], cache, layer_name);
        h = run_stage(stage, h);
    }

    // Optional final RMS norm: swap axes 0 and 1, normalize, scale by the
    // learned weight, then swap back (presumably so the norm runs across
    // channels rather than time -- confirm against decoder_forward).
    if (w.final_norm) {
        struct ggml_tensor* swapped = ggml_cont(ctx, ggml_permute(ctx, h, 1, 0, 2, 3));
        struct ggml_tensor* normed =
            ggml_mul(ctx, ggml_rms_norm(ctx, swapped, cfg.eps), w.final_norm);
        h = ggml_cont(ctx, ggml_permute(ctx, normed, 1, 0, 2, 3));
    }

    return sconv1d_causal_streaming(
        ctx, h, w.head.kernel, w.head.bias, w.head.stride,
        /*dilation=*/1, /*groups=*/1, cache, "dec.head");
}

} // namespace vv
12 changes: 12 additions & 0 deletions src/acoustic_tokenizer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,18 @@ struct ggml_tensor* encoder_forward_streaming(struct ggml_context* ctx,
const AcousticConfig& cfg,
StreamingCache& cache);

// Streaming decoder: same math as decoder_forward, but every causal conv
// (stem, per-block depthwise mixers, the transposed upsamplers, and the
// head) reads/writes its left context through `cache`. Driven in chunk
// order with cache.is_first_chunk / is_final_chunk set by the caller, the
// concatenated per-chunk audio is bit-exact with a single-shot decode while
// keeping peak activation memory bounded to one chunk's worth of frames.
//
//   ctx   - ggml context in which the graph nodes are created.
//   z     - latent chunk for this call (the in-tree caller passes a
//           [chunk_frames, vae_dim, 1] F32 tensor).
//   w     - decoder weights (stem, stages, ups, optional final_norm, head).
//   cfg   - supplies `eps` for the norms and `depths` for the stage count.
//   cache - per-layer streaming state, keyed "dec.stem",
//           "dec.stage_<i>_block_<j>", "dec.up_<i>" and "dec.head".
//
// Returns the graph node of the head conv output; the caller is responsible
// for building and running the graph.
struct ggml_tensor* decoder_forward_streaming(struct ggml_context* ctx,
struct ggml_tensor* z,
const DecoderWeights& w,
const AcousticConfig& cfg,
StreamingCache& cache);

} // namespace vv

#endif // VIBEVOICE_ACOUSTIC_TOKENIZER_HPP
73 changes: 73 additions & 0 deletions src/conv1d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,77 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
return maybe_add_bias_t(ctx, y, bias);
}

// Streaming causal transposed conv. Prepends `history` cached input frames to
// `x`, runs the transposed conv on the concatenation, keeps only the output
// samples that belong to the new frames, and registers the trailing input
// frames in cache[layer_id] for the next chunk.
struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
                                                       struct ggml_tensor* x,
                                                       struct ggml_tensor* kernel,
                                                       struct ggml_tensor* bias,
                                                       int stride,
                                                       StreamingCache& cache,
                                                       const std::string& layer_id) {
    const int n_frames   = static_cast<int>(x->ne[0]);
    const int n_channels = static_cast<int>(x->ne[1]);
    const int n_batch    = static_cast<int>(x->ne[2]);
    const int kernel_len = static_cast<int>(kernel->ne[0]);

    // An output sample at position p only receives contributions from input
    // frames t with p - (K-1) <= t*stride <= p. The first kept sample sits at
    // history*stride, so it reaches back at most (K-1)/stride frames and
    // ceil((K-1)/stride) frames of history are sufficient for the kept region
    // to match a single-shot pass.
    const int history = (kernel_len - 1 + stride - 1) / stride;

    auto& slot = cache[layer_id];
    slot.T = history;
    slot.C = n_channels;

    // The prefix is a leaf the caller fills after allocation: zeros on the
    // first chunk, the previous chunk's trailing `history` frames afterwards.
    struct ggml_tensor* padded = x;
    slot.prefix = nullptr;
    if (history > 0) {
        slot.prefix = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, history, n_channels, n_batch);
        ggml_set_name(slot.prefix, ("cache_prefix_" + layer_id).c_str());
        padded = ggml_concat(ctx, slot.prefix, x, /*dim=*/0);
    }

    // ggml-cuda's conv_transpose_1d requires F32 kernels (see the single-shot
    // path), so always feed F32 and let one graph serve CPU and CUDA alike.
    struct ggml_tensor* kernel_f32 = kernel;
    if (kernel->type != GGML_TYPE_F32) {
        kernel_f32 = ggml_cast(ctx, kernel, GGML_TYPE_F32);
    }

    struct ggml_tensor* full =
        ggml_conv_transpose_1d(ctx, kernel_f32, padded, stride, /*p0=*/0, /*d0=*/1);

    // `full` has (history + n_frames - 1) * stride + K samples. The new frames
    // own [history*stride, (history + n_frames)*stride); cutting there drops
    // the warm-up prefix and performs the causal right-trim in one slice.
    const int64_t keep_from = static_cast<int64_t>(history) * stride;
    const int64_t keep_len  = static_cast<int64_t>(n_frames) * stride;
    struct ggml_tensor* kept = ggml_cont(
        ctx,
        ggml_view_3d(ctx, full,
                     /*ne0=*/keep_len,
                     /*ne1=*/full->ne[1],
                     /*ne2=*/full->ne[2],
                     /*nb1=*/full->nb[1],
                     /*nb2=*/full->nb[2],
                     /*offset=*/static_cast<size_t>(keep_from) * full->nb[0]));

    // `padded` holds history + n_frames frames, so its last `history` frames
    // start at index n_frames. Expose them for the caller to copy out once the
    // graph has been computed.
    if (history > 0) {
        struct ggml_tensor* tail = ggml_view_3d(
            ctx, padded,
            /*ne0=*/history, /*ne1=*/n_channels, /*ne2=*/n_batch,
            /*nb1=*/padded->nb[1], /*nb2=*/padded->nb[2],
            /*offset=*/static_cast<size_t>(n_frames) * padded->nb[0]);
        slot.next_view = ggml_cont(ctx, tail);
    }

    return maybe_add_bias_t(ctx, kept, bias);
}

} // namespace vv
16 changes: 16 additions & 0 deletions src/conv1d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,22 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
struct ggml_tensor* bias, // [C_out] or null
int stride);

// Streaming variant of sconv_transpose1d_causal. Mirrors
// sconv1d_causal_streaming: prepends `context = ceil((K-1)/stride)` input
// frames from the cache (zeros on the first chunk), runs the transposed
// conv on the concatenated input, then slices the output region that
// belongs to the NEW frames ([context*stride, (context+T_in)*stride)) so
// the per-chunk outputs concatenate bit-exact with a single-shot pass.
// Registers a view of the last `context` input frames as
// cache[layer_id].next_view for the caller to copy out post-compute.
//
// Side effects on cache[layer_id]: sets T = context and C = C_in, and stores
// the prefix leaf tensor (named "cache_prefix_<layer_id>", or null when
// context == 0) in `prefix`. The prefix leaf is only created here; the caller
// must upload its contents after tensor allocation and before computing the
// graph. A non-F32 kernel is cast to F32 inside the graph.
//
// Returns the graph node for the trimmed output with `bias` applied via
// maybe_add_bias_t.
struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
struct ggml_tensor* x, // [T, C_in, B]
struct ggml_tensor* kernel, // [K, C_out, C_in]
struct ggml_tensor* bias, // [C_out] or null
int stride,
StreamingCache& cache,
const std::string& layer_id);

} // namespace vv

#endif // VIBEVOICE_CONV1D_HPP
148 changes: 144 additions & 4 deletions src/vibevoice_tts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>
#include <regex>
Expand Down Expand Up @@ -575,10 +576,14 @@ void add_input_type_embedding(const VibeVoiceConfig& cfg,
// `scaled_latents` is a packed buffer of vae_dim * n_frames floats with time
// as the fastest-varying index (data[d * n_frames + t]), matching what
// `ggml_new_tensor_3d(ctx, F32, n_frames, vae_dim, 1)` expects when
// ne[0] = n_frames is the contiguous dim.
std::vector<float> decode_latent_sequence(const VibeVoiceConfig& cfg,
const VibeVoiceWeights& w,
const float* scaled_latents,
int n_frames) {
// Single-shot acoustic decode: the whole latent sequence is upsampled to
// audio in one graph. Peak activation memory scales with n_frames (the last
// decoder stages run at 24 kHz), so this is only used for short sequences;
// longer ones go through decode_latent_streaming below.
std::vector<float> decode_latent_single_shot(const VibeVoiceConfig& cfg,
const VibeVoiceWeights& w,
const float* scaled_latents,
int n_frames) {
if (n_frames <= 0) return {};
// Backend-aware compute: build the graph in a no_alloc ctx, allocate
// leaf tensors on the active backend's buffer, upload input via
Expand Down Expand Up @@ -614,6 +619,141 @@ std::vector<float> decode_latent_sequence(const VibeVoiceConfig& cfg,
return samples;
}

// Per-chunk default for the streaming decode, in latent frames. Each frame
// upsamples to 3200 audio samples, so peak decoder activation memory and
// ggml-cuda's IM2COL gridDim.y limit (65535 over the 24 kHz time axis) both
// scale with this. CUDA stays well under the limit at 15 frames (48 k
// samples); CPU can afford much larger chunks.
//
// Override with the VIBEVOICE_DECODE_CHUNK_FRAMES environment variable. The
// override is honoured only when it parses to a positive value that fits in
// an int; anything else (unset, non-numeric, zero, negative, out of range)
// falls back to the backend-specific default.
//
// Returns the number of latent frames per streaming chunk (always > 0).
int decode_chunk_frames() {
    if (const char* env = std::getenv("VIBEVOICE_DECODE_CHUNK_FRAMES")) {
        // std::strtol instead of std::atoi: atoi has undefined behaviour when
        // the value does not fit in an int, whereas strtol saturates to
        // LONG_MAX / LONG_MIN. Like atoi, parsing stops at the first
        // non-digit, so previously accepted values keep working.
        const long parsed = std::strtol(env, nullptr, 10);
        // Round-trip through int to reject values an int cannot represent.
        const int frames = static_cast<int>(parsed);
        if (parsed > 0 && static_cast<long>(frames) == parsed) return frames;
    }

    // No usable override: choose by backend. Any backend whose name contains
    // "CUDA" gets the small chunk; everything else (including a null backend)
    // gets the large one.
    const ggml_backend_t b = vv::backend();
    const bool is_cuda = b &&
        std::string(ggml_backend_name(b)).find("CUDA") != std::string::npos;
    return is_cuda ? 15 : 64;
}

// One streaming decode chunk: builds the decoder graph against `cache`, runs
// it, and pulls each conv's trailing context into the cache for the next
// call.
//
//   cfg, w - model config and weights (uses cfg.vae_dim, cfg.acoustic and
//            w.at_dec).
//   chunk  - packed latent slice holding at least vae_dim * C floats in ggml
//            ne layout (time fastest: data[d * C + t]).
//   C      - number of latent frames in `chunk`; must be positive.
//   cache  - streaming state. cache.is_first_chunk selects zero prefixes; it
//            is cleared after a successful run.
//
// Returns the decoder output for this chunk (y->ne[0] samples, expected to be
// C times the decoder's total upsample factor), or an empty vector on invalid
// input, allocation failure or compute failure. Whatever the outcome, no
// cache entry is left pointing at a tensor of the context freed here.
std::vector<float> run_decoder_chunk_streaming(const VibeVoiceConfig& cfg,
                                               const VibeVoiceWeights& w,
                                               const std::vector<float>& chunk,
                                               int C,
                                               StreamingCache& cache) {
    // Refuse inputs that cannot fill the z tensor: the upload below reads
    // vae_dim * C floats from `chunk` and must not run past its end.
    if (C <= 0) return {};
    const size_t z_elems = static_cast<size_t>(cfg.vae_dim) * static_cast<size_t>(C);
    if (chunk.size() < z_elems) return {};

    struct ggml_init_params p {};
    p.mem_size = ggml_tensor_overhead() * 32768 + ggml_graph_overhead_custom(32768, false);
    p.no_alloc = true;
    struct ggml_context* ctx = ggml_init(p);
    if (!ctx) return {};

    // The graph builders store pointers to tensors owned by `ctx` in the
    // cache entries (prefix / next_view). They must be dropped before `ctx`
    // is freed on ANY exit path, otherwise a later call would hand dangling
    // tensors to ggml_build_forward_expand.
    const auto forget_graph_tensors = [&cache]() {
        for (auto& kv : cache) {
            kv.second.prefix = nullptr;
            kv.second.next_view = nullptr;
        }
    };

    struct ggml_tensor* z = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, C, cfg.vae_dim, 1);
    ggml_set_name(z, "decode_z_chunk");
    struct ggml_tensor* y = decoder_forward_streaming(ctx, z, w.at_dec, cfg.acoustic, cache);

    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx, 32768, false);
    ggml_build_forward_expand(gf, y);
    // Keep each conv's "trailing context" view alive in the graph so its
    // memory survives until we copy it back into the cache.
    for (auto& kv : cache) {
        if (kv.second.next_view) ggml_build_forward_expand(gf, kv.second.next_view);
    }

    ggml_backend_buffer_t in_buf = vv::allocate_ctx_tensors(ctx);
    if (!in_buf) {
        forget_graph_tensors();
        ggml_free(ctx);
        return {};
    }
    ggml_backend_tensor_set(z, chunk.data(), 0, sizeof(float) * z_elems);

    // Populate the per-conv prefixes: zeros on the first chunk (or when no
    // correctly sized history is stored), the previous chunk's tail
    // otherwise. Tensor storage only exists after the allocation above.
    for (auto& kv : cache) {
        StreamingCacheEntry& e = kv.second;
        if (!e.prefix || e.T == 0) continue;
        const size_t need = static_cast<size_t>(e.T) * e.C;
        if (cache.is_first_chunk || e.data.size() != need) {
            std::vector<float> zeros(need, 0.0f);
            ggml_backend_tensor_set(e.prefix, zeros.data(), 0, sizeof(float) * need);
        } else {
            ggml_backend_tensor_set(e.prefix, e.data.data(), 0, sizeof(float) * need);
        }
    }

    if (!vv::compute_graph(gf)) {
        forget_graph_tensors();
        ggml_backend_buffer_free(in_buf);
        ggml_free(ctx);
        return {};
    }

    const int T_full = static_cast<int>(y->ne[0]);
    std::vector<float> samples(T_full);
    ggml_backend_tensor_get(y, samples.data(), 0, sizeof(float) * T_full);

    // Carry each conv's trailing context into the cache for the next chunk.
    for (auto& kv : cache) {
        StreamingCacheEntry& e = kv.second;
        if (!e.next_view || e.T == 0) continue;
        const size_t n = static_cast<size_t>(e.T) * e.C;
        e.data.assign(n, 0.0f);
        ggml_backend_tensor_get(e.next_view, e.data.data(), 0, sizeof(float) * n);
    }
    // Every entry, including those skipped above, loses its graph pointers
    // before the context goes away.
    forget_graph_tensors();

    cache.is_first_chunk = false;
    ggml_backend_buffer_free(in_buf);
    ggml_free(ctx);
    return samples;
}

// Acoustic decode entry point. Sequences no longer than
// decode_chunk_frames() are decoded in one graph; longer ones are decoded
// chunk by chunk through a shared StreamingCache so that peak memory is
// bounded to one chunk while the concatenated audio is intended to be
// bit-exact with the single-shot result. `scaled_latents` is packed
// [vae_dim * n_frames] in ggml ne layout (time fastest:
// data[d * n_frames + t]). Returns an empty vector when n_frames <= 0 or when
// any chunk fails to decode.
std::vector<float> decode_latent_sequence(const VibeVoiceConfig& cfg,
                                          const VibeVoiceWeights& w,
                                          const float* scaled_latents,
                                          int n_frames) {
    if (n_frames <= 0) return {};

    const int frames_per_chunk = decode_chunk_frames();
    if (n_frames <= frames_per_chunk) {
        return decode_latent_single_shot(cfg, w, scaled_latents, n_frames);
    }

    StreamingCache cache;
    cache.is_first_chunk = true;

    // Audio samples per latent frame = product of the decoder upsample ratios.
    size_t samples_per_frame = 1;
    for (int ratio : cfg.acoustic.ratios) {
        samples_per_frame *= static_cast<size_t>(ratio);
    }

    std::vector<float> audio;
    audio.reserve(static_cast<size_t>(n_frames) * samples_per_frame);

    const int latent_dim = cfg.vae_dim;
    int first = 0;
    while (first < n_frames) {
        const int last  = std::min(first + frames_per_chunk, n_frames);
        const int count = last - first;
        cache.is_final_chunk = (last == n_frames);

        // Gather frames [first, last) of every latent row into a contiguous
        // [vae_dim * count] buffer (time fastest), i.e. the z tensor layout.
        std::vector<float> slice(static_cast<size_t>(latent_dim) * count);
        for (int d = 0; d < latent_dim; ++d) {
            const float* row = scaled_latents + static_cast<size_t>(d) * n_frames + first;
            std::copy(row, row + count, slice.begin() + static_cast<size_t>(d) * count);
        }

        const std::vector<float> pcm = run_decoder_chunk_streaming(cfg, w, slice, count, cache);
        if (pcm.empty()) return {};
        audio.insert(audio.end(), pcm.begin(), pcm.end());

        first = last;
    }
    return audio;
}

} // namespace

// Forward declaration — implementation later in this file.
Expand Down