localai-org · VelvetBeans · May 30, 2026
diff --git a/src/acoustic_tokenizer.cpp b/src/acoustic_tokenizer.cpp
@@ -351,4 +351,58 @@ struct ggml_tensor* decoder_forward(struct ggml_context*  ctx,
     return y;
 }
 
+namespace {
+
+// Streaming causal SConvTranspose1d (upsample): the cache-backed counterpart
+// of convtr_step. Left context is kept in `cache` under the key `layer_id`.
+struct ggml_tensor* convtr_step_streaming(struct ggml_context*       ctx,
+                                          struct ggml_tensor*        x,
+                                          const StridedConvWeights&  w,
+                                          StreamingCache&            cache,
+                                          const std::string&         layer_id) {
+    return sconv_transpose1d_causal_streaming(ctx, x, w.kernel, w.bias, w.stride,
+                                              cache, layer_id);
+}
+
+}  // namespace
+
+struct ggml_tensor* decoder_forward_streaming(struct ggml_context*  ctx,
+                                              struct ggml_tensor*   z,
+                                              const DecoderWeights& w,
+                                              const AcousticConfig& cfg,
+                                              StreamingCache&       cache) {
+    char key[80];  // scratch buffer for the per-layer cache ids
+
+    // Stem conv followed by the stage-0 blocks (no upsampler precedes them).
+    struct ggml_tensor* act = sconv1d_causal_streaming(
+        ctx, z, w.stem.kernel, w.stem.bias, w.stem.stride, /*dilation=*/1, /*groups=*/1,
+        cache, "dec.stem");
+    for (size_t blk = 0; blk < w.stages[0].size(); ++blk) {
+        std::snprintf(key, sizeof(key), "dec.stage_0_block_%zu", blk);
+        act = block1d_forward_streaming(ctx, act, w.stages[0][blk], cfg.eps, cache, key);
+    }
+
+    // Every later stage: transposed-conv upsample (ups[stage - 1]), then its blocks.
+    for (size_t stage = 1; stage < cfg.depths.size(); ++stage) {
+        std::snprintf(key, sizeof(key), "dec.up_%zu", stage);
+        act = convtr_step_streaming(ctx, act, w.ups[stage - 1], cache, key);
+        for (size_t blk = 0; blk < w.stages[stage].size(); ++blk) {
+            std::snprintf(key, sizeof(key), "dec.stage_%zu_block_%zu", stage, blk);
+            act = block1d_forward_streaming(ctx, act, w.stages[stage][blk], cfg.eps, cache, key);
+        }
+    }
+
+    // Optional final RMS norm, applied with dims 0/1 swapped and then swapped back.
+    if (w.final_norm) {
+        act = ggml_cont(ctx, ggml_permute(ctx, act, 1, 0, 2, 3));
+        act = ggml_mul(ctx, ggml_rms_norm(ctx, act, cfg.eps), w.final_norm);
+        act = ggml_cont(ctx, ggml_permute(ctx, act, 1, 0, 2, 3));
+    }
+
+    // The head conv produces the tensor handed back to the caller.
+    return sconv1d_causal_streaming(
+        ctx, act, w.head.kernel, w.head.bias, w.head.stride, /*dilation=*/1, /*groups=*/1,
+        cache, "dec.head");
+}
+
 }  // namespace vv
diff --git a/src/acoustic_tokenizer.hpp b/src/acoustic_tokenizer.hpp
@@ -120,6 +120,18 @@ struct ggml_tensor* encoder_forward_streaming(struct ggml_context*    ctx,
                                               const AcousticConfig&   cfg,
                                               StreamingCache&         cache);
 
+// Streaming decoder: same math as decoder_forward, but every causal conv
+// (stem, per-block mixers, transposed upsamplers, head) takes its left
+// context from `cache`. Graph construction only: the caller sets
+// cache.is_first_chunk / is_final_chunk, fills each entry's `prefix` after
+// allocation and saves `next_view` after compute. Chunks decoded in order
+// concatenate bit-exact with single-shot, at one chunk's activation memory.
+struct ggml_tensor* decoder_forward_streaming(struct ggml_context*  ctx,
+                                              struct ggml_tensor*   z,
+                                              const DecoderWeights& w,
+                                              const AcousticConfig& cfg,
+                                              StreamingCache&       cache);
+
 }  // namespace vv
 
 #endif  // VIBEVOICE_ACOUSTIC_TOKENIZER_HPP
diff --git a/src/conv1d.cpp b/src/conv1d.cpp
@@ -175,4 +175,77 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
     return maybe_add_bias_t(ctx, y, bias);
 }
 
+// Streaming causal transposed conv. Prepends the cached left-context frames
+// (a leaf the caller fills), runs the transposed conv, and returns only the
+// samples produced by the new input frames, passed through maybe_add_bias_t.
+// cache[layer_id] receives the context size/channels (T, C), the `prefix`
+// leaf, and `next_view`: the frames to carry into the next chunk.
+struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
+                                                       struct ggml_tensor*  x,
+                                                       struct ggml_tensor*  kernel,
+                                                       struct ggml_tensor*  bias,
+                                                       int stride,
+                                                       StreamingCache&      cache,
+                                                       const std::string&   layer_id) {
+    const int n_new    = static_cast<int>(x->ne[0]);       // new input frames (T_in)
+    const int channels = static_cast<int>(x->ne[1]);       // C_in
+    const int batch    = static_cast<int>(x->ne[2]);       // B
+    const int taps     = static_cast<int>(kernel->ne[0]);  // K
+
+    // History needed for an exact first kept sample: output position p is fed
+    // by input frames down to floor((p - K + 1) / stride), so the first kept
+    // position p = hist * stride reaches back ceil((K - 1) / stride) frames.
+    const int hist = (taps - 1 + stride - 1) / stride;
+
+    auto& slot = cache[layer_id];
+    slot.T = hist;
+    slot.C = channels;
+
+    // Cache-prefix leaf (the caller fills it after allocation): zeros on the
+    // first chunk, otherwise the previous chunk's trailing `hist` input frames.
+    // With no history needed, no leaf is created and x is convolved as-is.
+    slot.prefix = nullptr;
+    struct ggml_tensor* padded = x;
+    if (hist > 0) {
+        slot.prefix = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hist, channels, batch);
+        ggml_set_name(slot.prefix, ("cache_prefix_" + layer_id).c_str());
+        padded = ggml_concat(ctx, slot.prefix, x, /*dim=*/0);
+    }
+
+    // ggml-cuda's conv_transpose_1d requires F32 kernels (see the single-shot
+    // path), so cast when needed; the same graph then runs on CPU and CUDA.
+    struct ggml_tensor* kernel_f32 = kernel;
+    if (kernel->type != GGML_TYPE_F32) {
+        kernel_f32 = ggml_cast(ctx, kernel, GGML_TYPE_F32);
+    }
+
+    // Full output length is (hist + T_in - 1) * stride + K. The samples owned
+    // by the NEW frames are [hist * stride, (hist + T_in) * stride); taking
+    // exactly that window drops the warm-up prefix and also applies the
+    // causal right-trim, since its end equals the single-shot trimmed length.
+    struct ggml_tensor* full =
+        ggml_conv_transpose_1d(ctx, kernel_f32, padded, stride, /*p0=*/0, /*d0=*/1);
+    const int64_t keep_from = static_cast<int64_t>(hist)  * stride;
+    const int64_t keep_len  = static_cast<int64_t>(n_new) * stride;
+    struct ggml_tensor* kept = ggml_view_3d(
+        ctx, full,
+        /*ne0=*/keep_len, /*ne1=*/full->ne[1], /*ne2=*/full->ne[2],
+        /*nb1=*/full->nb[1], /*nb2=*/full->nb[2],
+        /*offset=*/static_cast<size_t>(keep_from) * full->nb[0]);
+    kept = ggml_cont(ctx, kept);
+
+    // Register the last `hist` frames of the concatenated input (they start
+    // at frame index T_in) so the caller can carry them into the next chunk.
+    if (hist > 0) {
+        struct ggml_tensor* tail = ggml_view_3d(
+            ctx, padded,
+            /*ne0=*/hist, /*ne1=*/channels, /*ne2=*/batch,
+            /*nb1=*/padded->nb[1], /*nb2=*/padded->nb[2],
+            /*offset=*/static_cast<size_t>(n_new) * padded->nb[0]);
+        slot.next_view = ggml_cont(ctx, tail);
+    }
+
+    return maybe_add_bias_t(ctx, kept, bias);
+}
+
 }  // namespace vv
diff --git a/src/conv1d.hpp b/src/conv1d.hpp
@@ -110,6 +110,22 @@ struct ggml_tensor* sconv_transpose1d_causal(struct ggml_context* ctx,
                                              struct ggml_tensor*  bias,   // [C_out] or null
                                              int stride);
 
+// Streaming variant of sconv_transpose1d_causal, mirroring
+// sconv1d_causal_streaming. Prepends `context = ceil((K-1)/stride)` input
+// frames via the leaf cache[layer_id].prefix, which the caller fills after
+// allocation (zeros on the first chunk, the previous tail afterwards), runs
+// the transposed conv, and keeps the output window of the NEW frames,
+// [context*stride, (context+T_in)*stride), so per-chunk outputs concatenate
+// bit-exact with a single-shot pass. The last `context` input frames are
+// registered as cache[layer_id].next_view, to be copied out post-compute.
+struct ggml_tensor* sconv_transpose1d_causal_streaming(struct ggml_context* ctx,
+                                                       struct ggml_tensor*  x,      // [T, C_in, B]
+                                                       struct ggml_tensor*  kernel, // [K, C_out, C_in]
+                                                       struct ggml_tensor*  bias,   // [C_out] or null
+                                                       int stride,
+                                                       StreamingCache&      cache,
+                                                       const std::string&   layer_id);
+
 }  // namespace vv
 
 #endif  // VIBEVOICE_CONV1D_HPP
diff --git a/src/vibevoice_tts.cpp b/src/vibevoice_tts.cpp
@@ -11,6 +11,7 @@
 #include <algorithm>
 #include <cmath>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <random>
 #include <regex>
@@ -575,10 +576,14 @@ void add_input_type_embedding(const VibeVoiceConfig& cfg,
 // `scaled_latents` has shape [vae_dim * n_frames] in row-major (latent
 // fastest), matching what `ggml_new_tensor_3d(ctx, F32, n_frames, vae_dim, 1)`
 // expects when ne[0] = n_frames is the contiguous dim.
-std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
-                                          const VibeVoiceWeights& w,
-                                          const float*            scaled_latents,
-                                          int                     n_frames) {
+// Single-shot acoustic decode: the whole latent sequence is upsampled to
+// audio in one graph. Peak activation memory scales with n_frames (the last
+// decoder stages run at 24 kHz), so this is only used for short sequences;
+// longer ones go through the chunked path in decode_latent_sequence below.
+std::vector<float> decode_latent_single_shot(const VibeVoiceConfig&  cfg,
+                                             const VibeVoiceWeights& w,
+                                             const float*            scaled_latents,
+                                             int                     n_frames) {
     if (n_frames <= 0) return {};
     // Backend-aware compute: build the graph in a no_alloc ctx, allocate
     // leaf tensors on the active backend's buffer, upload input via
@@ -614,6 +619,141 @@ std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
     return samples;
 }
 
+// Chunk size for the streaming decode, in latent frames (also the threshold
+// at or below which decode_latent_sequence stays single-shot). Each frame
+// upsamples to 3200 audio samples, so this bounds peak decoder activation
+// memory and keeps the 24 kHz time axis under ggml-cuda's IM2COL gridDim.y
+// limit of 65535: the CUDA default of 15 frames is 48 k samples; CPU uses 64.
+// A positive integer in VIBEVOICE_DECODE_CHUNK_FRAMES overrides both defaults.
+int decode_chunk_frames() {
+    if (const char* env = std::getenv("VIBEVOICE_DECODE_CHUNK_FRAMES")) {
+        const long v = std::strtol(env, nullptr, 10);  // saturates; atoi is UB on overflow
+        if (v > 0 && static_cast<long>(static_cast<int>(v)) == v) return static_cast<int>(v);
+    }
+    const ggml_backend_t b = vv::backend();
+    const bool is_cuda = b &&
+        std::string(ggml_backend_name(b)).find("CUDA") != std::string::npos;
+    return is_cuda ? 15 : 64;
+}
+
+// One streaming decode chunk: builds the decoder graph against `cache`, runs
+// it, and pulls each conv's trailing context into the cache for the next
+// call. `chunk` is the packed latent slice [vae_dim * C] in ggml ne layout
+// (time fastest: data[d * C + t]). Returns this chunk's audio samples, or an
+// empty vector on failure; the cache never keeps pointers into the freed ctx.
+std::vector<float> run_decoder_chunk_streaming(const VibeVoiceConfig&  cfg,
+                                               const VibeVoiceWeights& w,
+                                               const std::vector<float>& chunk,
+                                               int                       C,
+                                               StreamingCache&           cache) {
+    const size_t n_latents = static_cast<size_t>(cfg.vae_dim) * static_cast<size_t>(C);
+    if (C <= 0 || cfg.vae_dim <= 0 || chunk.size() < n_latents) return {};  // would over-read
+
+    struct ggml_init_params p {};
+    p.mem_size = ggml_tensor_overhead() * 32768 + ggml_graph_overhead_custom(32768, false);
+    p.no_alloc = true;
+    struct ggml_context* ctx = ggml_init(p);
+    if (!ctx) return {};
+
+    // Shared exit path: drop the cache's tensor pointers (they would dangle
+    // once ctx is freed), then free the backend buffer (if any) and the ctx.
+    auto release = [&](ggml_backend_buffer_t buf) {
+        for (auto& kv : cache) { kv.second.prefix = nullptr; kv.second.next_view = nullptr; }
+        if (buf) ggml_backend_buffer_free(buf);
+        ggml_free(ctx);
+    };
+
+    struct ggml_tensor* z = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, C, cfg.vae_dim, 1);
+    ggml_set_name(z, "decode_z_chunk");
+    struct ggml_tensor* y = decoder_forward_streaming(ctx, z, w.at_dec, cfg.acoustic, cache);
+
+    struct ggml_cgraph* gf = ggml_new_graph_custom(ctx, 32768, false);
+    ggml_build_forward_expand(gf, y);
+    // Keep each conv's "trailing context" view alive in the graph so its
+    // memory survives until we copy it back into the cache.
+    for (auto& kv : cache) {
+        if (kv.second.next_view) ggml_build_forward_expand(gf, kv.second.next_view);
+    }
+
+    ggml_backend_buffer_t in_buf = vv::allocate_ctx_tensors(ctx);
+    if (!in_buf) { release(nullptr); return {}; }
+    ggml_backend_tensor_set(z, chunk.data(), 0, sizeof(float) * n_latents);
+
+    // Populate the per-conv prefixes: zeros on the first chunk or when no tail
+    // of the right size was saved, else the saved tail. (Needs the alloc above.)
+    for (auto& kv : cache) {
+        StreamingCacheEntry& e = kv.second;
+        if (!e.prefix || e.T == 0) continue;
+        const size_t need = static_cast<size_t>(e.T) * e.C;
+        if (cache.is_first_chunk || e.data.size() != need) e.data.assign(need, 0.0f);
+        ggml_backend_tensor_set(e.prefix, e.data.data(), 0, sizeof(float) * need);
+    }
+
+    if (!vv::compute_graph(gf)) { release(in_buf); return {}; }
+    const int T_full = static_cast<int>(y->ne[0]);
+    std::vector<float> samples(T_full);
+    ggml_backend_tensor_get(y, samples.data(), 0, sizeof(float) * T_full);
+
+    // Carry each conv's trailing context into the cache for the next chunk.
+    for (auto& kv : cache) {
+        StreamingCacheEntry& e = kv.second;
+        if (!e.next_view || e.T == 0) continue;
+        const size_t n = static_cast<size_t>(e.T) * e.C;
+        e.data.assign(n, 0.0f);
+        ggml_backend_tensor_get(e.next_view, e.data.data(), 0, sizeof(float) * n);
+    }
+    cache.is_first_chunk = false;
+    release(in_buf);
+    return samples;
+}
+
+// Acoustic decode entry point. Up to decode_chunk_frames() frames go through
+// the single-shot path; longer inputs are decoded chunk by chunk through one
+// StreamingCache, bounding peak memory to a single chunk. The cache is what
+// is meant to keep the joined audio bit-exact with single-shot. `scaled_latents`
+// is packed [vae_dim * n_frames], time fastest: data[d * n_frames + t].
+std::vector<float> decode_latent_sequence(const VibeVoiceConfig&  cfg,
+                                          const VibeVoiceWeights& w,
+                                          const float*            scaled_latents,
+                                          int                     n_frames) {
+    if (n_frames <= 0) return {};
+
+    const int frames_per_chunk = decode_chunk_frames();
+    if (n_frames <= frames_per_chunk) {
+        return decode_latent_single_shot(cfg, w, scaled_latents, n_frames);
+    }
+
+    // Samples per latent frame = product of the decoder upsample ratios.
+    size_t samples_per_frame = 1;
+    for (int ratio : cfg.acoustic.ratios) samples_per_frame *= static_cast<size_t>(ratio);
+    std::vector<float> audio;
+    audio.reserve(static_cast<size_t>(n_frames) * samples_per_frame);
+
+    StreamingCache cache;
+    cache.is_first_chunk = true;
+    const size_t n_dims = static_cast<size_t>(cfg.vae_dim);
+
+    int begin = 0;
+    while (begin < n_frames) {
+        const int stop  = std::min(begin + frames_per_chunk, n_frames);
+        const int width = stop - begin;
+        cache.is_final_chunk = (stop == n_frames);
+
+        // Gather frames [begin, stop) of every latent dim into a contiguous
+        // [vae_dim * width] buffer (time fastest), matching the z tensor.
+        std::vector<float> slice(n_dims * width);
+        for (size_t d = 0; d < n_dims; ++d) {
+            std::copy_n(scaled_latents + d * n_frames + begin, width, slice.begin() + d * width);
+        }
+
+        std::vector<float> pcm = run_decoder_chunk_streaming(cfg, w, slice, width, cache);
+        if (pcm.empty()) return {};  // a failed chunk aborts the whole decode
+        audio.insert(audio.end(), pcm.begin(), pcm.end());
+        begin = stop;
+    }
+    return audio;
+}
+
 }  // namespace
 
 // Forward declaration — implementation later in this file.