structured-world · polaz · May 22, 2026 · May 22, 2026 · May 22, 2026 · May 22, 2026
diff --git a/zstd/Cargo.toml b/zstd/Cargo.toml
@@ -70,6 +70,11 @@ critical-section = ["dep:critical-section"]
 bench_internals = []
 fuzz_exports = []
 std = []
+# Diagnostic tracing of the Fast kernel's inner loop — per-iteration state
+# dumps gated at compile time so production builds carry zero cost. Runtime
+# activation via `STRUCTURED_ZSTD_KERNEL_TRACE=1` env var. Used by the
+# `trace_fast_kernel` example for #220 ratio-divergence investigation.
+kernel_trace = ["std"]
 
 # Internal feature, only used when building as part of libstd, not part of the
 # stable interface of this crate.

diff --git a/zstd/examples/donor_compress_z000033.rs b/zstd/examples/donor_compress_z000033.rs
@@ -0,0 +1,50 @@
+//! One-shot diagnostic: invoke FFI `ZSTD_compress` on z000033 at level 1.
+//! Donor's `zstd_fast.c` has been patched with `fprintf(stderr, "D_...")`
+//! traces gated by `DONOR_TRACE_FAST=1` env var. Run this binary with
+//! both `DONOR_TRACE_FAST=1` set and stderr redirected to a file to
+//! capture donor's actual cursor trace.
+//!
+//! Build: cargo build --release -p structured-zstd --example donor_compress_z000033
+//! Run:   DONOR_TRACE_FAST=1 ./target/release/examples/donor_compress_z000033 \
+//!          > /dev/null 2> /tmp/donor_trace.log
+
+use std::env;
+use std::fs;
+
+use zstd::zstd_safe::zstd_sys;
+
+/// Compresses the corpus (first CLI argument, defaulting to the z000033
+/// fixture) with FFI `ZSTD_compress` at level 1. The compressed bytes are
+/// discarded; only the `DONOR_TRACE_START` / `DONOR_TRACE_END` banners are
+/// printed to stderr.
+fn main() {
+    // An optional first argument overrides the default fixture path.
+    let corpus_path = match env::args().nth(1) {
+        Some(path) => path,
+        None => String::from("zstd/decodecorpus_files/z000033"),
+    };
+    let input = fs::read(&corpus_path).expect("read corpus");
+    let input_len = input.len();
+    eprintln!("DONOR_TRACE_START corpus={corpus_path} size={input_len} level=1");
+
+    // Size the destination with the bound libzstd reports for this input.
+    // SAFETY: the length is passed by value; no pointers are involved.
+    let dst_cap = unsafe { zstd_sys::ZSTD_compressBound(input_len) };
+    let mut dst = vec![0u8; dst_cap];
+
+    // SAFETY: `dst` owns `dst_cap` writable bytes and `input` owns
+    // `input_len` readable bytes; both stay alive for the whole call and
+    // are separate allocations, so the two ranges cannot overlap.
+    let rc = unsafe {
+        let dst_ptr = dst.as_mut_ptr().cast::<core::ffi::c_void>();
+        let src_ptr = input.as_ptr().cast::<core::ffi::c_void>();
+        zstd_sys::ZSTD_compress(dst_ptr, dst_cap, src_ptr, input_len, 1)
+    };
+
+    // SAFETY: the return code is passed by value; no pointers are involved.
+    let is_error = unsafe { zstd_sys::ZSTD_isError(rc) };
+    assert_eq!(is_error, 0, "ZSTD_compress failed");
+
+    // `rc` is reported as the compressed size now that the error check passed.
+    eprintln!("DONOR_TRACE_END ffi_bytes={rc} input_bytes={input_len}");
+}
diff --git a/zstd/examples/donor_cparams_check.rs b/zstd/examples/donor_cparams_check.rs
@@ -0,0 +1,26 @@
+//! One-shot diagnostic: ask donor what cParams it selects for our exact
+//! (level, srcSize, dictSize) tuple. Confirms whether donor's L1 path
+//! uses mls=7 vs mls=4/6 for the 1MB decodecorpus-z000033 fixture.
+//!
+//! Build: cargo build --release -p structured-zstd --example donor_cparams_check
+//! Run:   ./target/release/examples/donor_cparams_check
+
+use zstd::zstd_safe::zstd_sys;
+
+/// Prints the cParams donor selects for level 1, no dictionary, and the
+/// byte size of zstd/decodecorpus_files/z000033.
+fn main() {
+    let (level, src_size, dict_size) = (1i32, 1_022_035u64, 0usize);
+    // SAFETY: all three arguments are plain integers passed by value.
+    let cp = unsafe { zstd_sys::ZSTD_getCParams(level, src_size, dict_size) };
+    let strategy = cp.strategy as u32;
+
+    println!("L{level} srcSize={src_size} dictSize={dict_size}:");
+    println!("  windowLog = {}", cp.windowLog);
+    println!("  chainLog  = {}", cp.chainLog);
+    println!("  hashLog   = {}", cp.hashLog);
+    println!("  searchLog = {}", cp.searchLog);
+    println!("  minMatch  = {} (mls)", cp.minMatch);
+    println!("  targetLength = {}", cp.targetLength);
+    println!("  strategy  = {strategy}");
+}
diff --git a/zstd/examples/trace_fast_kernel.rs b/zstd/examples/trace_fast_kernel.rs
@@ -0,0 +1,59 @@
+//! Trace driver for #220 Fast L1 ratio gap investigation.
+//!
+//! Loads `decodecorpus_files/z000033`, runs production encoder at
+//! `CompressionLevel::Level(1)` (canonical L1 → Fast strategy), prints the
+//! kernel inner-loop trace from `compress_block_fast` to stderr.
+//!
+//! Build: `cargo build --release --features kernel_trace --example trace_fast_kernel`
+//! Run:   `STRUCTURED_ZSTD_KERNEL_TRACE=1 ./target/release/examples/trace_fast_kernel > /dev/null 2> trace.log`
+//!
+//! The trace records every outer-iter state, every hash-table put, and
+//! every match-found event. Diff vs donor's expected sequence (manually
+//! computed from `zstd_fast.c:266-348`) to pinpoint the first
+//! divergence in block 0.
+//!
+//! Trace output is capped at the first N lines, where N is read from the
+//! `STRUCTURED_ZSTD_KERNEL_TRACE_HEAD` env var (default 2000), so the log
+//! stays diffable. Set it to 0 to disable the cap and dump every iteration.
+
+use std::env;
+use std::fs;
+use std::io::{self, Write};
+
+use structured_zstd::encoding::{CompressionLevel, compress_to_vec};
+
+/// Compresses the corpus (first CLI argument, defaulting to the z000033
+/// fixture) at level 1, brackets the run with `TRACE_START` / `TRACE_END`
+/// banners on stderr, and writes the compressed bytes to stdout.
+fn main() {
+    let corpus_path = env::args()
+        .nth(1)
+        .unwrap_or_else(|| "zstd/decodecorpus_files/z000033".to_string());
+    let bytes = fs::read(&corpus_path).expect("read corpus");
+    eprintln!("TRACE_START corpus={corpus_path} size={} level=1", bytes.len());
+
+    // Drive the production encoder. Sequence emissions land via the
+    // standard FrameCompressor → MatchGeneratorDriver → FastKernelMatcher
+    // → compress_block_fast path, hitting every ktrace! site.
+    // CRITICAL: use Level(1), NOT CompressionLevel::Fastest. They look
+    // synonymous from the docs ("Fastest is roughly equivalent to Zstd
+    // compression level 1") but `Fastest` overrides `LEVEL_TABLE[0]`'s
+    // `fast_mls = 7` to `fast_mls = 6` for an even-faster preset. The
+    // canonical L1 Fast strategy (which `compare_ffi` benches and the
+    // donor `ZSTD_compress_usingCDict(level=1)` both use) is mls=7 via
+    // `LEVEL_TABLE[0]` unchanged → reach it via `Level(1)`.
+    let compressed = compress_to_vec(&bytes[..], CompressionLevel::Level(1));
+
+    eprintln!(
+        "TRACE_END rust_bytes={} input_bytes={}",
+        compressed.len(),
+        bytes.len()
+    );
+
+    // Write the compressed output to stdout so the binary can also be
+    // used as a one-shot compressor for ad-hoc verification. Flush
+    // explicitly so a failed write surfaces here rather than going unnoticed.
+    let mut out = io::stdout().lock();
+    out.write_all(&compressed).expect("write compressed");
+    out.flush().expect("flush compressed");
+}
diff --git a/zstd/src/bit_io/bit_writer.rs b/zstd/src/bit_io/bit_writer.rs
@@ -111,6 +111,37 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
         self.bit_idx += data.len() * 8;
     }
 
+    /// Bridge for the donor-faithful Huffman encoder (`HufCStream`): runs
+    /// `f` with direct mutable access to the backing `Vec<u8>`, bypassing
+    /// the partial-bit accumulator, and returns whatever `f` returns. Every
+    /// byte `f` appends is added to `bit_idx` afterwards.
+    ///
+    /// `f` may only APPEND: it must not shrink the `Vec` or rewrite bytes
+    /// that were already there.
+    ///
+    /// # Panics
+    /// Panics if the writer is not byte-aligned (`bits_in_partial` is not a
+    /// multiple of 8), or if `f` returns with the `Vec` shorter than it was.
+    pub fn with_aligned_output_mut<F, R>(&mut self, f: F) -> R
+    where
+        F: FnOnce(&mut Vec<u8>) -> R,
+    {
+        assert!(
+            self.bits_in_partial.is_multiple_of(8),
+            "with_aligned_output_mut requires byte-aligned writer state",
+        );
+        // Flush first so `f` sees every bit written so far in `len()`.
+        self.flush();
+        let out: &mut Vec<u8> = self.output.as_mut();
+        let prev_len = out.len();
+        let result = f(&mut *out);
+        // Checked in release builds too: a shrinking closure would otherwise
+        // wrap the subtraction and silently corrupt `bit_idx`.
+        let appended = out.len().checked_sub(prev_len).expect("closure must not shrink output");
+        self.bit_idx += appended * 8;
+        result
+    }
+
     /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
     pub fn flush(&mut self) {
         assert!(self.bits_in_partial.is_multiple_of(8));

diff --git a/zstd/src/decoding/mod.rs b/zstd/src/decoding/mod.rs
@@ -56,7 +56,7 @@ mod ringbuffer;
 pub(crate) mod scratch;
 pub(crate) mod sequence_execution;
 pub(crate) mod sequence_section_decoder;
-mod simd_copy;
+pub(crate) mod simd_copy;
 
 #[cfg(feature = "bench_internals")]
 pub(crate) use self::simd_copy::copy_bytes_overshooting_for_bench;
diff --git a/zstd/src/encoding/blocks/compressed.rs b/zstd/src/encoding/blocks/compressed.rs
@@ -346,6 +346,55 @@ pub(crate) fn compress_block_with_post_split<M: Matcher>(
     state.block_scratch = scratch;
 }
 
+/// Append `lits` to `dst`, copying short runs inline instead of through
+/// `Vec::extend_from_slice`, which lowers to a libc memmove call for
+/// runtime-sized copies. Fast L1 typically emits literal runs of 1-10
+/// bytes — at thousands of sequences per block the per-emit libc call
+/// dominated the hot path (flamegraph: 60 % of CPU in the
+/// `__memmove_avx_unaligned_erms` chain).
+///
+/// Runs of ≤ 32 bytes go through `simd_copy::copy_bytes_overshooting`
+/// with src.1 == dst.1 == lit_len, so no overshoot is allowed on either
+/// side (we don't know how much readable slack the caller's slice has).
+/// Longer runs use `extend_from_slice` — they're rare and libc memmove
+/// amortises across the longer copy.
+///
+/// The inline path needs `lits.len()` bytes of spare capacity in `dst`.
+/// Callers are expected to reserve the whole block up front; if the
+/// spare capacity is too small anyway, the run falls back to
+/// `extend_from_slice`, so this safe function never writes out of bounds.
+#[inline]
+fn append_literals(dst: &mut Vec<u8>, lits: &[u8]) {
+    let lit_len = lits.len();
+    if lit_len == 0 {
+        return;
+    }
+    let cur_len = dst.len();
+    // Checked at runtime, not via `debug_assert!`: the raw write below is
+    // only sound when the capacity really is there, and a release build
+    // must not turn a caller's missed `reserve` into an out-of-bounds
+    // write. `capacity() >= len()` always holds, so this cannot underflow.
+    if lit_len > 32 || dst.capacity() - cur_len < lit_len {
+        dst.extend_from_slice(lits);
+        return;
+    }
+    // SAFETY:
+    // - `cur_len <= capacity`, so `add(cur_len)` stays within (or one past
+    //   the end of) the Vec's allocation.
+    // - `lits` is a valid slice, so reading `lit_len` bytes from it is
+    //   in-bounds; it cannot alias `dst`, which is mutably borrowed.
+    // - The check above guarantees `lit_len` writable bytes at `dst_ptr`.
+    // - `copy_bytes_overshooting` is relied on, per its contract, to copy
+    //   exactly `lit_len` bytes when `min(src.1, dst.1) == lit_len`, so
+    //   `set_len` exposes only bytes that were written.
+    unsafe {
+        let dst_ptr = dst.as_mut_ptr().add(cur_len);
+        let src = (lits.as_ptr(), lit_len);
+        crate::decoding::simd_copy::copy_bytes_overshooting(src, (dst_ptr, lit_len), lit_len);
+        dst.set_len(cur_len + lit_len);
+    }
+}
+
 fn collect_block_parts<M: Matcher>(state: &mut CompressState<M>, parts: &mut EncodedBlockParts) {
     let src_len = state.matcher.get_last_space().len();
     parts.literals.clear();
@@ -365,14 +414,14 @@ fn collect_block_parts<M: Matcher>(state: &mut CompressState<M>, parts: &mut Enc
             .reserve_exact(sequence_capacity - parts.sequences.len());
     }
     state.matcher.start_matching(|seq| match seq {
-        Sequence::Literals { literals } => parts.literals.extend_from_slice(literals),
+        Sequence::Literals { literals } => append_literals(&mut parts.literals, literals),
         Sequence::Triple {
             literals,
             offset,
             match_len,
         } => {
             let ll = literals.len() as u32;
-            parts.literals.extend_from_slice(literals);
+            append_literals(&mut parts.literals, literals);
             parts.sequences.push(RawSequence {
                 ll,
                 ml: match_len as u32,