Skip to content
5 changes: 5 additions & 0 deletions zstd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,11 @@ critical-section = ["dep:critical-section"]
bench_internals = []
fuzz_exports = []
std = []
# Diagnostic tracing of the Fast kernel's inner loop — per-iteration state
# dumps gated at compile time so production builds carry zero cost. Runtime
# activation via `STRUCTURED_ZSTD_KERNEL_TRACE=1` env var. Used by the
# `trace_fast_kernel` example for #220 ratio-divergence investigation.
kernel_trace = ["std"]

# Internal feature, only used when building as part of libstd, not part of the
# stable interface of this crate.
Expand Down
50 changes: 50 additions & 0 deletions zstd/examples/donor_compress_z000033.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
//! One-shot diagnostic: invoke FFI `ZSTD_compress` on z000033 at level 1.
//! Donor's `zstd_fast.c` has been patched with `fprintf(stderr, "D_...")`
//! traces gated by `DONOR_TRACE_FAST=1` env var. Run this binary with
//! both `DONOR_TRACE_FAST=1` set and stderr redirected to a file to
//! capture donor's actual cursor trace.
//!
//! Build: cargo build --release -p structured-zstd --example donor_compress_z000033
//! Run: DONOR_TRACE_FAST=1 ./target/release/examples/donor_compress_z000033 \
//! > /dev/null 2> /tmp/donor_trace.log

use std::env;
use std::fs;

use zstd::zstd_safe::zstd_sys;

/// Compress one corpus file through FFI `ZSTD_compress` at level 1 and
/// report the input/output sizes on stderr.
///
/// The corpus path is taken from the first CLI argument and defaults to
/// `zstd/decodecorpus_files/z000033`. Panics if the file cannot be read
/// or if libzstd reports an error.
fn main() {
    // Optional first CLI argument overrides the default corpus file.
    let path = match env::args().nth(1) {
        Some(arg) => arg,
        None => "zstd/decodecorpus_files/z000033".to_string(),
    };
    let input = fs::read(&path).expect("read corpus");
    let input_len = input.len();
    eprintln!(
        "DONOR_TRACE_START corpus={} size={} level=1",
        path, input_len
    );

    // SAFETY: the call takes a plain integer by value and no pointers.
    let capacity = unsafe { zstd_sys::ZSTD_compressBound(input_len) };
    let mut output: Vec<u8> = vec![0u8; capacity];

    // SAFETY: `output` owns `capacity` initialized, writable bytes and
    // `input` owns `input_len` readable bytes; both buffers stay alive
    // and are not otherwise accessed for the duration of the call.
    let written = unsafe {
        zstd_sys::ZSTD_compress(
            output.as_mut_ptr().cast::<core::ffi::c_void>(),
            capacity,
            input.as_ptr().cast::<core::ffi::c_void>(),
            input_len,
            1,
        )
    };

    // SAFETY: the return code is passed by value; no pointers involved.
    let is_error = unsafe { zstd_sys::ZSTD_isError(written) };
    assert_eq!(is_error, 0, "ZSTD_compress failed");

    eprintln!(
        "DONOR_TRACE_END ffi_bytes={} input_bytes={}",
        written, input_len
    );
}
26 changes: 26 additions & 0 deletions zstd/examples/donor_cparams_check.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
//! One-shot diagnostic: ask donor what cParams it selects for our exact
//! (level, srcSize, dictSize) tuple. Confirms whether donor's L1 path
//! uses mls=7 vs mls=4/6 for the 1MB decodecorpus-z000033 fixture.
//!
//! Build: cargo build --release -p structured-zstd --example donor_cparams_check
//! Run: ./target/release/examples/donor_cparams_check

use zstd::zstd_safe::zstd_sys;

/// Print the compression parameters the FFI library selects for a fixed
/// (level, source size, dictionary size) query.
fn main() {
    // Query inputs: level 1, no dictionary, and the fixture's byte length.
    let level: i32 = 1;
    let src_size: u64 = 1_022_035; // bytes in zstd/decodecorpus_files/z000033
    let dict_size: usize = 0;

    // SAFETY: all three arguments are plain integers passed by value;
    // the call receives no pointers from us.
    let params = unsafe { zstd_sys::ZSTD_getCParams(level, src_size, dict_size) };

    // Header line first, then one line per field of the returned struct.
    println!("L{level} srcSize={src_size} dictSize={dict_size}:");
    println!(" windowLog = {}", params.windowLog);
    println!(" chainLog = {}", params.chainLog);
    println!(" hashLog = {}", params.hashLog);
    println!(" searchLog = {}", params.searchLog);
    println!(" minMatch = {} (mls)", params.minMatch);
    println!(" targetLength = {}", params.targetLength);
    println!(" strategy = {}", params.strategy as u32);
}
59 changes: 59 additions & 0 deletions zstd/examples/trace_fast_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
//! Trace driver for #220 Fast L1 ratio gap investigation.
//!
//! Loads `decodecorpus_files/z000033`, runs the production encoder at
//! `CompressionLevel::Level(1)` (Fast strategy; deliberately NOT
//! `CompressionLevel::Fastest` — see the comment in `main`), and prints
//! the kernel inner-loop trace from `compress_block_fast` to stderr.
//!
//! Build: `cargo build --release --features kernel_trace --example trace_fast_kernel`
//! Run: `STRUCTURED_ZSTD_KERNEL_TRACE=1 ./target/release/examples/trace_fast_kernel > /dev/null 2> trace.log`
//!
//! The trace records every outer-iter state, every hash-table put, and
//! every match-found event. Diff vs donor's expected sequence (manually
//! computed from `zstd_fast.c:266-348`) to pinpoint the first
//! divergence in block 0.
//!
//! Caps trace output at the first 2000 lines via `STRUCTURED_ZSTD_KERNEL_TRACE_HEAD`
//! env var (defaults to 2000) so the log stays diffable. Set to 0 to
//! disable the cap and dump every iteration.

use std::env;
use std::fs;
use std::io::{self, Write};

use structured_zstd::encoding::{CompressionLevel, compress_to_vec};

/// Compress one corpus file with the production encoder at `Level(1)`,
/// bracket the run with `TRACE_START` / `TRACE_END` lines on stderr, and
/// write the compressed frame to stdout.
///
/// The corpus path is taken from the first CLI argument and defaults to
/// `zstd/decodecorpus_files/z000033`. Panics if the corpus cannot be
/// read or stdout cannot be written.
fn main() {
    let corpus_path = env::args()
        .nth(1)
        .unwrap_or_else(|| "zstd/decodecorpus_files/z000033".to_string());
    let bytes = fs::read(&corpus_path).expect("read corpus");
    // The banner reports the level actually used below (`Level(1)`), so
    // the log cannot be mistaken for a `Fastest` run.
    eprintln!(
        "TRACE_START corpus={} size={} level=1",
        corpus_path,
        bytes.len()
    );

    // Drive the production encoder. Sequence emissions land via the
    // standard FrameCompressor → MatchGeneratorDriver → FastKernelMatcher
    // → compress_block_fast path, hitting every ktrace! site.
    // CRITICAL: use Level(1), NOT CompressionLevel::Fastest. They look
    // synonymous from the docs ("Fastest is roughly equivalent to Zstd
    // compression level 1") but `Fastest` overrides `LEVEL_TABLE[0]`'s
    // `fast_mls = 7` to `fast_mls = 6` for an even-faster preset. The
    // canonical L1 Fast strategy (which `compare_ffi` benches and the
    // donor `ZSTD_compress_usingCDict(level=1)` both use) is mls=7 via
    // `LEVEL_TABLE[0]` unchanged → reach it via `Level(1)`.
    let compressed = compress_to_vec(&bytes[..], CompressionLevel::Level(1));

    eprintln!(
        "TRACE_END rust_bytes={} input_bytes={}",
        compressed.len(),
        bytes.len()
    );

    // Write the compressed output to stdout so the binary can also be
    // used as a one-shot compressor for ad-hoc verification.
    let stdout = io::stdout();
    let mut out = stdout.lock();
    out.write_all(&compressed).expect("write compressed");
    // Flush explicitly so a failed write surfaces here instead of being
    // dropped silently at process exit.
    out.flush().expect("flush compressed");
}
31 changes: 31 additions & 0 deletions zstd/src/bit_io/bit_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,37 @@ impl<V: AsMut<Vec<u8>>> BitWriter<V> {
self.bit_idx += data.len() * 8;
}

/// Bridge for the donor-faithful Huffman encoder (`HufCStream`) so
/// it can write bytes directly into our backing `Vec<u8>` without
/// going through the `BitWriter`'s partial-bit accumulator. The
/// closure receives full mutable access to the underlying `Vec`;
/// any bytes it appends are integrated into `bit_idx` afterward.
///
/// MUST be called only when the writer is byte-aligned
/// (`bits_in_partial` a multiple of 8). `flush()` is called before
/// the closure runs, so the closure is meant to see a `Vec` whose
/// `len()` reflects every bit written so far.
///
/// Returns whatever the closure returns.
///
/// # Panics
///
/// * If the writer is not byte-aligned on entry.
/// * If the closure leaves the `Vec` shorter than it found it. The
///   closure may only APPEND bytes; this is enforced in release builds
///   too, because a wrapped length difference would silently corrupt
///   `bit_idx`.
pub fn with_aligned_output_mut<F, R>(&mut self, f: F) -> R
where
    F: FnOnce(&mut Vec<u8>) -> R,
{
    assert!(
        self.bits_in_partial.is_multiple_of(8),
        "with_aligned_output_mut requires byte-aligned writer state",
    );
    self.flush();
    let prev_len = self.output.as_mut().len();
    let result = f(self.output.as_mut());
    let new_len = self.output.as_mut().len();
    // `checked_sub` rather than `debug_assert!` + plain subtraction: in a
    // release build the plain subtraction would wrap on truncation and
    // push `bit_idx` to a phantom position instead of failing loudly.
    let appended = new_len
        .checked_sub(prev_len)
        .expect("closure must not shrink output");
    self.bit_idx += appended * 8;
    result
}

/// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
pub fn flush(&mut self) {
assert!(self.bits_in_partial.is_multiple_of(8));
Expand Down
2 changes: 1 addition & 1 deletion zstd/src/decoding/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ mod ringbuffer;
pub(crate) mod scratch;
pub(crate) mod sequence_execution;
pub(crate) mod sequence_section_decoder;
mod simd_copy;
pub(crate) mod simd_copy;

#[cfg(feature = "bench_internals")]
pub(crate) use self::simd_copy::copy_bytes_overshooting_for_bench;
53 changes: 51 additions & 2 deletions zstd/src/encoding/blocks/compressed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,55 @@ pub(crate) fn compress_block_with_post_split<M: Matcher>(
state.block_scratch = scratch;
}

/// Append `lits` to `dst` using inline byte / u64 ops for short
/// slices, avoiding the libc memmove call overhead that
/// `Vec::extend_from_slice` lowers to for runtime-sized
/// `ptr::copy_nonoverlapping`. Fast L1 emits literal runs of 1-10
/// bytes typically — at thousands of sequences per block, the per-
/// emit libc call dominated the hot path (flamegraph: 60 % of CPU
/// in `__memmove_avx_unaligned_erms` chain).
///
/// Runs of at most 32 bytes are routed through
/// `simd_copy::copy_bytes_overshooting` with src.1 == dst.1 == lit_len
/// (no overshoot READ; we don't know how much readable slack the
/// caller's slice has). Larger runs go through `extend_from_slice` —
/// they're rare and libc memmove amortises across the longer copy.
///
/// The inline path writes into `dst`'s spare capacity, so it is taken
/// only when that capacity already holds `lits.len()` bytes. Callers
/// that pre-reserve the whole block always hit it; any other caller
/// transparently falls back to `extend_from_slice` instead of writing
/// out of bounds.
#[inline]
fn append_literals(dst: &mut Vec<u8>, lits: &[u8]) {
    // Longest run handled by the inline copy path.
    const INLINE_COPY_MAX: usize = 32;

    let lit_len = lits.len();
    if lit_len == 0 {
        return;
    }

    let cur_len = dst.len();
    // `capacity() >= len()` always holds for a `Vec`, so this cannot wrap.
    let spare = dst.capacity() - cur_len;
    if lit_len > INLINE_COPY_MAX || spare < lit_len {
        // Long run, or the caller did not reserve enough room: let `Vec`
        // grow and copy safely.
        dst.extend_from_slice(lits);
        return;
    }

    // SAFETY:
    // * `cur_len <= capacity`, so `as_mut_ptr().add(cur_len)` stays
    //   inside (or one past the end of) `dst`'s allocation.
    // * `lits` is a valid slice, so `lit_len` bytes are readable from
    //   `lits.as_ptr()`.
    // * `spare >= lit_len` was checked above, so `lit_len` bytes are
    //   writable at `dst_ptr`; `lits` is a shared borrow distinct from
    //   the exclusive borrow of `dst`, so the ranges do not overlap.
    // * Relies on `copy_bytes_overshooting` writing exactly `lit_len`
    //   bytes when `min(src.1, dst.1) == lit_len` —
    //   NOTE(review): confirm against the helper's contract.
    // * After the copy the first `cur_len + lit_len` bytes are
    //   initialized, which is what `set_len` requires.
    unsafe {
        let dst_ptr = dst.as_mut_ptr().add(cur_len);
        crate::decoding::simd_copy::copy_bytes_overshooting(
            (lits.as_ptr(), lit_len),
            (dst_ptr, lit_len),
            lit_len,
        );
        dst.set_len(cur_len + lit_len);
    }
}

fn collect_block_parts<M: Matcher>(state: &mut CompressState<M>, parts: &mut EncodedBlockParts) {
let src_len = state.matcher.get_last_space().len();
parts.literals.clear();
Expand All @@ -365,14 +414,14 @@ fn collect_block_parts<M: Matcher>(state: &mut CompressState<M>, parts: &mut Enc
.reserve_exact(sequence_capacity - parts.sequences.len());
}
state.matcher.start_matching(|seq| match seq {
Sequence::Literals { literals } => parts.literals.extend_from_slice(literals),
Sequence::Literals { literals } => append_literals(&mut parts.literals, literals),
Sequence::Triple {
literals,
offset,
match_len,
} => {
let ll = literals.len() as u32;
parts.literals.extend_from_slice(literals);
append_literals(&mut parts.literals, literals);
parts.sequences.push(RawSequence {
ll,
ml: match_len as u32,
Expand Down
Loading
Loading