paiml · noahgift · May 22, 2026 · May 22, 2026
diff --git a/contracts/cublas-fp8-7b-determinism-v1.yaml b/contracts/cublas-fp8-7b-determinism-v1.yaml
@@ -0,0 +1,81 @@
+metadata:
+  version: 1.0.0
+  created: '2026-05-22'
+  author: PAIML Engineering
+  description: "Stage A of SPEC-CUBLAS-FP8-7B-FIX-001 — `cublas_fp8_7b_reproducer` produces bit-identical JSON output across 5 consecutive runs. Locks the cuBLAS FP8 7B Q4K signature so subsequent stages have a deterministic oracle."
+  kind: pattern
+  references:
+    - "paiml/aprender#1864 (the underlying bug)"
+    - "docs/specifications/SPEC-CUBLAS-FP8-7B-FIX-001.md § Stage A"
+    - "crates/aprender-serve/examples/cublas_fp8_7b_reproducer.rs"
+  registry: true
+  tags:
+    - cublas
+    - fp8
+    - qwen2-7b
+    - determinism
+    - stage-a
+
+five_whys:
+  symptom: "`apr qa /home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf` Golden Output gate reports `<|im_start|>` gibberish on some runs and PASS on others on the same host. The 2026-05-22 git-bisect of v0.31.2..origin/main produced an invalid result because the oracle is non-deterministic."
+  why_1: "`apr qa` runs ~12 sequential gates that share a single CUDA context. Earlier gates exercise FP8 weight upload + cuBLASLt JIT, sometimes triggering CUDA_ERROR_ILLEGAL_ADDRESS that poisons the context."
+  why_2: "After context poison, the runtime's `[CUDA-FAILFAST]` recovery destroys and recreates the primary context, but tensor state in transit (warmup workspace, FP8 weight LRU) is non-deterministically partially-cleared."
+  why_3: "The Golden Output gate runs AFTER several upstream gates, so the cuBLAS state it observes is order-dependent. Different orderings of poison/recover produce different gibberish patterns (or transient PASS)."
+  why_4: "Bisection requires a deterministic oracle. The Golden Output gate doesn't satisfy that requirement because it runs as part of a multi-gate sequence with cross-gate state."
+  why_5: "There was no minimal, standalone reproducer that isolates the cuBLAS FP8 7B Q4K forward step and exercises ONLY that path with controlled state."
+  root_cause: "Investigation tooling needs to begin with a minimal, deterministic reproducer (Stage A) before any meaningful bisect or per-layer trace can be trusted."
+
+# Named properties this contract asserts; `proof_obligations[].applies_to` and
+# `qa_gate.checks` refer to the keys defined here.
+equations:
+  reproducer_bit_identity:
+    formula: "five consecutive invocations of `cublas_fp8_7b_reproducer` produce bit-identical JSON on stdout"
+    domain: "single host + single GPU + same model file + same binary"
+    codomain: "set of stdout strings emitted across N=5 runs"
+    invariants:
+      - "All 5 JSON objects are byte-equal"
+      - "cpu_logits_fnv1a is identical across runs"
+      - "gpu_logits_fnv1a is identical across runs (whether or not it agrees with CPU)"
+      - "argmax indices and values are identical"
+      - "correlation field is identical to 6 decimal places"
+      - "exit code is identical across all 5 runs (1 when bug present, 0 when fixed)"
+
+  signature_locks_the_bug:
+    formula: "current bug signature on noah-Lambda-Vector RTX 4090: gpu_argmax_idx=1057, gpu_logits_fnv1a=6748eb76f78f8683, correlation=0.986986"
+    domain: "anchored to noah-Lambda-Vector RTX 4090 with the origin/main source tree + canonical Qwen2.5-Coder-7B Q4_K_M GGUF"
+    codomain: "the JSON-encoded signature"
+    invariants:
+      - "Until the bug is fixed, this is the EXPECTED signature on this host"
+      - "Any deviation either indicates a fix (Stage F) or a different non-determinism source"
+      - "Stage F shipping flips agrees_with_cpu to true AND changes gpu_logits_fnv1a to match cpu_logits_fnv1a"
+
+proof_obligations:
+  - type: invariant
+    property: "Five consecutive runs produce bit-identical JSON"
+    formal: "for all i,j in 1..=5, run_i.stdout == run_j.stdout"
+    applies_to: reproducer_bit_identity
+  - type: invariant
+    property: "Bug signature matches v1.0.0 lock"
+    formal: "gpu_argmax_idx == 1057 AND gpu_logits_fnv1a == 6748eb76f78f8683 (pre-fix) on noah-Lambda-Vector"
+    applies_to: signature_locks_the_bug
+
+falsification_tests:
+  - id: FALSIFY-CUBLAS-FP8-DET-001
+    rule: "reproducer produces bit-identical stdout across 5 consecutive runs"
+    prediction: "5 byte-identical JSON lines on stdout across 5 invocations. Verified 2026-05-22 on noah-Lambda-Vector RTX 4090: 5/5 identical."
+    # Folded scalar: the lines below are joined with single spaces into one
+    # `bash -c` command. It fails on empty stdout (binary or model missing, which
+    # would otherwise compare equal five times), on any exit-status drift, and
+    # on any stdout byte difference across the 5 runs.
+    test: >-
+      bash -c 'BIN=target/release/examples/cublas_fp8_7b_reproducer;
+      M=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf;
+      out=$(MODEL_PATH=$M $BIN 2>/dev/null); rc=$?;
+      [ -n "$out" ] || exit 1;
+      for _ in 1 2 3 4; do
+      next=$(MODEL_PATH=$M $BIN 2>/dev/null);
+      [ $? -eq "$rc" ] || exit 1;
+      [ "$out" = "$next" ] || exit 1;
+      done; exit 0'
+    if_fails: "Reproducer is non-deterministic — Stage B per-layer parity and downstream stages cannot trust their oracle."
+
+  - id: FALSIFY-CUBLAS-FP8-DET-002
+    rule: "Bug signature locks current cuBLAS FP8 7B output"
+    prediction: "On noah-Lambda-Vector RTX 4090 (sm_89) with canonical 7B Q4_K_M GGUF, gpu_argmax_idx=1057, agrees_with_cpu=false. This is the SIGNATURE that any candidate fix must change (to gpu_argmax_idx=75311, agrees_with_cpu=true)."
+    # Checks the pinned VALUES, not just the presence of the key: every run
+    # prints a `gpu_argmax_idx` field, so grepping for the key alone can never
+    # fail. The trailing comma after 1057 prevents a prefix match (e.g. 10570).
+    test: >-
+      bash -c 'out=$(MODEL_PATH=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf
+      target/release/examples/cublas_fp8_7b_reproducer 2>/dev/null);
+      echo "$out" | grep -qF "\"gpu_argmax_idx\":1057," &&
+      echo "$out" | grep -qF "\"gpu_logits_fnv1a\":\"6748eb76f78f8683\"" &&
+      echo "$out" | grep -qF "\"agrees_with_cpu\":false"'
+    if_fails: "Either the bug has been fixed (good — Stage F discharged) OR the reproducer regressed (bad)."
+
+qa_gate:
+  id: F-CUBLAS-FP8-DET-001
+  name: "Deterministic cuBLAS FP8 7B Q4K reproducer"
+  description: "Stage A of SPEC-CUBLAS-FP8-7B-FIX-001 — deterministic standalone reproducer that downstream stages can rely on as their oracle."
+  checks:
+    - "reproducer_bit_identity"
+    - "signature_locks_the_bug"
+  pass_criteria: "FALSIFY-CUBLAS-FP8-DET-{001,002} all PASS"
diff --git a/crates/aprender-serve/examples/cublas_fp8_7b_reproducer.rs b/crates/aprender-serve/examples/cublas_fp8_7b_reproducer.rs
@@ -0,0 +1,147 @@
+//! SPEC-CUBLAS-FP8-7B-FIX-001 Stage A — Deterministic reproducer.
+//!
+//! Outputs a single JSON line capturing the cuBLAS FP8 forward output on a
+//! 7B Q4K GGUF for token_id=791 at position 0. Designed for bit-identity
+//! comparison across consecutive runs.
+//!
+//! Run with:
+//!
+//! ```sh
+//! MODEL_PATH=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
+//!     cargo run --example cublas_fp8_7b_reproducer \
+//!     --release -p aprender-serve --features cuda
+//! ```
+//!
+//! Expected output (one line on stdout, wrapped here only for readability;
+//! all other diagnostics go to stderr):
+//!
+//! ```json
+//! {"cpu_argmax_idx":75311,"cpu_argmax_val":11.554419,"gpu_argmax_idx":1057,
+//!  "gpu_argmax_val":11.132793,"correlation":0.986986,
+//!  "cpu_logits_fnv1a":"<16-hex>","gpu_logits_fnv1a":"<16-hex>",
+//!  "agrees_with_cpu":false}
+//! ```
+//!
+//! Falsifier (see `contracts/cublas-fp8-7b-determinism-v1.yaml`):
+//! running this binary 5 times in sequence MUST produce 5 bit-identical
+//! JSON lines on stdout. If the bug is fixed, `agrees_with_cpu` will be `true`.
+//! If still broken, `false` with reproducible argmax+correlation values.
+//!
+//! Context: #1864 cuBLAS FP8 7B Q4K gibberish. 2026-05-22 layer-by-layer
+//! trace showed Layer 0 Q/K inputs differ between CPU and cuBLAS; logit
+//! correlation 0.987 (high), linear fit GPU ≈ 0.96 × CPU + 0.12. This
+//! reproducer locks that observation as a numerical signature.
+
+#[cfg(not(feature = "cuda"))]
+fn main() {
+    // Built without the `cuda` feature: there is no GPU path to probe, so tell
+    // the user how to rebuild and stop with a non-zero status.
+    const MISSING_CUDA_EXIT: i32 = 2;
+    let hint = "This example requires the 'cuda' feature. Run with --features cuda";
+    eprintln!("{}", hint);
+    std::process::exit(MISSING_CUDA_EXIT);
+}
+
+/// Runs the CPU and cuBLAS FP8 forward passes for one fixed probe token and
+/// prints a single-line JSON signature on stdout (diagnostics go to stderr).
+///
+/// Process exit status:
+/// * `0` — GPU argmax equals CPU argmax (bug fixed),
+/// * `1` — GPU argmax differs from CPU argmax (bug present),
+/// * `2` — the probe could not run (model load / CUDA / forward error, or no
+///   logits). This is the same status the non-CUDA build uses, and it keeps a
+///   broken probe distinguishable from a genuine "bug present" verdict.
+///
+/// The function never returns normally; the `Result` return type is kept for
+/// signature compatibility only.
+#[cfg(feature = "cuda")]
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// Exit status used when the probe itself fails.
+    const EXIT_PROBE_FAILED: i32 = 2;
+
+    /// Deterministic 64-bit FNV-1a hash — avoids taking on `sha2` or `hex`
+    /// deps for what is in essence a checksum.
+    fn fnv1a_64(bytes: &[u8]) -> u64 {
+        let mut h: u64 = 0xcbf29ce484222325;
+        for &b in bytes {
+            h ^= u64::from(b);
+            h = h.wrapping_mul(0x100000001b3);
+        }
+        h
+    }
+
+    /// FNV-1a fingerprint of the logits encoded as little-endian `f32` bytes.
+    fn logits_fingerprint(logits: &[f32]) -> u64 {
+        let mut bytes = Vec::with_capacity(logits.len() * 4);
+        for v in logits {
+            bytes.extend_from_slice(&v.to_le_bytes());
+        }
+        fnv1a_64(&bytes)
+    }
+
+    /// Index and value of the largest logit; the first maximum wins on ties
+    /// and NaN entries are never selected (`x > v` is false for NaN).
+    fn argmax(logits: &[f32]) -> (usize, f32) {
+        logits
+            .iter()
+            .enumerate()
+            .fold((0usize, f32::NEG_INFINITY), |(idx, v), (i, &x)| {
+                if x > v { (i, x) } else { (idx, v) }
+            })
+    }
+
+    /// Performs the probe and prints the JSON line. Returns whether the GPU
+    /// argmax agrees with the CPU argmax; any failure is returned as `Err`.
+    fn run() -> Result<bool, Box<dyn std::error::Error>> {
+        use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};
+
+        let path = std::env::var("MODEL_PATH").unwrap_or_else(|_| {
+            "/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf".to_string()
+        });
+
+        // Deterministic probe: same token, same position. Token 791 = canonical
+        // probe from CORRECTNESS-011 / layer_by_layer_trace.
+        let token_id: u32 = 791;
+        let position: usize = 0;
+
+        eprintln!("[cublas_fp8_7b_reproducer] model={} token={} pos={}", path, token_id, position);
+
+        // Load model (CPU side).
+        let mapped = MappedGGUFModel::from_path(&path)?;
+        let model = OwnedQuantizedModel::from_mapped(&mapped)?;
+
+        // CPU forward.
+        let cpu_logits = model.forward(&[token_id])?;
+        let (cpu_argmax_idx, cpu_argmax_val) = argmax(&cpu_logits);
+
+        // GPU forward via cuBLAS FP8 path.
+        let mut cuda_model = OwnedQuantizedModelCuda::new(model.clone(), 0)?;
+        cuda_model.preload_weights_gpu()?;
+        cuda_model.clear_decode_graph();
+
+        let mut dummy_cache = realizar::gguf::OwnedQuantizedKVCache::new(
+            model.config().num_layers,
+            model.config().num_kv_heads * (model.config().hidden_dim / model.config().num_heads),
+            100,
+        );
+        let gpu_logits = cuda_model.forward_gpu_resident(token_id, &mut dummy_cache, position)?;
+        let (gpu_argmax_idx, gpu_argmax_val) = argmax(&gpu_logits);
+
+        // Number of logits both sides have in common. With zero logits the
+        // means below would be 0/0 = NaN and the JSON would be meaningless.
+        let n = cpu_logits.len().min(gpu_logits.len());
+        if n == 0 {
+            return Err("forward pass returned no logits; cannot compute a signature".into());
+        }
+
+        // Pearson correlation between CPU and GPU logits, accumulated in f32 in
+        // index order so the printed value stays bit-stable across runs
+        // (matches layer_by_layer_trace's diagnostic).
+        let n_f32 = n as f32;
+        let mean_cpu: f32 = cpu_logits[..n].iter().sum::<f32>() / n_f32;
+        let mean_gpu: f32 = gpu_logits[..n].iter().sum::<f32>() / n_f32;
+        let (mut cov, mut var_cpu, mut var_gpu) = (0.0f32, 0.0f32, 0.0f32);
+        for (c, g) in cpu_logits.iter().zip(gpu_logits.iter()) {
+            let dc = c - mean_cpu;
+            let dg = g - mean_gpu;
+            cov += dc * dg;
+            var_cpu += dc * dc;
+            var_gpu += dg * dg;
+        }
+        // The 1e-10 term keeps the division finite when either side is constant.
+        let correlation = cov / (var_cpu.sqrt() * var_gpu.sqrt() + 1e-10);
+
+        // FNV-1a fingerprint of logit bytes (LE f32) for bit-identity.
+        let cpu_fp: u64 = logits_fingerprint(&cpu_logits);
+        let gpu_fp: u64 = logits_fingerprint(&gpu_logits);
+
+        let agrees = cpu_argmax_idx == gpu_argmax_idx;
+
+        // Single-line JSON on stdout (all diagnostic prose on stderr).
+        println!(
+            "{{\"cpu_argmax_idx\":{},\"cpu_argmax_val\":{:.6},\
+              \"gpu_argmax_idx\":{},\"gpu_argmax_val\":{:.6},\
+              \"correlation\":{:.6},\
+              \"cpu_logits_fnv1a\":\"{:016x}\",\"gpu_logits_fnv1a\":\"{:016x}\",\
+              \"agrees_with_cpu\":{}}}",
+            cpu_argmax_idx, cpu_argmax_val,
+            gpu_argmax_idx, gpu_argmax_val,
+            correlation,
+            cpu_fp, gpu_fp,
+            agrees,
+        );
+
+        Ok(agrees)
+    }
+
+    // `run` has returned by the time we exit, so its locals (model, CUDA
+    // state) have already been dropped; `process::exit` itself runs no
+    // destructors.
+    match run() {
+        // Exit 0 when GPU agrees with CPU (bug fixed); exit 1 when they disagree.
+        // git bisect / CI invocations rely on this exit code.
+        Ok(agrees) => std::process::exit(if agrees { 0 } else { 1 }),
+        // Returning `Err` from `main` would exit with status 1 and masquerade
+        // as "bug present"; report it the same way but with a distinct status.
+        Err(err) => {
+            eprintln!("Error: {:?}", err);
+            std::process::exit(EXIT_PROBE_FAILED);
+        }
+    }
+}