Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions contracts/cublas-fp8-7b-determinism-v1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
metadata:
version: 1.0.0
created: '2026-05-22'
author: PAIML Engineering
description: "Stage A of SPEC-CUBLAS-FP8-7B-FIX-001 — `cublas_fp8_7b_reproducer` produces bit-identical JSON output across 5 consecutive runs. Locks the cuBLAS FP8 7B Q4K signature so subsequent stages have a deterministic oracle."
kind: pattern
references:
- "paiml/aprender#1864 (the underlying bug)"
- "docs/specifications/SPEC-CUBLAS-FP8-7B-FIX-001.md § Stage A"
- "crates/aprender-serve/examples/cublas_fp8_7b_reproducer.rs"
registry: true
tags:
- cublas
- fp8
- qwen2-7b
- determinism
- stage-a

five_whys:
symptom: "`apr qa /home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf` Golden Output gate reports `<|im_start|>` gibberish on some runs and PASS on others on the same host. The 2026-05-22 git-bisect of v0.31.2..origin/main produced an invalid result because the oracle is non-deterministic."
why_1: "`apr qa` runs ~12 sequential gates that share a single CUDA context. Earlier gates exercise FP8 weight upload + cuBLASLt JIT, sometimes triggering CUDA_ERROR_ILLEGAL_ADDRESS that poisons the context."
why_2: "After context poison, the runtime's `[CUDA-FAILFAST]` recovery destroys and recreates the primary context, but tensor state in transit (warmup workspace, FP8 weight LRU) is non-deterministically partially-cleared."
why_3: "The Golden Output gate runs AFTER several upstream gates, so the cuBLAS state it observes is order-dependent. Different orderings of poison/recover produce different gibberish patterns (or transient PASS)."
why_4: "Bisection requires a deterministic oracle. The Golden Output gate doesn't satisfy that requirement because it runs as part of a multi-gate sequence with cross-gate state."
why_5: "There was no minimal, standalone reproducer that isolates the cuBLAS FP8 7B Q4K forward step and exercises ONLY that path with controlled state."
root_cause: "Investigation tooling needs to begin with a minimal, deterministic reproducer (Stage A) before any meaningful bisect or per-layer trace can be trusted."

equations:
reproducer_bit_identity:
formula: "five consecutive invocations of `cublas_fp8_7b_reproducer` produce bit-identical JSON on stdout"
domain: "single host + single GPU + same model file + same binary"
codomain: "set of stdout strings emitted across N=5 runs"
invariants:
- "All 5 JSON objects are byte-equal"
- "cpu_logits_fnv1a is identical across runs"
- "gpu_logits_fnv1a is identical across runs (whether or not it agrees with CPU)"
- "argmax indices and values are identical"
- "correlation field is identical to 6 decimal places"
- "exit code is identical across all 5 runs (1 when bug present, 0 when fixed)"

signature_locks_the_bug:
formula: "current bug signature on noah-Lambda-Vector RTX 4090: gpu_argmax_idx=1057, gpu_logits_fnv1a=6748eb76f78f8683, correlation=0.986986"
domain: "anchored to noah-Lambda-Vector RTX 4090 with the origin/main source tree + canonical Qwen2.5-Coder-7B Q4_K_M GGUF"
codomain: "the JSON-encoded signature"
invariants:
- "Until the bug is fixed, this is the EXPECTED signature on this host"
- "Any deviation either indicates a fix (Stage F) or a different non-determinism source"
- "Stage F shipping flips agrees_with_cpu to true AND changes gpu_logits_fnv1a to match cpu_logits_fnv1a"

proof_obligations:
- type: invariant
property: "Five consecutive runs produce bit-identical JSON"
formal: "for all i,j in 1..=5, run_i.stdout == run_j.stdout"
applies_to: reproducer_bit_identity
- type: invariant
property: "Bug signature matches v1.0.0 lock"
formal: "gpu_argmax_idx == 1057 AND gpu_logits_fnv1a == 6748eb76f78f8683 (pre-fix) on noah-Lambda-Vector"
applies_to: signature_locks_the_bug

falsification_tests:
- id: FALSIFY-CUBLAS-FP8-DET-001
rule: "reproducer produces bit-identical stdout across 5 consecutive runs"
prediction: "5 byte-identical JSON lines on stdout across 5 invocations. Verified 2026-05-22 on noah-Lambda-Vector RTX 4090: 5/5 identical."
test: "bash -c 'BIN=target/release/examples/cublas_fp8_7b_reproducer; M=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf; out=$(MODEL_PATH=$M $BIN 2>/dev/null); for _ in 1 2 3 4; do next=$(MODEL_PATH=$M $BIN 2>/dev/null); [ \"$out\" = \"$next\" ] || exit 1; done; exit 0'"
if_fails: "Reproducer is non-deterministic — Stage B per-layer parity and downstream stages cannot trust their oracle."

- id: FALSIFY-CUBLAS-FP8-DET-002
rule: "Bug signature locks current cuBLAS FP8 7B output"
prediction: "On noah-Lambda-Vector RTX 4090 (sm_89) with canonical 7B Q4_K_M GGUF, gpu_argmax_idx=1057, agrees_with_cpu=false. This is the SIGNATURE that any candidate fix must change (to gpu_argmax_idx=75311, agrees_with_cpu=true)."
test: "bash -c 'out=$(MODEL_PATH=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf target/release/examples/cublas_fp8_7b_reproducer 2>/dev/null); echo \"$out\" | grep -q gpu_argmax_idx'"
if_fails: "Either the bug has been fixed (good — Stage F discharged) OR the reproducer regressed (bad)."

qa_gate:
id: F-CUBLAS-FP8-DET-001
name: "Deterministic cuBLAS FP8 7B Q4K reproducer"
description: "Stage A of SPEC-CUBLAS-FP8-7B-FIX-001 — deterministic standalone reproducer that downstream stages can rely on as their oracle."
checks:
- "reproducer_bit_identity"
- "signature_locks_the_bug"
pass_criteria: "FALSIFY-CUBLAS-FP8-DET-{001,002} all PASS"
147 changes: 147 additions & 0 deletions crates/aprender-serve/examples/cublas_fp8_7b_reproducer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
//! SPEC-CUBLAS-FP8-7B-FIX-001 Stage A — Deterministic reproducer.
//!
//! Outputs a single JSON line capturing the cuBLAS FP8 forward output on a
//! 7B Q4K GGUF for token_id=791 at position 0. Designed for bit-identity
//! comparison across consecutive runs.
//!
//! Run with:
//!
//! ```sh
//! MODEL_PATH=/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf \
//! cargo run --example cublas_fp8_7b_reproducer \
//! --release -p aprender-serve --features cuda
//! ```
//!
//! Expected output (a single line on stdout, wrapped here for readability;
//! all other diagnostics go to stderr):
//!
//! ```json
//! {"cpu_argmax_idx":75311,"cpu_argmax_val":11.554419,"gpu_argmax_idx":1057,
//! "gpu_argmax_val":11.132793,"correlation":0.986986,
//! "cpu_logits_fnv1a":"<16-hex>","gpu_logits_fnv1a":"<16-hex>",
//! "agrees_with_cpu":false}
//! ```
//!
//! Falsifier (see `contracts/cublas-fp8-7b-determinism-v1.yaml`):
//! running this binary 5 times in sequence MUST produce 5 bit-identical
//! JSON lines on stdout. If the bug is fixed, `agrees_with_cpu` will be `true`.
//! If still broken, `false` with reproducible argmax+correlation values.
//!
//! Context: #1864 cuBLAS FP8 7B Q4K gibberish. 2026-05-22 layer-by-layer
//! trace showed Layer 0 Q/K inputs differ between CPU and cuBLAS; logit
//! correlation 0.987 (high), linear fit GPU ≈ 0.96 × CPU + 0.12. This
//! reproducer locks that observation as a numerical signature.

/// Fallback entry point used when the example is built without the `cuda`
/// feature: explains how to enable it on stderr and terminates with status 2.
#[cfg(not(feature = "cuda"))]
fn main() {
    // Status reported when the probe cannot run at all in this build.
    const MISSING_FEATURE_EXIT_CODE: i32 = 2;

    eprintln!("This example requires the 'cuda' feature. Run with --features cuda");
    std::process::exit(MISSING_FEATURE_EXIT_CODE);
}

/// Runs the deterministic cuBLAS FP8 probe and reports the outcome through the
/// process exit code.
///
/// One CPU forward pass and one GPU forward pass are executed for the same
/// token/position; a single JSON line describing both logit vectors is printed
/// on stdout and every other diagnostic goes to stderr.
///
/// Exit codes:
/// * `0` — GPU argmax index equals the CPU argmax index.
/// * `1` — GPU and CPU argmax indices differ.
/// * `2` — the probe could not run (model load, CUDA setup or forward error);
///   nothing is printed on stdout in that case.
///
/// Failures deliberately do NOT fall through to Rust's default `Err` exit
/// status (1): that would make "could not run" indistinguishable from
/// "GPU disagrees with CPU" for bisect/CI callers keyed on the exit code.
///
/// The `Result` return type is kept for interface compatibility; every path
/// ends in `std::process::exit`, so the function never returns normally.
#[cfg(feature = "cuda")]
fn main() -> Result<(), Box<dyn std::error::Error>> {
    /// 64-bit FNV-1a over the little-endian bytes of each `f32`, in slice order.
    /// Hashes incrementally, so no intermediate byte buffer is allocated; the
    /// result equals hashing the concatenated LE bytes in one pass.
    fn fnv1a_64_logits(logits: &[f32]) -> u64 {
        const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
        const PRIME: u64 = 0x100000001b3;

        let mut hash = OFFSET_BASIS;
        for value in logits {
            for &byte in &value.to_le_bytes() {
                hash ^= u64::from(byte);
                hash = hash.wrapping_mul(PRIME);
            }
        }
        hash
    }

    /// Index and value of the first strictly-greatest element.
    /// Returns `(0, f32::NEG_INFINITY)` for an empty slice; NaN entries never
    /// win because `NaN > v` is false.
    fn argmax(logits: &[f32]) -> (usize, f32) {
        logits
            .iter()
            .enumerate()
            .fold((0usize, f32::NEG_INFINITY), |(best_idx, best_val), (i, &x)| {
                if x > best_val {
                    (i, x)
                } else {
                    (best_idx, best_val)
                }
            })
    }

    /// Pearson correlation over the common prefix of the two slices.
    /// All accumulation stays in `f32` and in sequential order so the printed
    /// value remains bit-stable with the recorded signature.
    fn correlation(cpu: &[f32], gpu: &[f32]) -> f32 {
        let len = cpu.len().min(gpu.len());
        let (cpu, gpu) = (&cpu[..len], &gpu[..len]);
        // Intentional lossy cast: the element count only serves as an f32 divisor.
        let count = len as f32;
        let mean_cpu: f32 = cpu.iter().sum::<f32>() / count;
        let mean_gpu: f32 = gpu.iter().sum::<f32>() / count;

        let (mut cov, mut var_cpu, mut var_gpu) = (0.0f32, 0.0f32, 0.0f32);
        for (c, g) in cpu.iter().zip(gpu.iter()) {
            let dc = c - mean_cpu;
            let dg = g - mean_gpu;
            cov += dc * dg;
            var_cpu += dc * dc;
            var_gpu += dg * dg;
        }
        // Small epsilon keeps the denominator non-zero for constant inputs.
        cov / (var_cpu.sqrt() * var_gpu.sqrt() + 1e-10)
    }

    /// Executes the probe and prints the JSON line.
    /// Returns `Ok(true)` when the GPU argmax index matches the CPU one,
    /// `Ok(false)` when it does not, and `Err` when any load/forward step fails.
    fn run_probe() -> Result<bool, Box<dyn std::error::Error>> {
        use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel, OwnedQuantizedModelCuda};

        let path = std::env::var("MODEL_PATH").unwrap_or_else(|_| {
            "/home/noah/models/qwen2.5-coder-7b-instruct-q4_k_m.gguf".to_string()
        });

        // Deterministic probe: same token, same position. Token 791 = canonical
        // probe from CORRECTNESS-011 / layer_by_layer_trace.
        let token_id: u32 = 791;
        let position: usize = 0;

        eprintln!("[cublas_fp8_7b_reproducer] model={} token={} pos={}", path, token_id, position);

        // Load model (CPU side).
        let mapped = MappedGGUFModel::from_path(&path)?;
        let model = OwnedQuantizedModel::from_mapped(&mapped)?;

        // CPU forward.
        let cpu_logits = model.forward(&[token_id])?;
        let (cpu_argmax_idx, cpu_argmax_val) = argmax(&cpu_logits);

        // Read the KV-cache geometry up front so the model can be moved into
        // the CUDA wrapper instead of being cloned just to keep `config()`
        // reachable afterwards.
        let config = model.config();
        let num_layers = config.num_layers;
        let kv_dim = config.num_kv_heads * (config.hidden_dim / config.num_heads);

        // GPU forward via cuBLAS FP8 path (device ordinal 0).
        let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)?;
        cuda_model.preload_weights_gpu()?;
        // NOTE(review): presumably drops any captured decode graph so the
        // forward below does not replay stale state — confirm against realizar.
        cuda_model.clear_decode_graph();

        // Throwaway KV cache for the single-token forward; 100 is the capacity
        // argument the original probe used.
        let mut dummy_cache =
            realizar::gguf::OwnedQuantizedKVCache::new(num_layers, kv_dim, 100);
        let gpu_logits = cuda_model.forward_gpu_resident(token_id, &mut dummy_cache, position)?;
        let (gpu_argmax_idx, gpu_argmax_val) = argmax(&gpu_logits);

        if cpu_logits.len() != gpu_logits.len() {
            // Surface the mismatch instead of truncating silently; stdout stays untouched.
            eprintln!(
                "[cublas_fp8_7b_reproducer] warning: logit length mismatch (cpu={}, gpu={}); \
                 correlation uses the common prefix",
                cpu_logits.len(),
                gpu_logits.len()
            );
        }

        // Linear-fit correlation (matches layer_by_layer_trace's diagnostic).
        let corr = correlation(&cpu_logits, &gpu_logits);

        // FNV-1a fingerprint of logit bytes (LE f32) for bit-identity.
        let cpu_fp = fnv1a_64_logits(&cpu_logits);
        let gpu_fp = fnv1a_64_logits(&gpu_logits);

        let agrees = cpu_argmax_idx == gpu_argmax_idx;

        // Single-line JSON on stdout (all diagnostic prose on stderr).
        println!(
            "{{\"cpu_argmax_idx\":{},\"cpu_argmax_val\":{:.6},\
\"gpu_argmax_idx\":{},\"gpu_argmax_val\":{:.6},\
\"correlation\":{:.6},\
\"cpu_logits_fnv1a\":\"{:016x}\",\"gpu_logits_fnv1a\":\"{:016x}\",\
\"agrees_with_cpu\":{}}}",
            cpu_argmax_idx, cpu_argmax_val,
            gpu_argmax_idx, gpu_argmax_val,
            corr,
            cpu_fp, gpu_fp,
            agrees,
        );

        Ok(agrees)
    }

    // Exit 0 when GPU agrees with CPU (bug fixed); exit 1 when they disagree;
    // exit 2 when the probe itself failed. git bisect / CI invocations rely on
    // this exit code.
    let exit_code = match run_probe() {
        Ok(true) => 0,
        Ok(false) => 1,
        Err(err) => {
            eprintln!("[cublas_fp8_7b_reproducer] error: {}", err);
            2
        }
    };
    std::process::exit(exit_code)
}
Loading