Skip to content
96 changes: 96 additions & 0 deletions contracts/apr-validate-quality-threshold-v1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
metadata:
version: 1.0.0
created: '2026-05-22'
author: PAIML Engineering
description: "`apr validate --quality` must gate pass/fail on the *implemented* check denominator, not the aspirational 100-point denominator. Stubbed `Skip(Not implemented)` checks cannot count against working models — otherwise every valid APR file scores Grade F until every placeholder is filled in."
kind: pattern
references:
- "paiml/aprender#1866 (apr validate --quality: 22/25 checks 'Pending — Not implemented' → working models score 3/100, exit 5)"
- "crates/apr-cli/src/commands/validate.rs (score-threshold gate)"
- "crates/aprender-core/src/format/validation.rs (ValidationReport)"
registry: true
tags:
- cli
- validate
- threshold
- five-whys

five_whys:
symptom: "`apr validate /home/noah/models/qwen2.5-coder-1.5b-instruct-q4k.apr --quality --strict` reports 3/100 Grade F and exits 5 (ValidationFailed) on a model that `apr qa` says ✓ ALL GATES PASSED and that produces correct inference (`apr run` returns '2 + 2 equals 4.')"
why_1: "validate.rs:96 gates on `report.total_score < 50` against the full 100-point QA checklist"
why_2: "22 of 25 checks return `CheckStatus::Skip(\"Not implemented\")` with `points = 0` — they contribute nothing to total_score but also reduce the achievable max to 3 points"
why_3: "ValidationReport.total_score is a raw sum of awarded points; there's no `implemented_max` denominator that would let the gate compute a meaningful percentage"
why_4: "The QA checklist was designed as an aspirational 100-point ceiling (APR-SPEC-v2-draft.md Section 11) but the implementation grew incrementally — gate code assumed all checks would be implemented; they never were"
why_5: "No provable contract bound the threshold to the implementation surface — 'score thresholds are relative to runnable checks' was an unstated invariant"
root_cause: "Threshold gate uses the aspirational 100-point denominator instead of the runnable denominator (count of non-Skip checks). The fix moves the gate to `implemented_score_pct() < 50` and treats a fully-stubbed suite as informational (None) rather than a hard fail."

equations:
implemented_denominator:
formula: "implemented_max(report) = count{ c in report.checks : c.status != Skip }"
domain: "validation reports"
codomain: "u8 count of runnable checks"
invariants:
- "Skip checks (Skip(reason)) are excluded from the denominator"
- "Pass, Fail, Warn checks contribute 1 to implemented_max regardless of points"

implemented_score_pct:
formula: "pct(report) = if implemented_max > 0 then (pass_count / implemented_max) * 100 else None"
domain: "validation reports"
codomain: "Option<f64>"
invariants:
- "When no checks ran (all Skip), returns None — caller treats as informational, not pass/fail"
- "When at least one check ran, returns Some(pct) with pct in [0, 100]"
- "All-Pass with N runnable checks returns Some(100.0)"

threshold_gate_on_implemented:
formula: "fail_gate(report) = implemented_score_pct(report) is Some(pct) AND pct < 50"
domain: "any apr validate invocation without --skip-contract"
codomain: "boolean: should exit 5 (ValidationFailed)"
invariants:
- "Models scoring 100% on implemented checks PASS, regardless of total_score"
- "Models scoring 0% on implemented checks FAIL (clear breakage signal)"
- "Fully-stubbed reports (implemented_max == 0) PASS as informational"
- "apr qa is the canonical pass/fail gate per CLAUDE.md; `apr validate --quality` complements with structural integrity audit"

proof_obligations:
- type: invariant
property: "Working models pass the threshold"
formal: "all_pass(report) AND implemented_max(report) > 0 ⟹ NOT fail_gate(report)"
applies_to: threshold_gate_on_implemented
- type: invariant
property: "Fully-stubbed reports do not fail"
formal: "implemented_max(report) == 0 ⟹ NOT fail_gate(report)"
applies_to: threshold_gate_on_implemented
- type: invariant
property: "Mostly-failing models do fail (3 of 4 runnable checks failing = 25% < 50%; exactly 50% does not trip the gate)"
formal: "implemented_max = 4 AND fail_count = 3 ⟹ fail_gate(report)"
applies_to: threshold_gate_on_implemented

falsification_tests:
- id: FALSIFY-VALIDATE-QUALITY-001
rule: "implemented_score_pct returns None when entire suite is stubbed"
prediction: "ValidationReport with only Skip checks returns None"
test: "cargo test -p aprender-core --lib test_implemented_score_pct_none_when_all_stubbed"
if_fails: "Stubbed suites still count against the threshold"

- id: FALSIFY-VALIDATE-QUALITY-002
rule: "implemented_score_pct returns 100 when all runnable checks pass"
prediction: "ValidationReport with 3 Pass + 22 Skip returns Some(100.0)"
test: "cargo test -p aprender-core --lib test_implemented_score_pct_100_when_all_pass"
if_fails: "Working models still mis-scored against unimplemented denominator"

- id: FALSIFY-VALIDATE-QUALITY-003
rule: "validate.rs gates on implemented_score_pct, not total_score"
prediction: "validate.rs source contains implemented_score_pct call and no `total_score < 50` bare comparison in the gate"
test: "grep -q 'implemented_score_pct' crates/apr-cli/src/commands/validate.rs && ! grep -nE 'report\\.total_score *< *50' crates/apr-cli/src/commands/validate.rs"
if_fails: "Threshold gate still uses the aspirational 100-point denominator"

qa_gate:
id: F-VALIDATE-QUALITY-001
name: "apr validate --quality threshold gates on implemented checks"
description: "Pass/fail threshold must use percentage of implemented (non-Skip) checks. A fully-stubbed suite must be treated as informational, not a hard fail."
checks:
- "implemented_denominator"
- "implemented_score_pct"
- "threshold_gate_on_implemented"
pass_criteria: "FALSIFY-VALIDATE-QUALITY-{001,002,003} all PASS"
23 changes: 18 additions & 5 deletions crates/apr-cli/src/commands/validate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,24 @@ fn run_apr_validation(

// GH-647: Exit non-zero when validation shows contract violations
// GH-642: --skip-contract bypasses the contract score threshold gate
if !skip_contract && report.total_score < 50 {
return Err(CliError::ValidationFailed(format!(
"Score {}/100 (below 50% threshold)",
report.total_score
)));
// #1866: gate on percentage of *implemented* checks (Pass/Fail/Warn),
// not the full 100-point denominator. Stubbed "Pending" checks
// scored as Skip — counting them against the model produced
// Grade F on every valid APR file until every stub was filled in.
// See apr-validate-quality-threshold-v1.yaml.
if !skip_contract {
if let Some(pct) = report.implemented_score_pct() {
if pct < 50.0 {
let max = report.implemented_max();
return Err(CliError::ValidationFailed(format!(
"Score {}/{max} implemented checks passed ({:.0}%) — below 50% threshold",
report.total_score, pct
)));
}
}
// implemented_score_pct() == None: entire QA suite is stubbed.
// Treat as informational, not a hard fail. (apr qa remains the
// canonical pass/fail gate per CLAUDE.md.)
}

Ok(())
Expand Down
36 changes: 36 additions & 0 deletions crates/aprender-core/src/format/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,42 @@ impl ValidationReport {
pub fn failed_checks(&self) -> Vec<&ValidationCheck> {
self.checks.iter().filter(|c| c.status.is_fail()).collect()
}

/// Number of checks that produced a real verdict, i.e. every check whose
/// status is anything other than `CheckStatus::Skip`.
///
/// Contract: apr-validate-quality-threshold-v1 (#1866). Much of the
/// 100-point QA checklist is still `Skip("Not implemented")` placeholders,
/// so a gate measured against the full 100-point denominator grades every
/// working model F until the last stub is filled in. This count is the
/// denominator callers use for an implementation-relative percentage.
///
/// The result saturates at `u8::MAX` if more than 255 checks ran.
#[must_use]
pub fn implemented_max(&self) -> u8 {
    self.checks
        .iter()
        .filter(|check| !matches!(check.status, CheckStatus::Skip(_)))
        // Saturating tally: clamps at 255 instead of wrapping.
        .fold(0_u8, |ran, _| ran.saturating_add(1))
}

/// Percentage of *implemented* (non-Skip) checks that passed. Returns
/// `None` when no checks have run (entire QA suite stubbed), in which
/// case callers should treat the score as informational rather than
/// a hard fail (#1866).
#[must_use]
pub fn implemented_score_pct(&self) -> Option<f64> {
let max = self.implemented_max();
if max == 0 {
return None;
}
let passed: u8 = self
.checks
.iter()
.filter(|c| c.status.is_pass())
.count()
.min(u8::MAX as usize) as u8;
Some((f64::from(passed) / f64::from(max)) * 100.0)
}
}

impl Default for ValidationReport {
Expand Down
71 changes: 71 additions & 0 deletions crates/aprender-core/src/format/validation_tests_report.rs
Original file line number Diff line number Diff line change
Expand Up @@ -488,4 +488,75 @@ mod tests_report {
assert_eq!(stats.min, 42.0);
assert_eq!(stats.max, 42.0);
}

// ========================================================================
// #1866: implemented_score_pct + implemented_max
//
// Contract: apr-validate-quality-threshold-v1
// ========================================================================

// Test helper: appends one check with the given id and status to `report`.
// A `Pass` status is awarded 1 point; every other status gets 0.
fn push_check(report: &mut ValidationReport, id: u8, status: CheckStatus) {
    let points = match &status {
        CheckStatus::Pass => 1,
        _ => 0,
    };
    let check = ValidationCheck {
        id,
        name: "test",
        category: Category::Structure,
        status,
        points,
    };
    report.add_check(check);
}

/// FALSIFY-VALIDATE-QUALITY-001: a report made only of Skip checks has an
/// implemented denominator of 0 and therefore no percentage (`None`).
#[test]
fn test_implemented_score_pct_none_when_all_stubbed() {
    let mut report = ValidationReport::new();
    (1..=25_u8).for_each(|id| {
        push_check(&mut report, id, CheckStatus::Skip("Not implemented".into()));
    });
    assert_eq!(report.implemented_max(), 0);
    assert_eq!(report.implemented_score_pct(), None);
}

/// FALSIFY-VALIDATE-QUALITY-002: 3 Pass followed by 22 Skip scores 100%
/// of implemented checks (the #1866 reproducer shape).
#[test]
fn test_implemented_score_pct_100_when_all_pass() {
    let mut report = ValidationReport::new();
    for id in 1..=25_u8 {
        // Ids 1-3 pass; ids 4-25 are unimplemented placeholders.
        let status = if id <= 3 {
            CheckStatus::Pass
        } else {
            CheckStatus::Skip("Not implemented".into())
        };
        push_check(&mut report, id, status);
    }
    // Per #1866: this is the 1.5B Q4K APR case — must not fail the gate.
    assert_eq!(report.implemented_max(), 3);
    assert_eq!(report.implemented_score_pct(), Some(100.0));
}

/// Boundary case: 2 Pass + 2 Fail + 21 Skip scores exactly 50% of the
/// 4 implemented checks. The gate fires only at `< 50`, so this value is
/// not below the threshold.
#[test]
fn test_implemented_score_pct_mixed() {
    let mut report = ValidationReport::new();
    for id in 1..=25_u8 {
        let status = match id {
            1 | 2 => CheckStatus::Pass,
            3 | 4 => CheckStatus::Fail("bad".into()),
            _ => CheckStatus::Skip("Not implemented".into()),
        };
        push_check(&mut report, id, status);
    }
    assert_eq!(report.implemented_max(), 4);
    let pct = report.implemented_score_pct().expect("some");
    assert!((pct - 50.0).abs() < f64::EPSILON, "expected 50.0, got {pct}");
}

/// Below-threshold case: 1 Pass + 3 Fail is 25% of implemented checks,
/// which is under the 50% gate — the gate must fire.
#[test]
fn test_implemented_score_pct_below_threshold() {
    let mut report = ValidationReport::new();
    push_check(&mut report, 1, CheckStatus::Pass);
    for id in 2..=4_u8 {
        push_check(&mut report, id, CheckStatus::Fail("bad".into()));
    }
    let pct = report.implemented_score_pct().expect("some");
    assert!(pct < 50.0, "expected < 50, got {pct}");
}
}
Loading