Skip to content

Commit 3ffc8e7

Browse files
committed
fix: fail open on eval verification errors
Keep eval fixture runs informative when a verifier request fails or its response cannot be parsed, and surface those fallbacks as warnings so that live benchmark results stay explainable instead of silently dropping comments. (Made-with: Cursor)
1 parent 061dc43 commit 3ffc8e7

17 files changed

Lines changed: 226 additions & 43 deletions

File tree

TODO.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
- Persist labeled eval runs into `QualityTrend` JSON so live provider sweeps can be trended over time.
1515
- Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
1616
- Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
17-
- Harden verification fallback for live eval runs that return unparseable verification responses.
1817
- [ ] `src/commands/feedback_eval/`
1918
- Correlate feedback calibration with eval-suite category and rule-level performance.
2019

src/commands/eval/command.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,12 @@ use options::prepare_eval_options;
1616
use report::emit_eval_report;
1717

1818
pub async fn eval_command(
19-
config: config::Config,
19+
mut config: config::Config,
2020
fixtures_dir: PathBuf,
2121
output_path: Option<PathBuf>,
2222
options: EvalRunOptions,
2323
) -> Result<()> {
24+
config.verification_fail_open = true;
2425
let execution = run_eval_fixtures(&config, &fixtures_dir, &options).await?;
2526
let prepared_options = prepare_eval_options(&options)?;
2627
let run_metadata = build_eval_run_metadata(&config, &fixtures_dir, &options, &execution);
@@ -62,6 +63,7 @@ fn build_eval_run_metadata(
6263
fixture_name_filters: options.fixture_name_filters.clone(),
6364
max_fixtures: options.max_fixtures,
6465
},
66+
verification_fail_open: config.verification_fail_open,
6567
}
6668
}
6769

src/commands/eval/metrics/suites.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ mod tests {
207207
metadata: None,
208208
rule_metrics: vec![],
209209
rule_summary: None,
210+
warnings: vec![],
210211
failures: vec!["missing finding".to_string()],
211212
}];
212213

@@ -239,6 +240,7 @@ mod tests {
239240
}),
240241
rule_metrics: vec![],
241242
rule_summary: None,
243+
warnings: vec![],
242244
failures: vec![],
243245
},
244246
EvalFixtureResult {
@@ -259,6 +261,7 @@ mod tests {
259261
}),
260262
rule_metrics: vec![],
261263
rule_summary: None,
264+
warnings: vec![],
262265
failures: vec![],
263266
},
264267
];

src/commands/eval/report/build.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,15 @@ pub(in super::super) fn build_eval_report(
1414
let fixtures_total = results.len();
1515
let fixtures_passed = results.iter().filter(|result| result.passed).count();
1616
let fixtures_failed = fixtures_total.saturating_sub(fixtures_passed);
17+
let warnings = results
18+
.iter()
19+
.flat_map(|result| {
20+
result
21+
.warnings
22+
.iter()
23+
.map(|warning| format!("{}: {}", result.fixture, warning))
24+
})
25+
.collect::<Vec<_>>();
1726
let rule_metrics = aggregate_rule_metrics(&results);
1827
let rule_summary = summarize_rule_metrics(&rule_metrics);
1928
let suite_results = build_suite_results(&results);
@@ -30,6 +39,7 @@ pub(in super::super) fn build_eval_report(
3039
benchmark_by_category: breakdowns.by_category,
3140
benchmark_by_language: breakdowns.by_language,
3241
benchmark_by_difficulty: breakdowns.by_difficulty,
42+
warnings,
3343
threshold_failures: Vec::new(),
3444
results,
3545
};

src/commands/eval/report/output.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
2121
if !report.run.fixtures_root.is_empty() {
2222
println!("Fixtures root: {}", report.run.fixtures_root);
2323
}
24+
println!(
25+
"Verification fallback: {}",
26+
if report.run.verification_fail_open {
27+
"fail-open"
28+
} else {
29+
"strict"
30+
}
31+
);
2432
}
2533

2634
println!(
@@ -71,6 +79,9 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
7179
println!(" metadata: {}", labels.join(", "));
7280
}
7381
}
82+
for warning in &result.warnings {
83+
println!(" warning: {}", warning);
84+
}
7485
}
7586

7687
if let Some(rule_summary) = report.rule_summary {
@@ -164,6 +175,10 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
164175
}
165176
}
166177

178+
for warning in &report.warnings {
179+
println!("Warning: {}", warning);
180+
}
181+
167182
for failure in &report.threshold_failures {
168183
println!("Threshold failure: {}", failure);
169184
}

src/commands/eval/runner/execute.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ pub(in super::super) async fn run_eval_fixture(
2121
let review_result =
2222
review_diff_content_raw(&prepared.diff_content, config.clone(), &prepared.repo_path)
2323
.await?;
24+
let warnings = review_result.warnings;
2425
let comments = review_result.comments;
2526
let total_comments = comments.len();
2627
let match_summary = evaluate_fixture_expectations(&prepared.fixture.expect, &comments);
@@ -35,6 +36,7 @@ pub(in super::super) async fn run_eval_fixture(
3536
total_comments,
3637
match_summary,
3738
benchmark_metrics,
39+
warnings,
3840
failures,
3941
))
4042
}

src/commands/eval/runner/execute/result.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ pub(super) fn build_fixture_result(
5757
total_comments: usize,
5858
match_summary: FixtureMatchSummary,
5959
benchmark_metrics: Option<BenchmarkFixtureResult>,
60+
warnings: Vec<String>,
6061
failures: Vec<String>,
6162
) -> EvalFixtureResult {
6263
EvalFixtureResult {
@@ -72,6 +73,7 @@ pub(super) fn build_fixture_result(
7273
metadata: prepared.metadata,
7374
rule_metrics: match_summary.rule_metrics,
7475
rule_summary: match_summary.rule_summary,
76+
warnings,
7577
failures,
7678
}
7779
}

src/commands/eval/thresholds/evaluation/run.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ mod tests {
6060
benchmark_by_category: Default::default(),
6161
benchmark_by_language: Default::default(),
6262
benchmark_by_difficulty: Default::default(),
63+
warnings: vec![],
6364
threshold_failures: vec![],
6465
results: vec![],
6566
};
@@ -102,6 +103,7 @@ mod tests {
102103
benchmark_by_category: Default::default(),
103104
benchmark_by_language: Default::default(),
104105
benchmark_by_difficulty: Default::default(),
106+
warnings: vec![],
105107
threshold_failures: vec![],
106108
results: vec![],
107109
};
@@ -126,6 +128,7 @@ mod tests {
126128
benchmark_by_category: Default::default(),
127129
benchmark_by_language: Default::default(),
128130
benchmark_by_difficulty: Default::default(),
131+
warnings: vec![],
129132
threshold_failures: vec![],
130133
results: vec![],
131134
};

src/commands/eval/types/report.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ pub(in super::super) struct EvalRunMetadata {
4545
pub(in super::super) base_url: Option<String>,
4646
#[serde(default)]
4747
pub(in super::super) filters: EvalRunFilters,
48+
#[serde(default)]
49+
pub(in super::super) verification_fail_open: bool,
4850
}
4951

5052
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -112,6 +114,8 @@ pub(in super::super) struct EvalFixtureResult {
112114
#[serde(default)]
113115
pub(in super::super) rule_summary: Option<EvalRuleScoreSummary>,
114116
#[serde(default)]
117+
pub(in super::super) warnings: Vec<String>,
118+
#[serde(default)]
115119
pub(in super::super) failures: Vec<String>,
116120
}
117121

@@ -154,6 +158,8 @@ pub(in super::super) struct EvalReport {
154158
#[serde(default)]
155159
pub(in super::super) benchmark_by_difficulty: HashMap<String, BenchmarkAggregateMetrics>,
156160
#[serde(default)]
161+
pub(in super::super) warnings: Vec<String>,
162+
#[serde(default)]
157163
pub(in super::super) threshold_failures: Vec<String>,
158164
#[serde(default)]
159165
pub(in super::super) results: Vec<EvalFixtureResult>,

src/config.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,11 @@ pub struct Config {
306306
#[serde(default = "default_verification_max_comments")]
307307
pub verification_max_comments: usize,
308308

309+
/// When true, keep original comments if the verification pass fails or
310+
/// returns an unparseable response (default false).
311+
#[serde(default = "default_false")]
312+
pub verification_fail_open: bool,
313+
309314
/// Enable enhanced feedback loop with per-category/file-pattern tracking
310315
/// and feedback-adjusted confidence scores (default false).
311316
#[serde(default)]
@@ -506,6 +511,7 @@ impl Default for Config {
506511
verification_model_role: default_verification_model_role(),
507512
verification_min_score: default_verification_min_score(),
508513
verification_max_comments: default_verification_max_comments(),
514+
verification_fail_open: false,
509515
enhanced_feedback: false,
510516
feedback_min_observations: default_feedback_min_observations(),
511517
semantic_rag: false,

0 commit comments

Comments
 (0)