fix: fail open on eval verification errors

haasonsaas · haasonsaas · commit 3ffc8e7537e2 · 2026-03-13T12:00:10.000-07:00
Keep eval fixture runs informative when a verifier request fails or its response cannot be parsed: retain the original comments instead of silently dropping them, and surface each fallback as a warning so live benchmark results stay explainable.

Made-with: Cursor
diff --git a/TODO.md b/TODO.md
@@ -14,7 +14,6 @@
   - Persist labeled eval runs into `QualityTrend` JSON so live provider sweeps can be trended over time.
   - Add suite/category/language baseline comparisons instead of only whole-run threshold gates.
   - Expand `review-depth-core` with authz, supply-chain, and async-correctness benchmark packs.
-  - Harden verification fallback for live eval runs that return unparseable verification responses.
 - [ ] `src/commands/feedback_eval/`
   - Correlate feedback calibration with eval-suite category and rule-level performance.
 
diff --git a/src/commands/eval/command.rs b/src/commands/eval/command.rs
@@ -16,11 +16,12 @@ use options::prepare_eval_options;
 use report::emit_eval_report;
 
 pub async fn eval_command(
-    config: config::Config,
+    mut config: config::Config,
     fixtures_dir: PathBuf,
     output_path: Option<PathBuf>,
     options: EvalRunOptions,
 ) -> Result<()> {
+    config.verification_fail_open = true;
     let execution = run_eval_fixtures(&config, &fixtures_dir, &options).await?;
     let prepared_options = prepare_eval_options(&options)?;
     let run_metadata = build_eval_run_metadata(&config, &fixtures_dir, &options, &execution);
@@ -62,6 +63,7 @@ fn build_eval_run_metadata(
             fixture_name_filters: options.fixture_name_filters.clone(),
             max_fixtures: options.max_fixtures,
         },
+        verification_fail_open: config.verification_fail_open,
     }
 }
 
diff --git a/src/commands/eval/metrics/suites.rs b/src/commands/eval/metrics/suites.rs
@@ -207,6 +207,7 @@ mod tests {
             metadata: None,
             rule_metrics: vec![],
             rule_summary: None,
+            warnings: vec![],
             failures: vec!["missing finding".to_string()],
         }];
 
@@ -239,6 +240,7 @@ mod tests {
                 }),
                 rule_metrics: vec![],
                 rule_summary: None,
+                warnings: vec![],
                 failures: vec![],
             },
             EvalFixtureResult {
@@ -259,6 +261,7 @@ mod tests {
                 }),
                 rule_metrics: vec![],
                 rule_summary: None,
+                warnings: vec![],
                 failures: vec![],
             },
         ];
diff --git a/src/commands/eval/report/build.rs b/src/commands/eval/report/build.rs
@@ -14,6 +14,15 @@ pub(in super::super) fn build_eval_report(
     let fixtures_total = results.len();
     let fixtures_passed = results.iter().filter(|result| result.passed).count();
     let fixtures_failed = fixtures_total.saturating_sub(fixtures_passed);
+    let warnings = results
+        .iter()
+        .flat_map(|result| {
+            result
+                .warnings
+                .iter()
+                .map(|warning| format!("{}: {}", result.fixture, warning))
+        })
+        .collect::<Vec<_>>();
     let rule_metrics = aggregate_rule_metrics(&results);
     let rule_summary = summarize_rule_metrics(&rule_metrics);
     let suite_results = build_suite_results(&results);
@@ -30,6 +39,7 @@ pub(in super::super) fn build_eval_report(
         benchmark_by_category: breakdowns.by_category,
         benchmark_by_language: breakdowns.by_language,
         benchmark_by_difficulty: breakdowns.by_difficulty,
+        warnings,
         threshold_failures: Vec::new(),
         results,
     };
diff --git a/src/commands/eval/report/output.rs b/src/commands/eval/report/output.rs
@@ -21,6 +21,14 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
         if !report.run.fixtures_root.is_empty() {
             println!("Fixtures root: {}", report.run.fixtures_root);
         }
+        println!(
+            "Verification fallback: {}",
+            if report.run.verification_fail_open {
+                "fail-open"
+            } else {
+                "strict"
+            }
+        );
     }
 
     println!(
@@ -71,6 +79,9 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
                 println!("  metadata: {}", labels.join(", "));
             }
         }
+        for warning in &result.warnings {
+            println!("  warning: {}", warning);
+        }
     }
 
     if let Some(rule_summary) = report.rule_summary {
@@ -164,6 +175,10 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {
         }
     }
 
+    for warning in &report.warnings {
+        println!("Warning: {}", warning);
+    }
+
     for failure in &report.threshold_failures {
         println!("Threshold failure: {}", failure);
     }
diff --git a/src/commands/eval/runner/execute.rs b/src/commands/eval/runner/execute.rs
@@ -21,6 +21,7 @@ pub(in super::super) async fn run_eval_fixture(
     let review_result =
         review_diff_content_raw(&prepared.diff_content, config.clone(), &prepared.repo_path)
             .await?;
+    let warnings = review_result.warnings;
     let comments = review_result.comments;
     let total_comments = comments.len();
     let match_summary = evaluate_fixture_expectations(&prepared.fixture.expect, &comments);
@@ -35,6 +36,7 @@ pub(in super::super) async fn run_eval_fixture(
         total_comments,
         match_summary,
         benchmark_metrics,
+        warnings,
         failures,
     ))
 }
diff --git a/src/commands/eval/runner/execute/result.rs b/src/commands/eval/runner/execute/result.rs
@@ -57,6 +57,7 @@ pub(super) fn build_fixture_result(
     total_comments: usize,
     match_summary: FixtureMatchSummary,
     benchmark_metrics: Option<BenchmarkFixtureResult>,
+    warnings: Vec<String>,
     failures: Vec<String>,
 ) -> EvalFixtureResult {
     EvalFixtureResult {
@@ -72,6 +73,7 @@ pub(super) fn build_fixture_result(
         metadata: prepared.metadata,
         rule_metrics: match_summary.rule_metrics,
         rule_summary: match_summary.rule_summary,
+        warnings,
         failures,
     }
 }
diff --git a/src/commands/eval/thresholds/evaluation/run.rs b/src/commands/eval/thresholds/evaluation/run.rs
@@ -60,6 +60,7 @@ mod tests {
             benchmark_by_category: Default::default(),
             benchmark_by_language: Default::default(),
             benchmark_by_difficulty: Default::default(),
+            warnings: vec![],
             threshold_failures: vec![],
             results: vec![],
         };
@@ -102,6 +103,7 @@ mod tests {
             benchmark_by_category: Default::default(),
             benchmark_by_language: Default::default(),
             benchmark_by_difficulty: Default::default(),
+            warnings: vec![],
             threshold_failures: vec![],
             results: vec![],
         };
@@ -126,6 +128,7 @@ mod tests {
             benchmark_by_category: Default::default(),
             benchmark_by_language: Default::default(),
             benchmark_by_difficulty: Default::default(),
+            warnings: vec![],
             threshold_failures: vec![],
             results: vec![],
         };
diff --git a/src/commands/eval/types/report.rs b/src/commands/eval/types/report.rs
@@ -45,6 +45,8 @@ pub(in super::super) struct EvalRunMetadata {
     pub(in super::super) base_url: Option<String>,
     #[serde(default)]
     pub(in super::super) filters: EvalRunFilters,
+    #[serde(default)]
+    pub(in super::super) verification_fail_open: bool,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -112,6 +114,8 @@ pub(in super::super) struct EvalFixtureResult {
     #[serde(default)]
     pub(in super::super) rule_summary: Option<EvalRuleScoreSummary>,
     #[serde(default)]
+    pub(in super::super) warnings: Vec<String>,
+    #[serde(default)]
     pub(in super::super) failures: Vec<String>,
 }
 
@@ -154,6 +158,8 @@ pub(in super::super) struct EvalReport {
     #[serde(default)]
     pub(in super::super) benchmark_by_difficulty: HashMap<String, BenchmarkAggregateMetrics>,
     #[serde(default)]
+    pub(in super::super) warnings: Vec<String>,
+    #[serde(default)]
     pub(in super::super) threshold_failures: Vec<String>,
     #[serde(default)]
     pub(in super::super) results: Vec<EvalFixtureResult>,
diff --git a/src/config.rs b/src/config.rs
@@ -306,6 +306,11 @@ pub struct Config {
     #[serde(default = "default_verification_max_comments")]
     pub verification_max_comments: usize,
 
+    /// When true, keep original comments if the verification pass fails or
+    /// returns an unparseable response (default false).
+    #[serde(default = "default_false")]
+    pub verification_fail_open: bool,
+
     /// Enable enhanced feedback loop with per-category/file-pattern tracking
     /// and feedback-adjusted confidence scores (default false).
     #[serde(default)]
@@ -506,6 +511,7 @@ impl Default for Config {
             verification_model_role: default_verification_model_role(),
             verification_min_score: default_verification_min_score(),
             verification_max_comments: default_verification_max_comments(),
+            verification_fail_open: false,
             enhanced_feedback: false,
             feedback_min_observations: default_feedback_min_observations(),
             semantic_rag: false,
diff --git a/src/review/pipeline/chunking.rs b/src/review/pipeline/chunking.rs
@@ -61,4 +61,5 @@ fn merge_chunk_result(merged: &mut ReviewResult, chunk_result: ReviewResult) {
         *merged.comments_by_pass.entry(pass).or_insert(0) += count;
     }
     merged.hotspots.extend(chunk_result.hotspots);
+    merged.warnings.extend(chunk_result.warnings);
 }
diff --git a/src/review/pipeline/postprocess.rs b/src/review/pipeline/postprocess.rs
@@ -58,7 +58,8 @@ pub(super) async fn run_postprocess(
         .plugin_manager
         .run_post_processors(all_comments, &repo_path_str)
         .await?;
-    let processed_comments = apply_verification_pass(processed_comments, services, session).await;
+    let verification_output = apply_verification_pass(processed_comments, services, session).await;
+    let processed_comments = verification_output.comments;
 
     let processed_comments = if services.config.semantic_feedback {
         apply_semantic_feedback_adjustment(
@@ -103,5 +104,6 @@ pub(super) async fn run_postprocess(
         comments_by_pass,
         hotspots: session.enhanced_ctx.hotspots.clone(),
         agent_activity,
+        warnings: verification_output.warnings,
     })
 }
diff --git a/src/review/pipeline/postprocess/verification.rs b/src/review/pipeline/postprocess/verification.rs
@@ -6,50 +6,53 @@ use super::super::comments::is_analyzer_comment;
 use super::super::services::PipelineServices;
 use super::super::session::ReviewSession;
 
+pub(super) struct VerificationPassOutput {
+    pub(super) comments: Vec<core::Comment>,
+    pub(super) warnings: Vec<String>,
+}
+
 pub(super) async fn apply_verification_pass(
     comments: Vec<core::Comment>,
     services: &PipelineServices,
     session: &ReviewSession,
-) -> Vec<core::Comment> {
+) -> VerificationPassOutput {
     let (analyzer_comments, llm_comments): (Vec<_>, Vec<_>) =
         comments.into_iter().partition(is_analyzer_comment);
 
-    let verified_llm_comments = if services.config.verification_pass
+    let (verified_llm_comments, warnings) = if services.config.verification_pass
         && !llm_comments.is_empty()
         && llm_comments.len() <= services.config.verification_max_comments
     {
         let comment_count_before = llm_comments.len();
-        match super::super::super::verification::verify_comments(
+        let summary = super::super::super::verification::verify_comments(
             llm_comments,
             &session.diffs,
             &session.source_files,
             &session.verification_context,
             services.verification_adapter.as_ref(),
             services.config.verification_min_score,
+            services.config.verification_fail_open,
         )
-        .await
-        {
-            Ok(verified) => {
-                info!(
-                    "Verification pass: {}/{} comments passed",
-                    verified.len(),
-                    comment_count_before
-                );
-                verified
-            }
-            Err(error) => {
-                warn!(
-                    "Verification pass failed, dropping unverified LLM comments: {}",
-                    error
-                );
-                Vec::new()
-            }
+        .await;
+
+        for warning_message in &summary.warnings {
+            warn!("{}", warning_message);
         }
+
+        info!(
+            "Verification pass: {}/{} comments passed",
+            summary.comments.len(),
+            comment_count_before
+        );
+        (summary.comments, summary.warnings)
     } else {
-        llm_comments
+        (llm_comments, Vec::new())
     };
 
     let mut processed_comments = analyzer_comments;
     processed_comments.extend(verified_llm_comments);
-    processed_comments
+    VerificationPassOutput {
+        comments: processed_comments,
+        warnings,
+    }
 }
diff --git a/src/review/pipeline/types.rs b/src/review/pipeline/types.rs
@@ -22,6 +22,7 @@ pub struct ReviewResult {
     pub comments_by_pass: HashMap<String, usize>,
     pub hotspots: Vec<core::multi_pass::HotspotResult>,
     pub agent_activity: Option<AgentActivity>,
+    pub warnings: Vec<String>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/src/review/verification.rs b/src/review/verification.rs
diff --git a/src/review/verification/parser.rs b/src/review/verification/parser.rs
diff --git a/src/review/verification/tests.rs b/src/review/verification/tests.rs

Original file line number	Diff line number	Diff line change
`@@ -21,6 +21,14 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {`
`21`	`21`	`if !report.run.fixtures_root.is_empty() {`
`22`	`22`	`println!("Fixtures root: {}", report.run.fixtures_root);`
`23`	`23`	`}`
	`24`	`+ println!(`
	`25`	`+ "Verification fallback: {}",`
	`26`	`+ if report.run.verification_fail_open {`
	`27`	`+ "fail-open"`
	`28`	`+ } else {`
	`29`	`+ "strict"`
	`30`	`+ }`
	`31`	`+ );`
`24`	`32`	`}`
`25`	`33`
`26`	`34`	`println!(`
`@@ -71,6 +79,9 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {`
`71`	`79`	`println!(" metadata: {}", labels.join(", "));`
`72`	`80`	`}`
`73`	`81`	`}`
	`82`	`+ for warning in &result.warnings {`
	`83`	`+ println!(" warning: {}", warning);`
	`84`	`+ }`
`74`	`85`	`}`
`75`	`86`
`76`	`87`	`if let Some(rule_summary) = report.rule_summary {`
`@@ -164,6 +175,10 @@ pub(in super::super) fn print_eval_report(report: &EvalReport) {`
`164`	`175`	`}`
`165`	`176`	`}`
`166`	`177`
	`178`	`+ for warning in &report.warnings {`
	`179`	`+ println!("Warning: {}", warning);`
	`180`	`+ }`
	`181`	`+`
`167`	`182`	`for failure in &report.threshold_failures {`
`168`	`183`	`println!("Threshold failure: {}", failure);`
`169`	`184`	`}`