Skip to content

Commit 0399371

Browse files
committed
refactor: split threshold evaluation helpers
Separate minimum, drop, and rule-score evaluation helpers so future threshold policy changes stay easier to isolate and verify. Made-with: Cursor
1 parent 99bc404 commit 0399371

5 files changed

Lines changed: 263 additions & 208 deletions

File tree

Lines changed: 10 additions & 208 deletions
Original file line numberDiff line numberDiff line change
@@ -1,208 +1,10 @@
1-
use std::collections::HashMap;
2-
3-
use super::super::{EvalReport, EvalRuleMetrics};
4-
use super::EvalThresholdOptions;
5-
6-
pub(in super::super) fn evaluate_eval_thresholds(
7-
current: &EvalReport,
8-
baseline: Option<&EvalReport>,
9-
options: &EvalThresholdOptions,
10-
) -> Vec<String> {
11-
let mut failures = Vec::new();
12-
let current_micro_f1 = current
13-
.rule_summary
14-
.map(|summary| summary.micro_f1)
15-
.unwrap_or(0.0);
16-
let current_macro_f1 = current
17-
.rule_summary
18-
.map(|summary| summary.macro_f1)
19-
.unwrap_or(0.0);
20-
21-
if let Some(threshold) = options.min_micro_f1 {
22-
let threshold = threshold.clamp(0.0, 1.0);
23-
if current_micro_f1 < threshold {
24-
failures.push(format!(
25-
"micro-F1 {:.3} is below minimum {:.3}",
26-
current_micro_f1, threshold
27-
));
28-
}
29-
}
30-
31-
if let Some(threshold) = options.min_macro_f1 {
32-
let threshold = threshold.clamp(0.0, 1.0);
33-
if current_macro_f1 < threshold {
34-
failures.push(format!(
35-
"macro-F1 {:.3} is below minimum {:.3}",
36-
current_macro_f1, threshold
37-
));
38-
}
39-
}
40-
41-
let current_by_rule = build_rule_f1_map(&current.rule_metrics);
42-
for threshold in &options.min_rule_f1 {
43-
let current = current_by_rule
44-
.get(&threshold.rule_id)
45-
.copied()
46-
.unwrap_or(0.0);
47-
if current < threshold.value {
48-
failures.push(format!(
49-
"rule '{}' F1 {:.3} is below minimum {:.3}",
50-
threshold.rule_id, current, threshold.value
51-
));
52-
}
53-
}
54-
55-
if options.max_micro_f1_drop.is_some() || !options.max_rule_f1_drop.is_empty() {
56-
let Some(baseline) = baseline else {
57-
failures.push(
58-
"baseline report is required for drop-based thresholds (--baseline)".to_string(),
59-
);
60-
return failures;
61-
};
62-
63-
let baseline_summary = baseline.rule_summary.unwrap_or_default();
64-
if let Some(max_drop) = options.max_micro_f1_drop {
65-
let max_drop = max_drop.clamp(0.0, 1.0);
66-
let drop = (baseline_summary.micro_f1 - current_micro_f1).max(0.0);
67-
if drop > max_drop {
68-
failures.push(format!(
69-
"micro-F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
70-
drop, max_drop, baseline_summary.micro_f1, current_micro_f1
71-
));
72-
}
73-
}
74-
75-
if !options.max_rule_f1_drop.is_empty() {
76-
let baseline_by_rule = build_rule_f1_map(&baseline.rule_metrics);
77-
for threshold in &options.max_rule_f1_drop {
78-
let baseline_f1 = baseline_by_rule
79-
.get(&threshold.rule_id)
80-
.copied()
81-
.unwrap_or(0.0);
82-
let current_f1 = current_by_rule
83-
.get(&threshold.rule_id)
84-
.copied()
85-
.unwrap_or(0.0);
86-
let drop = (baseline_f1 - current_f1).max(0.0);
87-
if drop > threshold.value {
88-
failures.push(format!(
89-
"rule '{}' F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
90-
threshold.rule_id, drop, threshold.value, baseline_f1, current_f1
91-
));
92-
}
93-
}
94-
}
95-
}
96-
97-
failures
98-
}
99-
100-
fn build_rule_f1_map(metrics: &[EvalRuleMetrics]) -> HashMap<String, f32> {
101-
let mut by_rule = HashMap::new();
102-
for metric in metrics {
103-
by_rule.insert(metric.rule_id.to_ascii_lowercase(), metric.f1);
104-
}
105-
by_rule
106-
}
107-
108-
#[cfg(test)]
109-
mod tests {
110-
use super::super::super::{EvalReport, EvalRuleMetrics, EvalRuleScoreSummary};
111-
use super::*;
112-
use crate::commands::eval::thresholds::EvalRuleThreshold;
113-
114-
#[test]
115-
fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() {
116-
let report = EvalReport {
117-
fixtures_total: 1,
118-
fixtures_passed: 1,
119-
fixtures_failed: 0,
120-
rule_metrics: vec![],
121-
rule_summary: Some(EvalRuleScoreSummary {
122-
micro_precision: 1.0,
123-
micro_recall: 1.0,
124-
micro_f1: 1.0,
125-
macro_precision: 1.0,
126-
macro_recall: 1.0,
127-
macro_f1: 1.0,
128-
}),
129-
suite_results: vec![],
130-
threshold_failures: vec![],
131-
results: vec![],
132-
};
133-
let options = EvalThresholdOptions {
134-
max_micro_f1_drop: Some(0.05),
135-
min_micro_f1: None,
136-
min_macro_f1: None,
137-
min_rule_f1: vec![],
138-
max_rule_f1_drop: vec![],
139-
};
140-
141-
let failures = evaluate_eval_thresholds(&report, None, &options);
142-
143-
assert_eq!(
144-
failures,
145-
vec!["baseline report is required for drop-based thresholds (--baseline)".to_string()]
146-
);
147-
}
148-
149-
#[test]
150-
fn test_evaluate_eval_thresholds_checks_rule_specific_drop() {
151-
let current = EvalReport {
152-
fixtures_total: 1,
153-
fixtures_passed: 1,
154-
fixtures_failed: 0,
155-
rule_metrics: vec![EvalRuleMetrics {
156-
rule_id: "sec.sql.injection".to_string(),
157-
expected: 1,
158-
predicted: 1,
159-
true_positives: 0,
160-
false_positives: 1,
161-
false_negatives: 1,
162-
precision: 0.0,
163-
recall: 0.0,
164-
f1: 0.0,
165-
}],
166-
rule_summary: Some(EvalRuleScoreSummary::default()),
167-
suite_results: vec![],
168-
threshold_failures: vec![],
169-
results: vec![],
170-
};
171-
let baseline = EvalReport {
172-
fixtures_total: 1,
173-
fixtures_passed: 1,
174-
fixtures_failed: 0,
175-
rule_metrics: vec![EvalRuleMetrics {
176-
rule_id: "sec.sql.injection".to_string(),
177-
expected: 1,
178-
predicted: 1,
179-
true_positives: 1,
180-
false_positives: 0,
181-
false_negatives: 0,
182-
precision: 1.0,
183-
recall: 1.0,
184-
f1: 1.0,
185-
}],
186-
rule_summary: Some(EvalRuleScoreSummary::default()),
187-
suite_results: vec![],
188-
threshold_failures: vec![],
189-
results: vec![],
190-
};
191-
let options = EvalThresholdOptions {
192-
max_micro_f1_drop: None,
193-
min_micro_f1: None,
194-
min_macro_f1: None,
195-
min_rule_f1: vec![],
196-
max_rule_f1_drop: vec![EvalRuleThreshold {
197-
rule_id: "sec.sql.injection".to_string(),
198-
value: 0.2,
199-
}],
200-
};
201-
202-
let failures = evaluate_eval_thresholds(&current, Some(&baseline), &options);
203-
204-
assert_eq!(failures.len(), 1);
205-
assert!(failures[0].contains("sec.sql.injection"));
206-
assert!(failures[0].contains("exceeded max 0.200"));
207-
}
208-
}
1+
#[path = "evaluation/drops.rs"]
2+
mod drops;
3+
#[path = "evaluation/minimums.rs"]
4+
mod minimums;
5+
#[path = "evaluation/rules.rs"]
6+
mod rules;
7+
#[path = "evaluation/run.rs"]
8+
mod run;
9+
10+
pub(in super::super) use run::evaluate_eval_thresholds;
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
use std::collections::HashMap;
2+
3+
use super::super::super::EvalReport;
4+
use super::super::EvalThresholdOptions;
5+
use super::rules::build_rule_f1_map;
6+
7+
pub(super) fn check_drop_thresholds(
8+
current_micro_f1: f32,
9+
current_by_rule: &HashMap<String, f32>,
10+
baseline: Option<&EvalReport>,
11+
options: &EvalThresholdOptions,
12+
) -> Vec<String> {
13+
let mut failures = Vec::new();
14+
if options.max_micro_f1_drop.is_none() && options.max_rule_f1_drop.is_empty() {
15+
return failures;
16+
}
17+
18+
let Some(baseline) = baseline else {
19+
failures
20+
.push("baseline report is required for drop-based thresholds (--baseline)".to_string());
21+
return failures;
22+
};
23+
24+
let baseline_summary = baseline.rule_summary.unwrap_or_default();
25+
if let Some(max_drop) = options.max_micro_f1_drop {
26+
let max_drop = max_drop.clamp(0.0, 1.0);
27+
let drop = (baseline_summary.micro_f1 - current_micro_f1).max(0.0);
28+
if drop > max_drop {
29+
failures.push(format!(
30+
"micro-F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
31+
drop, max_drop, baseline_summary.micro_f1, current_micro_f1
32+
));
33+
}
34+
}
35+
36+
if !options.max_rule_f1_drop.is_empty() {
37+
let baseline_by_rule = build_rule_f1_map(&baseline.rule_metrics);
38+
for threshold in &options.max_rule_f1_drop {
39+
let baseline_f1 = baseline_by_rule
40+
.get(&threshold.rule_id)
41+
.copied()
42+
.unwrap_or(0.0);
43+
let current_f1 = current_by_rule
44+
.get(&threshold.rule_id)
45+
.copied()
46+
.unwrap_or(0.0);
47+
let drop = (baseline_f1 - current_f1).max(0.0);
48+
if drop > threshold.value {
49+
failures.push(format!(
50+
"rule '{}' F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})",
51+
threshold.rule_id, drop, threshold.value, baseline_f1, current_f1
52+
));
53+
}
54+
}
55+
}
56+
57+
failures
58+
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
use std::collections::HashMap;
2+
3+
use super::super::EvalThresholdOptions;
4+
5+
/// Checks the absolute (minimum-score) thresholds of `options` against the current run.
///
/// Returns one human-readable message per violated threshold; an empty vector means
/// every configured minimum was met (or none was configured).
///
/// * `current_micro_f1` / `current_macro_f1` - aggregate scores of the run under evaluation.
/// * `current_by_rule` - per-rule F1 of the run under evaluation. Presumably keyed by
///   ASCII-lowercased rule id, as `build_rule_f1_map` produces (confirm against caller).
/// * `options` - threshold configuration; only `min_micro_f1`, `min_macro_f1` and
///   `min_rule_f1` are read here.
///
/// All minimums are clamped to `[0, 1]` before comparison. A rule missing from
/// `current_by_rule` is treated as having an F1 of `0.0`.
pub(super) fn check_minimum_thresholds(
    current_micro_f1: f32,
    current_macro_f1: f32,
    current_by_rule: &HashMap<String, f32>,
    options: &EvalThresholdOptions,
) -> Vec<String> {
    let mut failures = Vec::new();

    if let Some(threshold) = options.min_micro_f1 {
        let threshold = threshold.clamp(0.0, 1.0);
        if current_micro_f1 < threshold {
            failures.push(format!(
                "micro-F1 {:.3} is below minimum {:.3}",
                current_micro_f1, threshold
            ));
        }
    }

    if let Some(threshold) = options.min_macro_f1 {
        let threshold = threshold.clamp(0.0, 1.0);
        if current_macro_f1 < threshold {
            failures.push(format!(
                "macro-F1 {:.3} is below minimum {:.3}",
                current_macro_f1, threshold
            ));
        }
    }

    for threshold in &options.min_rule_f1 {
        // The F1 map is keyed by lowercased rule id, so normalise the configured id the
        // same way; otherwise a mixed-case id misses the map and is reported as F1 0.000.
        let rule_key = threshold.rule_id.to_ascii_lowercase();
        // Same [0, 1] clamp as the aggregate minimums above.
        let minimum = threshold.value.clamp(0.0, 1.0);
        let current = current_by_rule.get(&rule_key).copied().unwrap_or(0.0);
        if current < minimum {
            // Report the rule id exactly as the user configured it.
            failures.push(format!(
                "rule '{}' F1 {:.3} is below minimum {:.3}",
                threshold.rule_id, current, minimum
            ));
        }
    }

    failures
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
use std::collections::HashMap;
2+
3+
use super::super::super::EvalRuleMetrics;
4+
5+
/// Indexes per-rule F1 scores by ASCII-lowercased rule id.
///
/// If several entries share the same lowercased id, the one appearing last in
/// `metrics` wins.
pub(super) fn build_rule_f1_map(metrics: &[EvalRuleMetrics]) -> HashMap<String, f32> {
    metrics
        .iter()
        .map(|entry| (entry.rule_id.to_ascii_lowercase(), entry.f1))
        .collect()
}

0 commit comments

Comments
 (0)