|
1 | | -use std::collections::HashMap; |
2 | | - |
3 | | -use super::super::{EvalReport, EvalRuleMetrics}; |
4 | | -use super::EvalThresholdOptions; |
5 | | - |
6 | | -pub(in super::super) fn evaluate_eval_thresholds( |
7 | | - current: &EvalReport, |
8 | | - baseline: Option<&EvalReport>, |
9 | | - options: &EvalThresholdOptions, |
10 | | -) -> Vec<String> { |
11 | | - let mut failures = Vec::new(); |
12 | | - let current_micro_f1 = current |
13 | | - .rule_summary |
14 | | - .map(|summary| summary.micro_f1) |
15 | | - .unwrap_or(0.0); |
16 | | - let current_macro_f1 = current |
17 | | - .rule_summary |
18 | | - .map(|summary| summary.macro_f1) |
19 | | - .unwrap_or(0.0); |
20 | | - |
21 | | - if let Some(threshold) = options.min_micro_f1 { |
22 | | - let threshold = threshold.clamp(0.0, 1.0); |
23 | | - if current_micro_f1 < threshold { |
24 | | - failures.push(format!( |
25 | | - "micro-F1 {:.3} is below minimum {:.3}", |
26 | | - current_micro_f1, threshold |
27 | | - )); |
28 | | - } |
29 | | - } |
30 | | - |
31 | | - if let Some(threshold) = options.min_macro_f1 { |
32 | | - let threshold = threshold.clamp(0.0, 1.0); |
33 | | - if current_macro_f1 < threshold { |
34 | | - failures.push(format!( |
35 | | - "macro-F1 {:.3} is below minimum {:.3}", |
36 | | - current_macro_f1, threshold |
37 | | - )); |
38 | | - } |
39 | | - } |
40 | | - |
41 | | - let current_by_rule = build_rule_f1_map(&current.rule_metrics); |
42 | | - for threshold in &options.min_rule_f1 { |
43 | | - let current = current_by_rule |
44 | | - .get(&threshold.rule_id) |
45 | | - .copied() |
46 | | - .unwrap_or(0.0); |
47 | | - if current < threshold.value { |
48 | | - failures.push(format!( |
49 | | - "rule '{}' F1 {:.3} is below minimum {:.3}", |
50 | | - threshold.rule_id, current, threshold.value |
51 | | - )); |
52 | | - } |
53 | | - } |
54 | | - |
55 | | - if options.max_micro_f1_drop.is_some() || !options.max_rule_f1_drop.is_empty() { |
56 | | - let Some(baseline) = baseline else { |
57 | | - failures.push( |
58 | | - "baseline report is required for drop-based thresholds (--baseline)".to_string(), |
59 | | - ); |
60 | | - return failures; |
61 | | - }; |
62 | | - |
63 | | - let baseline_summary = baseline.rule_summary.unwrap_or_default(); |
64 | | - if let Some(max_drop) = options.max_micro_f1_drop { |
65 | | - let max_drop = max_drop.clamp(0.0, 1.0); |
66 | | - let drop = (baseline_summary.micro_f1 - current_micro_f1).max(0.0); |
67 | | - if drop > max_drop { |
68 | | - failures.push(format!( |
69 | | - "micro-F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})", |
70 | | - drop, max_drop, baseline_summary.micro_f1, current_micro_f1 |
71 | | - )); |
72 | | - } |
73 | | - } |
74 | | - |
75 | | - if !options.max_rule_f1_drop.is_empty() { |
76 | | - let baseline_by_rule = build_rule_f1_map(&baseline.rule_metrics); |
77 | | - for threshold in &options.max_rule_f1_drop { |
78 | | - let baseline_f1 = baseline_by_rule |
79 | | - .get(&threshold.rule_id) |
80 | | - .copied() |
81 | | - .unwrap_or(0.0); |
82 | | - let current_f1 = current_by_rule |
83 | | - .get(&threshold.rule_id) |
84 | | - .copied() |
85 | | - .unwrap_or(0.0); |
86 | | - let drop = (baseline_f1 - current_f1).max(0.0); |
87 | | - if drop > threshold.value { |
88 | | - failures.push(format!( |
89 | | - "rule '{}' F1 drop {:.3} exceeded max {:.3} (baseline {:.3} -> current {:.3})", |
90 | | - threshold.rule_id, drop, threshold.value, baseline_f1, current_f1 |
91 | | - )); |
92 | | - } |
93 | | - } |
94 | | - } |
95 | | - } |
96 | | - |
97 | | - failures |
98 | | -} |
99 | | - |
100 | | -fn build_rule_f1_map(metrics: &[EvalRuleMetrics]) -> HashMap<String, f32> { |
101 | | - let mut by_rule = HashMap::new(); |
102 | | - for metric in metrics { |
103 | | - by_rule.insert(metric.rule_id.to_ascii_lowercase(), metric.f1); |
104 | | - } |
105 | | - by_rule |
106 | | -} |
107 | | - |
108 | | -#[cfg(test)] |
109 | | -mod tests { |
110 | | - use super::super::super::{EvalReport, EvalRuleMetrics, EvalRuleScoreSummary}; |
111 | | - use super::*; |
112 | | - use crate::commands::eval::thresholds::EvalRuleThreshold; |
113 | | - |
114 | | - #[test] |
115 | | - fn test_evaluate_eval_thresholds_requires_baseline_for_drop_checks() { |
116 | | - let report = EvalReport { |
117 | | - fixtures_total: 1, |
118 | | - fixtures_passed: 1, |
119 | | - fixtures_failed: 0, |
120 | | - rule_metrics: vec![], |
121 | | - rule_summary: Some(EvalRuleScoreSummary { |
122 | | - micro_precision: 1.0, |
123 | | - micro_recall: 1.0, |
124 | | - micro_f1: 1.0, |
125 | | - macro_precision: 1.0, |
126 | | - macro_recall: 1.0, |
127 | | - macro_f1: 1.0, |
128 | | - }), |
129 | | - suite_results: vec![], |
130 | | - threshold_failures: vec![], |
131 | | - results: vec![], |
132 | | - }; |
133 | | - let options = EvalThresholdOptions { |
134 | | - max_micro_f1_drop: Some(0.05), |
135 | | - min_micro_f1: None, |
136 | | - min_macro_f1: None, |
137 | | - min_rule_f1: vec![], |
138 | | - max_rule_f1_drop: vec![], |
139 | | - }; |
140 | | - |
141 | | - let failures = evaluate_eval_thresholds(&report, None, &options); |
142 | | - |
143 | | - assert_eq!( |
144 | | - failures, |
145 | | - vec!["baseline report is required for drop-based thresholds (--baseline)".to_string()] |
146 | | - ); |
147 | | - } |
148 | | - |
149 | | - #[test] |
150 | | - fn test_evaluate_eval_thresholds_checks_rule_specific_drop() { |
151 | | - let current = EvalReport { |
152 | | - fixtures_total: 1, |
153 | | - fixtures_passed: 1, |
154 | | - fixtures_failed: 0, |
155 | | - rule_metrics: vec![EvalRuleMetrics { |
156 | | - rule_id: "sec.sql.injection".to_string(), |
157 | | - expected: 1, |
158 | | - predicted: 1, |
159 | | - true_positives: 0, |
160 | | - false_positives: 1, |
161 | | - false_negatives: 1, |
162 | | - precision: 0.0, |
163 | | - recall: 0.0, |
164 | | - f1: 0.0, |
165 | | - }], |
166 | | - rule_summary: Some(EvalRuleScoreSummary::default()), |
167 | | - suite_results: vec![], |
168 | | - threshold_failures: vec![], |
169 | | - results: vec![], |
170 | | - }; |
171 | | - let baseline = EvalReport { |
172 | | - fixtures_total: 1, |
173 | | - fixtures_passed: 1, |
174 | | - fixtures_failed: 0, |
175 | | - rule_metrics: vec![EvalRuleMetrics { |
176 | | - rule_id: "sec.sql.injection".to_string(), |
177 | | - expected: 1, |
178 | | - predicted: 1, |
179 | | - true_positives: 1, |
180 | | - false_positives: 0, |
181 | | - false_negatives: 0, |
182 | | - precision: 1.0, |
183 | | - recall: 1.0, |
184 | | - f1: 1.0, |
185 | | - }], |
186 | | - rule_summary: Some(EvalRuleScoreSummary::default()), |
187 | | - suite_results: vec![], |
188 | | - threshold_failures: vec![], |
189 | | - results: vec![], |
190 | | - }; |
191 | | - let options = EvalThresholdOptions { |
192 | | - max_micro_f1_drop: None, |
193 | | - min_micro_f1: None, |
194 | | - min_macro_f1: None, |
195 | | - min_rule_f1: vec![], |
196 | | - max_rule_f1_drop: vec![EvalRuleThreshold { |
197 | | - rule_id: "sec.sql.injection".to_string(), |
198 | | - value: 0.2, |
199 | | - }], |
200 | | - }; |
201 | | - |
202 | | - let failures = evaluate_eval_thresholds(&current, Some(&baseline), &options); |
203 | | - |
204 | | - assert_eq!(failures.len(), 1); |
205 | | - assert!(failures[0].contains("sec.sql.injection")); |
206 | | - assert!(failures[0].contains("exceeded max 0.200")); |
207 | | - } |
208 | | -} |
| 1 | +#[path = "evaluation/drops.rs"] |
| 2 | +mod drops; |
| 3 | +#[path = "evaluation/minimums.rs"] |
| 4 | +mod minimums; |
| 5 | +#[path = "evaluation/rules.rs"] |
| 6 | +mod rules; |
| 7 | +#[path = "evaluation/run.rs"] |
| 8 | +mod run; |
| 9 | + |
| 10 | +pub(in super::super) use run::evaluate_eval_thresholds; |
0 commit comments