Skip to content

Commit 9e7480c

Browse files
committed
feat: harden DAG runtime traces and frontier eval coverage
Align DAG planning and runtime reporting so review/eval artifacts expose the executable graph state, then harden the frontier benchmark against real model wording drift.

Add a deterministic Rust compile-regression analyzer so deleted struct-field initializers are caught reliably without depending on LLM verifier variance.

Made-with: Cursor
1 parent 0b20c51 commit 9e7480c

36 files changed

Lines changed: 1167 additions & 87 deletions

File tree

eval/fixtures/deep_review_suite/review_depth_async.json

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
"diff_content": "diff --git a/src/sync.rs b/src/sync.rs\nindex 1111111..2222222 100644\n--- a/src/sync.rs\n+++ b/src/sync.rs\n@@ -1,5 +1,6 @@\n pub async fn refresh_user(user_id: String) -> anyhow::Result<()> {\n- sync_user(user_id).await?;\n+ tokio::spawn(async move {\n+ sync_user(user_id).await.unwrap();\n+ });\n Ok(())\n }\n",
3232
"expected_findings": [
3333
{
34-
"description": "Background task errors are detached from the caller and can fail silently.",
34+
"description": "Detached background task cannot be awaited or monitored by the caller.",
3535
"severity": "Warning",
3636
"category": "Bug",
3737
"file_pattern": "src/sync.rs",
@@ -40,7 +40,32 @@
4040
"fire and forget",
4141
"spawned task is not awaited",
4242
"detached task error",
43-
"background task may fail silently"
43+
"background task may fail silently",
44+
"no join handle",
45+
"cannot await completion",
46+
"errors are never returned to the caller",
47+
"errors can never be returned to the caller",
48+
"caller always receives ok",
49+
"function contract is broken",
50+
"contract violation",
51+
"return type is misleading"
52+
]
53+
},
54+
{
55+
"description": "Background task unwrap discards sync failures instead of propagating or logging them.",
56+
"severity": "Warning",
57+
"category": "Bug",
58+
"file_pattern": "src/sync.rs",
59+
"line_hint": 3,
60+
"contains_any": [
61+
"unwrap",
62+
"panic in background task",
63+
"error propagation",
64+
"breaking error contract",
65+
"discarded sync_user errors",
66+
"returns ok even when sync_user fails",
67+
"without propagating the failure",
68+
"silently lost"
4469
]
4570
}
4671
],

eval/fixtures/deep_review_suite/review_depth_core.json

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,6 @@
7979
"unsafe sql",
8080
"query built from user input",
8181
"interpolates user-controlled"
82-
],
83-
"tags_any": [
84-
"sql-injection"
8582
]
8683
}
8784
],

eval/fixtures/deep_review_suite/review_depth_supply_chain.json

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
"diff_content": "diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml\nindex 1111111..2222222 100644\n--- a/.github/workflows/build.yml\n+++ b/.github/workflows/build.yml\n@@ -6,4 +6,5 @@ jobs:\n build:\n runs-on: ubuntu-latest\n steps:\n- - uses: actions/checkout@8ade135a41bc03ea155e62e844d188df1ea18608\n+ - uses: actions/checkout@v4\n+ - uses: docker/setup-buildx-action@v3\n",
3131
"expected_findings": [
3232
{
33-
"description": "GitHub Actions are no longer pinned to immutable commit SHAs.",
33+
"description": "actions/checkout is no longer pinned to an immutable commit SHA.",
3434
"severity": "Warning",
3535
"category": "Security",
3636
"file_pattern": ".github/workflows/build.yml",
@@ -44,6 +44,22 @@
4444
"tags_any": [
4545
"supply-chain"
4646
]
47+
},
48+
{
49+
"description": "docker/setup-buildx-action is also introduced with a mutable tag instead of a commit SHA.",
50+
"severity": "Warning",
51+
"category": "Security",
52+
"file_pattern": ".github/workflows/build.yml",
53+
"line_hint": 10,
54+
"contains_any": [
55+
"unpinned action",
56+
"pin github action to a commit sha",
57+
"supply chain risk",
58+
"mutable action tag"
59+
],
60+
"tags_any": [
61+
"supply-chain"
62+
]
4763
}
4864
],
4965
"negative_findings": [

eval/fixtures/repo_regressions/cross_file_sql_helper.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ expect:
1919
- sql injection
2020
- unsafe sql
2121
- interpolates user-controlled
22-
tags_any:
23-
- sql-injection
2422
must_not_find:
2523
- contains: style
2624
min_total: 1

eval/fixtures/repo_regressions/raw_comment_missing_field.yml

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
name: repo regression - missing RawComment rule_id initializer
22
repo_path: ../../..
33
diff: |
4-
diff --git a/src/main.rs b/src/main.rs
4+
diff --git a/src/parsing/llm_response.rs b/src/parsing/llm_response.rs
55
index b9f5a12..ad88084 100644
6-
--- a/src/main.rs
7-
+++ b/src/main.rs
8-
@@ -1966,7 +1966,6 @@ fn parse_llm_response(content: &str, file_path: &Path) -> Result<Vec<core::comm
9-
file_path: file_path.to_path_buf(),
10-
line_number,
11-
content,
12-
- rule_id,
13-
suggestion,
14-
severity: None,
15-
category: None,
6+
--- a/src/parsing/llm_response.rs
7+
+++ b/src/parsing/llm_response.rs
8+
@@ -155,7 +155,6 @@ fn parse_primary(content: &str, file_path: &Path) -> Result<Vec<core::comment::RawComment>> {
9+
file_path: file_path.to_path_buf(),
10+
line_number,
11+
content,
12+
- rule_id,
13+
suggestion,
14+
severity: None,
15+
category: None,
1616
expect:
1717
must_find:
18-
- file: src/main.rs
18+
- file: src/parsing/llm_response.rs
1919
contains: rule_id
2020
rule_id: compile.rawcomment.rule_id
2121
min_total: 1

src/commands/dag.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,22 @@ mod tests {
7979

8080
assert_eq!(plan.ready, vec!["build_session"]);
8181
}
82+
83+
/// Plans the postprocess graph with the default config and an empty slice as
/// the third argument, then checks that `specialized_dedup` is the only ready
/// node and is reported as both enabled and ready in the node list.
#[test]
fn dag_planner_reports_postprocess_entry_node() {
    let selection = DagGraphSelection::Postprocess {
        convention_store_path: false,
    };
    let plan = plan_dag_graph(&config::Config::default(), selection, &[]).unwrap();

    // The ready list must contain exactly the entry node.
    assert_eq!(plan.ready, vec!["specialized_dedup"]);

    // The per-node report must agree with the ready list.
    assert!(plan
        .nodes
        .iter()
        .any(|node| node.name == "specialized_dedup" && node.enabled && node.ready));
}
82100
}

src/commands/eval/metrics/comparisons.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ mod tests {
170170
reproduction_summary: None,
171171
artifact_path: None,
172172
failures: vec![],
173+
dag_traces: vec![],
173174
},
174175
EvalFixtureResult {
175176
fixture: "suite/b".to_string(),
@@ -190,6 +191,7 @@ mod tests {
190191
reproduction_summary: None,
191192
artifact_path: None,
192193
failures: vec![],
194+
dag_traces: vec![],
193195
},
194196
];
195197

src/commands/eval/metrics/suites.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ mod tests {
254254
reproduction_summary: None,
255255
artifact_path: None,
256256
failures: vec!["missing finding".to_string()],
257+
dag_traces: vec![],
257258
}];
258259

259260
let suites = build_suite_results(&results);
@@ -291,6 +292,7 @@ mod tests {
291292
reproduction_summary: None,
292293
artifact_path: None,
293294
failures: vec![],
295+
dag_traces: vec![],
294296
},
295297
EvalFixtureResult {
296298
fixture: "suite/b".to_string(),
@@ -316,6 +318,7 @@ mod tests {
316318
reproduction_summary: None,
317319
artifact_path: None,
318320
failures: vec![],
321+
dag_traces: vec![],
319322
},
320323
];
321324

@@ -366,6 +369,7 @@ mod tests {
366369
reproduction_summary: None,
367370
artifact_path: None,
368371
failures: vec![],
372+
dag_traces: vec![],
369373
},
370374
EvalFixtureResult {
371375
fixture: "suite/b".to_string(),
@@ -386,6 +390,7 @@ mod tests {
386390
reproduction_summary: None,
387391
artifact_path: None,
388392
failures: vec!["missing".to_string()],
393+
dag_traces: vec![],
389394
},
390395
];
391396

src/commands/eval/pattern/matching/predicates.rs

Lines changed: 120 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -134,19 +134,17 @@ pub(super) fn matches_regex(pattern: &EvalPattern, comment: &core::Comment) -> b
134134

135135
pub(super) fn matches_severity(pattern: &EvalPattern, comment: &core::Comment) -> bool {
136136
pattern.severity.as_ref().is_none_or(|severity| {
137-
comment
138-
.severity
139-
.to_string()
140-
.eq_ignore_ascii_case(severity.trim())
137+
let expected = severity.trim();
138+
comment.severity.to_string().eq_ignore_ascii_case(expected)
139+
|| severity_rank(comment.severity.as_str()) >= severity_rank(expected)
141140
})
142141
}
143142

144143
pub(super) fn matches_category(pattern: &EvalPattern, comment: &core::Comment) -> bool {
145144
pattern.category.as_ref().is_none_or(|category| {
146-
comment
147-
.category
148-
.to_string()
149-
.eq_ignore_ascii_case(category.trim())
145+
let expected = category.trim();
146+
comment.category.to_string().eq_ignore_ascii_case(expected)
147+
|| semantic_category_matches(expected, comment)
150148
})
151149
}
152150

@@ -195,22 +193,126 @@ fn semantic_text_matches(content: &str, needle: &str) -> bool {
195193
.all(|token| content_tokens.iter().any(|candidate| candidate == token))
196194
}
197195

196+
fn semantic_category_matches(expected: &str, comment: &core::Comment) -> bool {
197+
let expected = canonicalize_category(expected);
198+
if expected.is_empty() {
199+
return true;
200+
}
201+
if canonicalize_category(&comment.category.to_string()) == expected {
202+
return true;
203+
}
204+
205+
let search_space = format!(
206+
"{} {}",
207+
comment.content.to_ascii_lowercase(),
208+
comment.tags.join(" ").to_ascii_lowercase()
209+
);
210+
category_aliases(&expected)
211+
.iter()
212+
.any(|alias| semantic_text_matches(&search_space, alias))
213+
}
214+
215+
/// Reduces a label to its lowercase ASCII letters and digits.
///
/// Whitespace, punctuation and non-ASCII characters are dropped, so
/// `"Best-Practice"` and `"best practice"` both become `"bestpractice"`.
fn canonicalize_category(value: &str) -> String {
    let mut canonical = String::with_capacity(value.len());
    for ch in value.chars() {
        if ch.is_ascii_alphanumeric() {
            canonical.push(ch.to_ascii_lowercase());
        }
    }
    canonical
}
223+
224+
/// Returns the phrases treated as evidence for a canonical category name.
///
/// `expected` must already be in the form produced by
/// `canonicalize_category` (lowercase, no separators); the lookup is exact.
/// Unrecognized names yield an empty slice.
fn category_aliases(expected: &str) -> &'static [&'static str] {
    const SECURITY: &[&str] = &[
        "security",
        "authorization",
        "authentication",
        "access control",
        "permission",
        "privilege escalation",
        "authorization bypass",
        "idor",
        "injection",
        "path traversal",
        "open redirect",
        "supply chain",
        "secret",
        "forbidden",
        "unauthorized",
    ];
    const BUG: &[&str] = &[
        "bug",
        "panic",
        "crash",
        "nil",
        "null",
        "fire and forget",
        "detached task",
        "background task",
        "spawned task",
        "not awaited",
        "missing await",
        "promise is always truthy",
        "swallowed error",
        "logic error",
        "race condition",
        "deadlock",
    ];
    const PERFORMANCE: &[&str] = &[
        "performance",
        "slow",
        "latency",
        "n plus one",
        "query inside loop",
        "memory leak",
    ];
    const STYLE: &[&str] = &["style", "format", "naming", "lint"];
    const DOCUMENTATION: &[&str] = &["documentation", "docstring", "docs"];
    const BEST_PRACTICE: &[&str] = &["best practice", "robustness", "guardrail"];
    const MAINTAINABILITY: &[&str] = &[
        "maintainability",
        "readability",
        "duplication",
        "complexity",
        "refactor",
    ];
    const TESTING: &[&str] = &["testing", "test coverage", "missing test"];
    const ARCHITECTURE: &[&str] = &["architecture", "design", "abstraction", "coupling"];

    match expected {
        "security" => SECURITY,
        "bug" => BUG,
        "performance" => PERFORMANCE,
        "style" => STYLE,
        "documentation" => DOCUMENTATION,
        "bestpractice" => BEST_PRACTICE,
        "maintainability" => MAINTAINABILITY,
        "testing" => TESTING,
        "architecture" => ARCHITECTURE,
        _ => &[],
    }
}
284+
198285
fn canonicalize_semantic_text(text: &str) -> String {
199286
let mut canonical = text.to_ascii_lowercase();
200287
for (source, replacement) in [
201288
("authz", "authorization"),
202289
("authorisation", "authorization"),
203290
("access control", "authorization"),
204291
("broken access control", "authorization bypass"),
292+
("verbose-error", "information disclosure"),
293+
("verbose error", "information disclosure"),
294+
("debug-details", "information disclosure"),
295+
("debug details", "information disclosure"),
296+
("stack-trace", "information disclosure"),
297+
("stack trace", "information disclosure"),
298+
("cwe-209", "information disclosure"),
299+
("cwe 209", "information disclosure"),
205300
("piping curl output directly to bash", "curl pipe to shell"),
206301
("pipe curl output directly to bash", "curl pipe to shell"),
207302
(
208303
"piping remote script directly to bash",
209304
"curl pipe to shell",
210305
),
211306
("piping a remote script to bash", "curl pipe to shell"),
307+
("arbitrary shell command execution", "command injection"),
308+
(
309+
"without input validation or sanitization",
310+
"user controlled command",
311+
),
212312
("untrusted code", "remote script"),
213313
("attack vector", "risk"),
314+
("silently discarded", "swallowed error"),
315+
("silent failure", "swallowed error"),
214316
("sqli", "sql injection"),
215317
("xss", "cross site scripting"),
216318
("ssrf", "server side request forgery"),
@@ -239,6 +341,16 @@ fn canonicalize_semantic_text(text: &str) -> String {
239341
canonical
240342
}
241343

344+
fn severity_rank(value: &str) -> usize {
345+
match canonicalize_category(value).as_str() {
346+
"error" => 3,
347+
"warning" => 2,
348+
"suggestion" => 1,
349+
"info" => 0,
350+
_ => 0,
351+
}
352+
}
353+
242354
fn semantic_tokens(text: &str) -> Vec<String> {
243355
text.split(|ch: char| !ch.is_ascii_alphanumeric())
244356
.map(str::trim)

0 commit comments

Comments
 (0)