Skip to content

Commit 0cade0a

Browse files
committed
Add support for tree-sitter-style corpus tests
This adds tests consisting of source code and a printout of its rewritten AST.
1 parent 36554d1 commit 0cade0a

6 files changed

Lines changed: 224 additions & 9 deletions

File tree

unified/AGENTS.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,15 @@ This is a CodeQL extractor based on tree-sitter.
55
## Building
66
To build the extractor, run `scripts/create-extractor-pack.sh`
77

8-
## Testing
9-
- If you changed the extractor code, always rebuild it before running tests.
8+
## Extractor Testing
9+
- To run extractor tests, run `cargo test` in the `extractor` directory.
1010

11-
- To run all tests, run `codeql test run --search-path extractor-pack ql/test`
11+
- Do not edit the printed ASTs in `extractor/tests/corpus` directly. To regenerate the ASTs, run the tests with the environment variable `YEAST_UPDATE_CORPUS=1`.
12+
13+
## CodeQL Testing
14+
- If you changed the extractor code, always rebuild it before running CodeQL tests.
15+
16+
- To run all CodeQL tests, run `codeql test run --search-path extractor-pack ql/test`
1217

1318
- Do not edit `.expected` files manually. To update the expected output, pass `--learn` to the `codeql test run` command.
1419

unified/extractor/src/extractor.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,7 @@ use std::path::PathBuf;
33

44
use codeql_extractor::extractor::simple;
55
use codeql_extractor::trap;
6-
7-
#[path = "languages/swift/swift.rs"]
8-
mod swift;
6+
use crate::languages;
97

108
#[derive(Args)]
119
pub struct Options {
@@ -27,9 +25,7 @@ pub fn run(options: Options) -> std::io::Result<()> {
2725

2826
let extractor = simple::Extractor {
2927
prefix: "unified".to_string(),
30-
languages: vec![
31-
swift::language_spec(),
32-
],
28+
languages: languages::all_language_specs(),
3329
trap_dir: options.output_dir,
3430
trap_compression: trap::Compression::from_env("CODEQL_EXTRACTOR_UNIFIED_OPTION_TRAP_COMPRESSION"),
3531
source_archive_dir: options.source_archive_dir,
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
use codeql_extractor::extractor::simple;
2+
3+
#[path = "swift/swift.rs"]
4+
mod swift;
5+
6+
pub fn all_language_specs() -> Vec<simple::LanguageSpec> {
7+
vec![swift::language_spec()]
8+
}

unified/extractor/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use clap::Parser;
33
mod autobuilder;
44
mod extractor;
55
mod generator;
6+
mod languages;
67

78
#[derive(Parser)]
89
#[command(author, version, about)]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
===
2+
Additive expression is desugared
3+
===
4+
5+
1 + 2
6+
7+
---
8+
9+
source_file
10+
simple_identifier "blah"
11+
12+
13+
===
14+
Another additive expression is desugared
15+
===
16+
17+
foo + bar
18+
19+
---
20+
21+
source_file
22+
simple_identifier "blah"
23+
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
use std::fs;
2+
use std::path::Path;
3+
4+
use codeql_extractor::extractor::simple;
5+
use yeast::{dump::dump_ast, Runner};
6+
7+
#[path = "../src/languages/mod.rs"]
8+
mod languages;
9+
10+
/// One test case read from a corpus file.
#[derive(Debug)]
struct CorpusCase {
    /// Title found between the two `===` header rules.
    name: String,
    /// Source text found between the header and the `---` separator.
    input: String,
    /// AST printout found after the `---` separator.
    expected: String,
}
16+
17+
/// Reports whether the `YEAST_UPDATE_CORPUS` environment variable asks for
/// the expected ASTs to be regenerated.
///
/// The value is compared case-insensitively (ASCII) against `1`, `true`,
/// `yes` and `on`. An unset or unreadable variable counts as disabled.
fn update_mode_enabled() -> bool {
    match std::env::var("YEAST_UPDATE_CORPUS") {
        Ok(raw) => {
            let flag = raw.to_ascii_lowercase();
            ["1", "true", "yes", "on"].contains(&flag.as_str())
        }
        Err(_) => false,
    }
}
22+
23+
/// Returns `true` when `line`, ignoring surrounding whitespace, consists of
/// three or more `=` characters and nothing else.
fn is_header_rule(line: &str) -> bool {
    let rule = line.trim();
    if rule.len() < 3 {
        return false;
    }
    // Removing every `=` from both ends leaves nothing only if the whole
    // line is made of `=`.
    rule.trim_matches('=').is_empty()
}
27+
28+
fn parse_corpus(content: &str) -> Vec<CorpusCase> {
29+
let lines: Vec<&str> = content.lines().collect();
30+
let mut i = 0;
31+
let mut cases = Vec::new();
32+
33+
while i < lines.len() {
34+
while i < lines.len() && lines[i].trim().is_empty() {
35+
i += 1;
36+
}
37+
if i >= lines.len() {
38+
break;
39+
}
40+
41+
assert!(
42+
is_header_rule(lines[i]),
43+
"Expected header delimiter at line {}",
44+
i + 1
45+
);
46+
i += 1;
47+
48+
assert!(i < lines.len(), "Missing test name at line {}", i + 1);
49+
let name = lines[i].trim().to_string();
50+
i += 1;
51+
52+
assert!(
53+
i < lines.len() && is_header_rule(lines[i]),
54+
"Missing closing header delimiter for case {name}"
55+
);
56+
i += 1;
57+
58+
let input_start = i;
59+
while i < lines.len() && lines[i].trim() != "---" {
60+
i += 1;
61+
}
62+
assert!(i < lines.len(), "Missing --- separator for case {name}");
63+
let input = lines[input_start..i].join("\n").trim_end().to_string();
64+
i += 1;
65+
66+
let expected_start = i;
67+
while i < lines.len() {
68+
if is_header_rule(lines[i])
69+
&& i + 2 < lines.len()
70+
&& !lines[i + 1].trim().is_empty()
71+
&& is_header_rule(lines[i + 2])
72+
{
73+
break;
74+
}
75+
i += 1;
76+
}
77+
let expected = lines[expected_start..i].join("\n").trim().to_string();
78+
79+
cases.push(CorpusCase {
80+
name,
81+
input,
82+
expected,
83+
});
84+
}
85+
86+
cases
87+
}
88+
89+
fn render_corpus(cases: &[CorpusCase]) -> String {
90+
let mut out = String::new();
91+
92+
for (idx, case) in cases.iter().enumerate() {
93+
if idx > 0 {
94+
out.push('\n');
95+
}
96+
out.push_str("===\n");
97+
out.push_str(case.name.trim());
98+
out.push_str("\n===\n");
99+
out.push('\n');
100+
out.push_str(case.input.trim());
101+
out.push_str("\n\n---\n");
102+
out.push('\n');
103+
out.push_str(case.expected.trim());
104+
out.push_str("\n\n");
105+
}
106+
107+
out
108+
}
109+
110+
fn run_desugaring(lang: &simple::LanguageSpec, input: &str) -> String {
111+
let runner = match lang.desugar.as_ref() {
112+
Some(config) => Runner::from_config(lang.ts_language.clone(), config)
113+
.expect("Failed to create yeast runner from desugaring config"),
114+
None => Runner::new(lang.ts_language.clone(), &[]),
115+
};
116+
let ast = runner
117+
.run(input)
118+
.unwrap_or_else(|e| panic!("Failed to parse corpus input: {e}"));
119+
dump_ast(&ast, ast.get_root(), input)
120+
}
121+
122+
/// Runs every `.txt` corpus file under `tests/corpus/<language prefix>`.
///
/// Each case's input is desugared and the dumped AST is compared with the
/// stored expectation. When update mode is enabled, the expectations are
/// replaced with the actual output and the file is rewritten instead.
#[test]
fn test_corpus() {
    let rewrite_expectations = update_mode_enabled();
    let specs = languages::all_language_specs();
    let corpus_root = Path::new("tests/corpus");

    for lang in specs {
        let lang_dir = corpus_root.join(&lang.prefix);
        if !lang_dir.exists() {
            // No corpus for this language.
            continue;
        }

        let dir_entries = match fs::read_dir(&lang_dir) {
            Ok(entries) => entries,
            Err(e) => panic!(
                "Failed to read corpus directory {}: {e}",
                lang_dir.display()
            ),
        };
        let mut txt_files = Vec::new();
        for entry in dir_entries {
            let path = entry.expect("Failed to read corpus entry").path();
            if path.extension().is_some_and(|ext| ext == "txt") {
                txt_files.push(path);
            }
        }
        // Sorted so the files are processed in a stable order.
        txt_files.sort();

        for file in txt_files {
            let text = match fs::read_to_string(&file) {
                Ok(text) => text,
                Err(e) => panic!("Failed to read {}: {e}", file.display()),
            };
            let mut cases = parse_corpus(&text);
            assert!(
                !cases.is_empty(),
                "No corpus cases found in {}",
                file.display()
            );

            for case in cases.iter_mut() {
                let actual = run_desugaring(&lang, &case.input);
                if !rewrite_expectations {
                    assert_eq!(
                        case.expected.trim(),
                        actual.trim(),
                        "Corpus case failed in {}: {}",
                        file.display(),
                        case.name
                    );
                    continue;
                }
                case.expected = actual.trim().to_string();
            }

            if rewrite_expectations {
                if let Err(e) = fs::write(&file, render_corpus(&cases)) {
                    panic!("Failed to update corpus file {}: {e}", file.display());
                }
            }
        }
    }
}

0 commit comments

Comments
 (0)