Skip to content

Commit a741dbd

Browse files
Parser: fix exponential parse time on speculative NOT prefix parsing
1 parent 182eae8 commit a741dbd

3 files changed

Lines changed: 80 additions & 1 deletion

File tree

sqlparser_bench/benches/sqlparser_bench.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,36 @@ fn parse_compound_chain(c: &mut Criterion) {
177177
group.finish();
178178
}
179179

180+
/// Benchmark parsing pathological chains of `<ident>-NOT-<ident>.` segments
181+
/// that previously triggered 2^N speculative `parse_not` work. The input
182+
/// `SELECT x-not-b.x-not-b...` rejects at the trailing `.`. Each `-NOT-`
183+
/// segment used to recurse into `parse_not` which re-walks the rest of the
184+
/// chain, doubling the work at every level. Post-fix the cost is linear in
185+
/// the number of segments.
186+
fn parse_not_chain(c: &mut Criterion) {
187+
let mut group = c.benchmark_group("parse_not_chain");
188+
let dialect = GenericDialect {};
189+
190+
for &n in &[10usize, 20, 30] {
191+
let body: String = std::iter::repeat_n("x-not-b.", n).collect();
192+
let sql = format!("SELECT {body}");
193+
194+
group.bench_function(format!("chain_{n}"), |b| {
195+
b.iter(|| {
196+
let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql));
197+
});
198+
});
199+
}
200+
201+
group.finish();
202+
}
203+
180204
criterion_group!(
181205
benches,
182206
basic_queries,
183207
word_to_ident,
184208
parse_many_identifiers,
185-
parse_compound_chain
209+
parse_compound_chain,
210+
parse_not_chain
186211
);
187212
criterion_main!(benches);

src/parser/mod.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
//! SQL Parser
1414

15+
#[cfg(not(feature = "std"))]
16+
use alloc::collections::BTreeSet;
1517
#[cfg(not(feature = "std"))]
1618
use alloc::{
1719
boxed::Box,
@@ -25,6 +27,8 @@ use core::{
2527
str::FromStr,
2628
};
2729
use helpers::attached_token::AttachedToken;
30+
#[cfg(feature = "std")]
31+
use std::collections::BTreeSet;
2832

2933
use log::debug;
3034

@@ -359,6 +363,14 @@ pub struct Parser<'a> {
359363
options: ParserOptions,
360364
/// Ensures the stack does not overflow by limiting recursion depth.
361365
recursion_counter: RecursionCounter,
366+
/// Token indices where a speculative attempt to parse `NOT` as a unary
367+
/// prefix operator has already failed during this parse. Re-entering
368+
/// `parse_not` at the same position would repeat the same work and fail
369+
/// again, so the second visit short-circuits to identifier interpretation.
370+
/// Without this cache, inputs like `SELECT x-not-b.x-not-b...` (chains of
371+
/// `<ident>-NOT-<ident>.` ending in a parse error) trigger 2^N exploration
372+
/// because each `-NOT-` segment otherwise re-walks the rest of the chain.
373+
failed_unary_not_positions: BTreeSet<usize>,
362374
}
363375

364376
impl<'a> Parser<'a> {
@@ -385,6 +397,7 @@ impl<'a> Parser<'a> {
385397
dialect,
386398
recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH),
387399
options: ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()),
400+
failed_unary_not_positions: BTreeSet::new(),
388401
}
389402
}
390403

@@ -446,6 +459,7 @@ impl<'a> Parser<'a> {
446459
pub fn with_tokens_with_locations(mut self, tokens: Vec<TokenWithSpan>) -> Self {
447460
self.tokens = tokens;
448461
self.index = 0;
462+
self.failed_unary_not_positions.clear();
449463
self
450464
}
451465

@@ -1801,6 +1815,17 @@ impl<'a> Parser<'a> {
18011815
// We first try to parse the word and following tokens as a special expression, and if that fails,
18021816
// we rollback and try to parse it as an identifier.
18031817
let w = w.clone();
1818+
// If we already tried to parse `NOT` as a unary prefix operator
1819+
// at this exact position and it failed, skip the speculative
1820+
// path and fall back to identifier interpretation directly. The
1821+
// speculative `parse_not` re-walks the rest of the expression,
1822+
// so repeating it on every visit causes 2^N work on chains
1823+
// like `x-not-b.x-not-b...` that end in a parse error.
1824+
if w.keyword == Keyword::NOT
1825+
&& self.failed_unary_not_positions.contains(&self.index)
1826+
{
1827+
return self.parse_expr_prefix_by_unreserved_word(&w, span);
1828+
}
18041829
match self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span)) {
18051830
// This word indicated an expression prefix and parsing was successful
18061831
Ok(Some(expr)) => Ok(expr),
@@ -1815,6 +1840,9 @@ impl<'a> Parser<'a> {
18151840
// we rollback and return the parsing error we got from trying to parse a
18161841
// special expression (to maintain backwards compatibility of parsing errors).
18171842
Err(e) => {
1843+
if w.keyword == Keyword::NOT {
1844+
self.failed_unary_not_positions.insert(self.index);
1845+
}
18181846
if !self.dialect.is_reserved_for_identifier(w.keyword) {
18191847
if let Ok(Some(expr)) = self.maybe_parse(|parser| {
18201848
parser.parse_expr_prefix_by_unreserved_word(&w, span)

tests/sqlparser_common.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19004,3 +19004,29 @@ fn parse_compound_chain_no_exponential_blowup() {
1900419004
rx.recv_timeout(Duration::from_secs(5))
1900519005
.expect("parser should reject this quickly, not loop exponentially");
1900619006
}
19007+
19008+
/// Regression test for the 2^N parse-time blowup on chains of
19009+
/// `<ident>-NOT-<ident>.` segments ending in a parse error. Each `-NOT-`
19010+
/// segment used to trigger a speculative `parse_not` that re-walks the rest of
19011+
/// the chain, so re-entering it at every `NOT` doubled the work. The fix
19012+
/// caches positions where the speculative `parse_not` already failed and
19013+
/// skips repeating it. Post-fix the parser rejects this in well under a
19014+
/// millisecond, so the timeout is a hang guard, not a perf threshold.
19015+
#[test]
19016+
fn parse_not_chain_no_exponential_blowup() {
19017+
use std::sync::mpsc;
19018+
use std::thread;
19019+
use std::time::Duration;
19020+
19021+
let body: String = std::iter::repeat_n("x-not-b.", 30).collect();
19022+
let sql = format!("SELECT {body}");
19023+
19024+
let (tx, rx) = mpsc::channel();
19025+
thread::spawn(move || {
19026+
let _ = Parser::parse_sql(&GenericDialect {}, &sql);
19027+
let _ = tx.send(());
19028+
});
19029+
19030+
rx.recv_timeout(Duration::from_secs(5))
19031+
.expect("parser should reject this quickly, not loop exponentially");
19032+
}

0 commit comments

Comments
 (0)