Skip to content

Commit 7dcc611

Browse files
Parser: fix exponential parse time on speculative prefix parsing
1 parent 182eae8 commit 7dcc611

3 files changed

Lines changed: 143 additions & 3 deletions

File tree

sqlparser_bench/benches/sqlparser_bench.rs

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use criterion::{criterion_group, criterion_main, Criterion};
19-
use sqlparser::dialect::GenericDialect;
19+
use sqlparser::dialect::{GenericDialect, PostgreSqlDialect, SQLiteDialect};
2020
use sqlparser::keywords::Keyword;
2121
use sqlparser::parser::Parser;
2222
use sqlparser::tokenizer::{Span, Word};
@@ -177,11 +177,58 @@ fn parse_compound_chain(c: &mut Criterion) {
177177
group.finish();
178178
}
179179

180+
/// Benchmark parsing pathological `IF(<keyword-fn>(<keyword-fn>(...x` chains
181+
/// that previously caused 2^N work in `parse_prefix`. Each nested
182+
/// `current_time(` segment used to be explored twice at every level (once via
183+
/// the speculative reserved-word arm, once via the unreserved-word fallback),
184+
/// doubling work per level. Post-fix the cost is linear in chain length.
185+
fn parse_prefix_keyword_call_chain(c: &mut Criterion) {
186+
let mut group = c.benchmark_group("parse_prefix_keyword_call_chain");
187+
let dialect = PostgreSqlDialect {};
188+
189+
for &n in &[10usize, 20, 30] {
190+
let sql = String::from("if(") + &"current_time(".repeat(n) + "x";
191+
192+
group.bench_function(format!("chain_{n}"), |b| {
193+
b.iter(|| {
194+
let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql));
195+
});
196+
});
197+
}
198+
199+
group.finish();
200+
}
201+
202+
/// Benchmark parsing pathological `case-case-case-...c` chains that
203+
/// previously caused 2^N work in `parse_prefix`. Each `case` token used to
204+
/// trigger a speculative `parse_case_expr` that recursively descends the
205+
/// chain, but the unreserved-word fallback returns `Identifier(case)` so the
206+
/// overall `parse_prefix` succeeds and the failure cache never fires.
207+
/// Post-fix the per-arm cache short-circuits the speculative descent.
208+
fn parse_prefix_case_chain(c: &mut Criterion) {
209+
let mut group = c.benchmark_group("parse_prefix_case_chain");
210+
let dialect = SQLiteDialect {};
211+
212+
for &n in &[10usize, 20, 30] {
213+
let sql = "case\t-".repeat(n) + "c";
214+
215+
group.bench_function(format!("chain_{n}"), |b| {
216+
b.iter(|| {
217+
let _ = Parser::parse_sql(&dialect, std::hint::black_box(&sql));
218+
});
219+
});
220+
}
221+
222+
group.finish();
223+
}
224+
180225
criterion_group!(
181226
benches,
182227
basic_queries,
183228
word_to_ident,
184229
parse_many_identifiers,
185-
parse_compound_chain
230+
parse_compound_chain,
231+
parse_prefix_keyword_call_chain,
232+
parse_prefix_case_chain
186233
);
187234
criterion_main!(benches);

src/parser/mod.rs

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#[cfg(not(feature = "std"))]
1616
use alloc::{
1717
boxed::Box,
18+
collections::BTreeMap,
1819
format,
1920
string::{String, ToString},
2021
vec,
@@ -24,6 +25,9 @@ use core::{
2425
fmt::{self, Display},
2526
str::FromStr,
2627
};
28+
#[cfg(feature = "std")]
29+
use std::collections::BTreeMap;
30+
2731
use helpers::attached_token::AttachedToken;
2832

2933
use log::debug;
@@ -359,6 +363,12 @@ pub struct Parser<'a> {
359363
options: ParserOptions,
360364
/// Ensures the stack does not overflow by limiting recursion depth.
361365
recursion_counter: RecursionCounter,
366+
/// Errors from `parse_prefix` calls that returned `Err`, keyed by the token
367+
/// index at which the call started. See [`Parser::parse_prefix`] for the 2^N patterns this guards.
368+
failed_prefix_positions: BTreeMap<usize, ParserError>,
369+
/// Cached errors from the speculative reserved-word prefix arm. See
370+
/// [`Parser::parse_prefix`] for the 2^N patterns this guards.
371+
failed_reserved_word_prefix_positions: BTreeMap<usize, ParserError>,
362372
}
363373

364374
impl<'a> Parser<'a> {
@@ -385,6 +395,8 @@ impl<'a> Parser<'a> {
385395
dialect,
386396
recursion_counter: RecursionCounter::new(DEFAULT_REMAINING_DEPTH),
387397
options: ParserOptions::new().with_trailing_commas(dialect.supports_trailing_commas()),
398+
failed_prefix_positions: BTreeMap::new(),
399+
failed_reserved_word_prefix_positions: BTreeMap::new(),
388400
}
389401
}
390402

@@ -446,6 +458,8 @@ impl<'a> Parser<'a> {
446458
pub fn with_tokens_with_locations(mut self, tokens: Vec<TokenWithSpan>) -> Self {
447459
self.tokens = tokens;
448460
self.index = 0;
461+
self.failed_prefix_positions.clear();
462+
self.failed_reserved_word_prefix_positions.clear();
449463
self
450464
}
451465

@@ -1717,6 +1731,23 @@ impl<'a> Parser<'a> {
17171731
return prefix;
17181732
}
17191733

1734+
// Memoize parse_prefix failures to break 2^N speculation when both
1735+
// prefix arms fail at every level (e.g. `IF(current_time(...x`).
1736+
// The per-arm cache in `parse_prefix_inner` complements this for
1737+
// chains where the reserved arm fails but the unreserved fallback
1738+
// succeeds (e.g. `case-case-...c`).
1739+
let start_index = self.index;
1740+
if let Some(cached) = self.failed_prefix_positions.get(&start_index) {
1741+
return Err(cached.clone());
1742+
}
1743+
let result = self.parse_prefix_inner();
1744+
if let Err(ref e) = result {
1745+
self.failed_prefix_positions.insert(start_index, e.clone());
1746+
}
1747+
result
1748+
}
1749+
1750+
fn parse_prefix_inner(&mut self) -> Result<Expr, ParserError> {
17201751
// PostgreSQL allows any string literal to be preceded by a type name, indicating that the
17211752
// string literal represents a literal of that type. Some examples:
17221753
//
@@ -1801,7 +1832,21 @@ impl<'a> Parser<'a> {
18011832
// We first try to parse the word and following tokens as a special expression, and if that fails,
18021833
// we rollback and try to parse it as an identifier.
18031834
let w = w.clone();
1804-
match self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span)) {
1835+
// Memoize failed speculative reserved-word parses. When
1836+
// the reserved arm (CASE, CURRENT_TIME, etc.) does
1837+
// exponential work but the unreserved fallback ultimately
1838+
// succeeds, the overall `parse_prefix` returns `Ok` and the
1839+
// outer cache never fires. Chains like `case-case-...c`
1840+
// need this per-arm cache to break the doubling.
1841+
let try_parse_result = if let Some(cached) = self
1842+
.failed_reserved_word_prefix_positions
1843+
.get(&next_token_index)
1844+
{
1845+
Err(cached.clone())
1846+
} else {
1847+
self.try_parse(|parser| parser.parse_expr_prefix_by_reserved_word(&w, span))
1848+
};
1849+
match try_parse_result {
18051850
// This word indicated an expression prefix and parsing was successful
18061851
Ok(Some(expr)) => Ok(expr),
18071852

@@ -1815,6 +1860,8 @@ impl<'a> Parser<'a> {
18151860
// we rollback and return the parsing error we got from trying to parse a
18161861
// special expression (to maintain backwards compatibility of parsing errors).
18171862
Err(e) => {
1863+
self.failed_reserved_word_prefix_positions
1864+
.insert(next_token_index, e.clone());
18181865
if !self.dialect.is_reserved_for_identifier(w.keyword) {
18191866
if let Ok(Some(expr)) = self.maybe_parse(|parser| {
18201867
parser.parse_expr_prefix_by_unreserved_word(&w, span)

tests/sqlparser_common.rs

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19004,3 +19004,49 @@ fn parse_compound_chain_no_exponential_blowup() {
1900419004
rx.recv_timeout(Duration::from_secs(5))
1900519005
.expect("parser should reject this quickly, not loop exponentially");
1900619006
}
19007+
/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
/// shaped like `IF(current_time(current_time(...x`.
///
/// Every nested `current_time(` used to be explored twice per level (through
/// the speculative reserved-word arm and again through the unreserved-word
/// fallback), doubling the work at each level. With the fix, the failing
/// parse short-circuits through the position-keyed cache.
#[test]
fn parse_prefix_keyword_call_chain_no_exponential_blowup() {
    use std::{sync::mpsc, thread, time::Duration};

    // `if(` followed by 30 unterminated `current_time(` calls and a trailing `x`.
    let nested_calls = "current_time(".repeat(30);
    let sql = ["if(", nested_calls.as_str(), "x"].concat();

    // Parse on a helper thread so that a regression surfaces as a timeout
    // failure rather than stalling the whole test run.
    let (done_tx, done_rx) = mpsc::channel();
    thread::spawn(move || {
        let _ = Parser::parse_sql(&PostgreSqlDialect {}, &sql);
        let _ = done_tx.send(());
    });

    let finished = done_rx.recv_timeout(Duration::from_secs(5));
    finished.expect("parser should reject this quickly, not loop exponentially");
}
19030+
/// Regression test for the 2^N parse-time blowup in `parse_prefix` on inputs
/// shaped like `case-case-case-...c`.
///
/// Every `case` token triggers a speculative `parse_case_expr` that fails,
/// but the unreserved-word fallback returns `Identifier(case)`, so the outer
/// failure cache never fires. With the fix, the per-arm cache short-circuits
/// the speculative descent.
#[test]
fn parse_prefix_case_chain_no_exponential_blowup() {
    use std::{sync::mpsc, thread, time::Duration};

    // 30 copies of "case<TAB>-" terminated by the identifier `c`.
    let case_chain = "case\t-".repeat(30);
    let sql = [case_chain.as_str(), "c"].concat();

    // Parse on a helper thread so that a regression surfaces as a timeout
    // failure rather than stalling the whole test run.
    let (done_tx, done_rx) = mpsc::channel();
    thread::spawn(move || {
        let _ = Parser::parse_sql(&SQLiteDialect {}, &sql);
        let _ = done_tx.send(());
    });

    let finished = done_rx.recv_timeout(Duration::from_secs(5));
    finished.expect("parser should reject this quickly, not loop exponentially");
}

0 commit comments

Comments
 (0)