22 changes: 19 additions & 3 deletions quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -15,6 +15,7 @@
 mod chinese_compatible;
 mod code_tokenizer;
 mod tokenizer_manager;
+mod truncate_tokenizer;
 
 use std::sync::LazyLock;
 
@@ -26,8 +27,9 @@ use tantivy::tokenizer::{
 use self::chinese_compatible::ChineseTokenizer;
 pub use self::code_tokenizer::CodeTokenizer;
 pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager};
 
+pub use self::truncate_tokenizer::TruncateLongFilter;
-pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;
+pub const DEFAULT_TRUNCATE_TOKEN_LENGTH: usize = 255;
 
 /// Quickwit's tokenizer/analyzer manager.
 pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
@@ -85,11 +87,11 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
 
 fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
     let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
-        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+        .filter(TruncateLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
         .build();
     let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
         .filter(LowerCaser)
-        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+        .filter(TruncateLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
         .build();
     let tokenizer_manager = TokenizerManager::new();
     tokenizer_manager.register("raw", raw_tokenizer, false);
@@ -166,4 +168,18 @@ mod tests {
         assert!(stream.token().text.chars().all(|c| !c.is_uppercase()));
         assert!(!stream.advance());
     }
+
+    #[test]
+    fn test_truncate_tokenizer() {
+        let tokenizer_manager = super::create_quickwit_fastfield_normalizer_manager();
+        let very_long_text = "a text, that is just too long, no one will type it, no one will like \
+                              it, no one shall find it. I just need some more chars, now you may \
+                              not pass.".repeat(3);
+
+        let mut truncate_tokenizer = tokenizer_manager.get_tokenizer("raw").unwrap();
+        let mut truncate_stream = truncate_tokenizer.token_stream(&very_long_text);
+        assert!(truncate_stream.advance());
+        assert!(!truncate_stream.advance());
+        assert!(truncate_stream.token().text.len() <= super::DEFAULT_TRUNCATE_TOKEN_LENGTH);
+    }
 }
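
Behaviorally, this swaps "drop over-long tokens" for "keep them, cut to the byte limit": with the old `RemoveLongFilter`, a fast-field value longer than 255 bytes disappeared entirely; with `TruncateLongFilter` it survives, truncated. A minimal sketch of the difference, assuming `TruncateLongFilter` and `DEFAULT_TRUNCATE_TOKEN_LENGTH` are reachable through the crate's public `tokenizers` module as the `pub use` above suggests:

```rust
use quickwit_query::tokenizers::{DEFAULT_TRUNCATE_TOKEN_LENGTH, TruncateLongFilter};
use tantivy::tokenizer::{RawTokenizer, RemoveLongFilter, TextAnalyzer};

fn main() {
    // One raw token of 300 bytes, above the 255-byte limit.
    let long_input = "x".repeat(300);

    // Old behavior: RemoveLongFilter drops the over-long token, so the
    // token stream is empty and the value never reaches the fast field.
    let mut removing = TextAnalyzer::builder(RawTokenizer::default())
        .filter(RemoveLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
        .build();
    assert!(!removing.token_stream(&long_input).advance());

    // New behavior: TruncateLongFilter keeps the token, cut to 255 bytes.
    let mut truncating = TextAnalyzer::builder(RawTokenizer::default())
        .filter(TruncateLongFilter::limit(DEFAULT_TRUNCATE_TOKEN_LENGTH))
        .build();
    let mut stream = truncating.token_stream(&long_input);
    assert!(stream.advance());
    assert_eq!(stream.token().text.len(), DEFAULT_TRUNCATE_TOKEN_LENGTH);
}
```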
70 changes: 70 additions & 0 deletions quickwit/quickwit-query/src/tokenizers/truncate_tokenizer.rs
@@ -0,0 +1,70 @@
+use tantivy::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
+
+#[derive(Clone)]
+pub struct TruncateLongFilter {
+    max_bytes: usize,
+}
+impl TruncateLongFilter {
+    pub fn limit(max_bytes: usize) -> Self {
+        Self { max_bytes }
+    }
+}
+impl TokenFilter for TruncateLongFilter {
+    type Tokenizer<T: Tokenizer> = TruncateLongWrapper<T>;
+    fn transform<T: Tokenizer>(self, inner: T) -> Self::Tokenizer<T> {
+        TruncateLongWrapper {
+            max_bytes: self.max_bytes,
+            inner,
+        }
+    }
+}
+#[derive(Clone)]
+pub struct TruncateLongWrapper<T: Tokenizer> {
+    max_bytes: usize,
+    inner: T,
+}
+impl<T: Tokenizer> Tokenizer for TruncateLongWrapper<T> {
+    type TokenStream<'a> = TruncateLongStream<T::TokenStream<'a>> where T: 'a;
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        TruncateLongStream {
+            max_bytes: self.max_bytes,
+            tail: self.inner.token_stream(text),
+        }
+    }
+}
+pub struct TruncateLongStream<T> {
+    max_bytes: usize,
+    tail: T,
+}
+impl<T: TokenStream> TokenStream for TruncateLongStream<T> {
+    fn advance(&mut self) -> bool {
+        if !self.tail.advance() {
+            return false;
+        }
+        let tok = self.tail.token_mut();
+        if tok.text.len() > self.max_bytes {
+            truncate_at_char_boundary(&mut tok.text, self.max_bytes);
+            tok.offset_to = tok.offset_from.saturating_add(tok.text.len());
+        }
+        true
+    }
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+}
+
+/// Shrinks `s` to at most `max_bytes` UTF-8 bytes without splitting a
+/// multibyte character at the given `max_bytes` index.
+fn truncate_at_char_boundary(s: &mut String, max_bytes: usize) {
+    if s.len() <= max_bytes {
+        return;
+    }
+    let mut end = max_bytes;
+    while !s.is_char_boundary(end) {
+        end -= 1;
+    }
+    s.truncate(end);
+}