Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ on:
- "assets/**"
- "src/**"
- "tests/**"
- "tools/**"
- ".taplo.toml"
- ".typos.toml"
- "Cargo.lock"
Expand Down Expand Up @@ -68,7 +69,7 @@ jobs:
- name: Set up Environment
run: ./tools/setup.sh ${{ matrix.version }}
- name: Set up Sccache
uses: mozilla-actions/sccache-action@v0.0.7
uses: mozilla-actions/sccache-action@v0.0.9
- name: Clippy
run: cargo clippy --features pg${{ matrix.version }}
- name: Unit Test
Expand Down
10 changes: 3 additions & 7 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ name = "pgrx_embed_pg_tokenizer"
path = "./src/bin/pgrx_embed.rs"

[features]
pg12 = ["pgrx/pg12", "pgrx-tests/pg12"]
pg13 = ["pgrx/pg13", "pgrx-tests/pg13"]
pg14 = ["pgrx/pg14", "pgrx-tests/pg14"]
pg15 = ["pgrx/pg15", "pgrx-tests/pg15"]
Expand All @@ -28,8 +27,8 @@ lindera-cc-cedict = ["lindera/cc-cedict"]
anyhow = "1.0.97"
dashmap = "6.1.0"
jieba-rs = "0.7.2"
lindera = "0.37.0"
pgrx = "=0.13.1"
lindera = "0.42.2"
pgrx = "=0.14.1"
regex = "1.11.1"
rust-stemmers = { git = "https://github.com/tensorchord/rust-stemmers.git", rev = "51696378e352688b7ffd4fface615370ff5e8768" }
serde = { version = "1.0.218", features = ["derive"] }
Expand All @@ -42,10 +41,7 @@ unicode-segmentation = "1.12.0"
validator = { version = "0.20.0", features = ["derive"] }

[dev-dependencies]
pgrx-tests = "=0.13.1"

[patch.crates-io]
pgrx = { git = "https://github.com/tensorchord/pgrx", branch = "patch-to-pg_tokenizer" }
pgrx-tests = "=0.14.1"

[profile.release]
opt-level = 3
Expand Down
9 changes: 9 additions & 0 deletions docs/00-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,22 @@ You can choose only one of the above options for each character filter.
| stopwords | String | Stopwords name, builtin: `lucene_english`, `nltk_english`, `iso_english` |
| synonym | String | Synonym name |
| pg_dict | String | Using [postgres text search dictionary](https://www.postgresql.org/docs/current/textsearch-dictionaries.html). We currently support all dictionaries except `Thesaurus Dictionary`. |
| ngram | Table | N-gram token filter, see [Options for `ngram`](#options-for-ngram) |

You can choose only one of the above options for each token filter.

#### Supported values for `stemmer`

arabic, armenian, basque, catalan, danish, dutch, english_porter, english_porter2, estonian, finnish, french, german, greek, hindi, hungarian, indonesian, irish, italian, lithuanian, nepali, norwegian, portuguese, romanian, russian, serbian, spanish, swedish, tamil, turkish, yiddish

#### Options for `ngram`

| Key | Type | Description |
| ----------------- | ------- | -------------------------------------------------------- |
| max_gram | Integer | Maximum n-gram size, range: `1..=255`, default: `2` |
| min_gram | Integer | Minimum n-gram size, range: `1..=255`, default: `1` |
| preserve_original | Boolean | Whether to preserve the original token, default: `false` |

### Options for `tokenizer`

| Key | Type | Description |
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ compile_error!("Target is not supported.");
compiler_error!("PostgreSQL version must be selected.");

#[pgrx::pg_guard]
unsafe extern "C" fn _PG_init() {
extern "C-unwind" fn _PG_init() {
if unsafe { pgrx::pg_sys::IsUnderPostmaster } {
pgrx::error!("pg_tokenizer must be loaded via shared_preload_libraries.");
}
Expand Down
5 changes: 5 additions & 0 deletions src/token_filter/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod ngram;
mod pg_dict;
mod skip_non_alphanumeric;
mod stemmer;
Expand All @@ -6,6 +7,7 @@ mod synonym;

use std::sync::Arc;

use ngram::{Ngram, NgramConfig};
use pg_dict::PgDictTokenFilter;
use serde::{Deserialize, Serialize};
use skip_non_alphanumeric::SkipNonAlphanumeric;
Expand All @@ -32,6 +34,8 @@ pub enum TokenFilterConfig {
Stopwords(String),
PgDict(String),
Synonym(String),
#[serde(rename = "ngram")]
NGram(NgramConfig),
}

pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
Expand All @@ -41,5 +45,6 @@ pub fn get_token_filter(config: TokenFilterConfig) -> TokenFilterPtr {
TokenFilterConfig::Stopwords(name) => stopwords::get_stopwords_token_filter(&name),
TokenFilterConfig::PgDict(name) => Arc::new(PgDictTokenFilter::new(&name)),
TokenFilterConfig::Synonym(name) => synonym::get_synonym_token_filter(&name),
TokenFilterConfig::NGram(config) => Arc::new(Ngram::new(config)),
}
}
71 changes: 71 additions & 0 deletions src/token_filter/ngram.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
use serde::{Deserialize, Serialize};
use validator::{Validate, ValidationError};

use super::TokenFilter;

/// Configuration for the `ngram` token filter.
///
/// Deserialization rejects unknown keys and fills omitted fields from the
/// `default_*` functions on this type. The per-field ranges and the schema
/// check `NgramConfig::validate_grams` are checked by calling `validate()`.
#[derive(Clone, Debug, Serialize, Deserialize, Validate)]
#[serde(rename_all = "snake_case")]
#[serde(deny_unknown_fields)]
#[validate(schema(function = "NgramConfig::validate_grams"))]
pub struct NgramConfig {
/// Largest n-gram size to emit; validated to lie in `1..=255`, defaults to `2`.
#[serde(default = "NgramConfig::default_max_gram")]
#[validate(range(min = 1, max = 255))]
pub max_gram: usize,
/// Smallest n-gram size to emit; validated to lie in `1..=255`, defaults to `1`.
#[serde(default = "NgramConfig::default_min_gram")]
#[validate(range(min = 1, max = 255))]
pub min_gram: usize,
/// Whether the filter should also emit the original token; defaults to `false`.
#[serde(default = "NgramConfig::default_preserve_original")]
pub preserve_original: bool,
}

impl NgramConfig {
    /// Value used for `max_gram` when the key is absent from the config.
    const DEFAULT_MAX_GRAM: usize = 2;
    /// Value used for `min_gram` when the key is absent from the config.
    const DEFAULT_MIN_GRAM: usize = 1;
    /// Value used for `preserve_original` when the key is absent from the config.
    const DEFAULT_PRESERVE_ORIGINAL: bool = false;

    /// Serde default provider for `max_gram`.
    fn default_max_gram() -> usize {
        Self::DEFAULT_MAX_GRAM
    }

    /// Serde default provider for `min_gram`.
    fn default_min_gram() -> usize {
        Self::DEFAULT_MIN_GRAM
    }

    /// Serde default provider for `preserve_original`.
    fn default_preserve_original() -> bool {
        Self::DEFAULT_PRESERVE_ORIGINAL
    }

    /// Schema-level validation: accepts the config only when
    /// `min_gram <= max_gram`, otherwise returns a `ValidationError`.
    fn validate_grams(&self) -> Result<(), ValidationError> {
        if self.min_gram <= self.max_gram {
            Ok(())
        } else {
            Err(ValidationError::new(
                "min_gram must be less than or equal to max_gram",
            ))
        }
    }
}

/// Token filter that expands a token into n-grams according to an `NgramConfig`.
pub struct Ngram {
// Settings read by `apply`: `min_gram`, `max_gram` and `preserve_original`.
config: NgramConfig,
}

impl TokenFilter for Ngram {
fn apply(&self, token: String) -> Vec<String> {
let mut results = Vec::new();
let len = token.len();
Copy link

Copilot AI May 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the token length is less than min_gram, the loop 'for i in 0..=(len - self.config.min_gram)' will underflow and panic. Consider adding a guard that returns an empty vector when token.len() < self.config.min_gram.

Suggested change
let len = token.len();
let len = token.len();
if len < self.config.min_gram {
return results;
}

Copilot uses AI. Check for mistakes.
for i in 0..=(len - self.config.min_gram) {
for j in (i + self.config.min_gram)..=(i + self.config.max_gram).min(len) {
results.push(token[i..j].to_string());
}
}
if self.config.preserve_original
&& !(self.config.min_gram..=self.config.max_gram).contains(&len)
{
results.push(token);
}
results
}
}

impl Ngram {
pub fn new(config: NgramConfig) -> Self {
if let Err(e) = config.validate() {
panic!("Invalid NgramConfig: {}", e);
}

Ngram { config }
Comment on lines +64 to +69
Copy link

Copilot AI May 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than panicking on an invalid configuration, consider returning a Result to allow clients to handle configuration errors gracefully.

Suggested change
pub fn new(config: NgramConfig) -> Self {
if let Err(e) = config.validate() {
panic!("Invalid NgramConfig: {}", e);
}
Ngram { config }
pub fn new(config: NgramConfig) -> Result<Self, ValidationError> {
if let Err(e) = config.validate() {
return Err(e);
}
Ok(Ngram { config })

Copilot uses AI. Check for mistakes.
}
}
29 changes: 29 additions & 0 deletions tests/sqllogictest/ngram.slt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
statement ok
BEGIN;

statement ok
SELECT tokenizer_catalog.create_text_analyzer('test_ngram', $$
pre_tokenizer = "unicode_segmentation"
[[token_filters]]
[token_filters.ngram]
$$);

query T
SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram');
----
{Q,Qu,u,ui,i,ic,c,ck,k,f,fo,o,ox,x}

statement ok
SELECT tokenizer_catalog.create_text_analyzer('test_ngram2', $$
pre_tokenizer = "unicode_segmentation"
[[token_filters]]
[token_filters.ngram]
max_gram = 3
min_gram = 2
preserve_original = true
$$);

query T
SELECT tokenizer_catalog.apply_text_analyzer('Quick fox', 'test_ngram2');
----
{Qu,Qui,ui,uic,ic,ick,ck,Quick,fo,fox,ox}
2 changes: 1 addition & 1 deletion tools/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ sudo -iu postgres createdb -O $USER $USER
sudo -iu postgres psql -c 'ALTER SYSTEM SET shared_preload_libraries = "pg_tokenizer.so"'
sudo systemctl stop postgresql

curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.13.1/cargo-pgrx-v0.13.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
curl -fsSL https://github.com/tensorchord/pgrx/releases/download/v0.14.1/cargo-pgrx-v0.14.1-$(uname -m)-unknown-linux-gnu.tar.gz | tar -xOzf - ./cargo-pgrx | install -m 755 /dev/stdin /usr/local/bin/cargo-pgrx
cargo pgrx init --pg${version}=$(which pg_config)

curl -fsSL https://github.com/risinglightdb/sqllogictest-rs/releases/download/v0.26.4/sqllogictest-bin-v0.26.4-$(uname -m)-unknown-linux-musl.tar.gz | tar -xOzf - ./sqllogictest | install -m 755 /dev/stdin /usr/local/bin/sqllogictest
Loading