Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/lower-body-text-index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@hyperdx/common-utils": patch
---

fix: support text index on lower(Body) with no preprocessor
130 changes: 130 additions & 0 deletions packages/common-utils/src/__tests__/queryParser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,136 @@ describe('CustomSchemaSQLSerializerV2 - text indices', () => {
expect(sql).toBe("((hasToken(lower(Body), lower('foo'))))");
});

describe('lower(Body) text index (no preprocessor)', () => {
  // Fixture factory: a text skip index whose expression is lower(Body).
  // A fresh object is returned per call so tests never share state.
  const lowerBodyTextIndex = () => ({
    name: 'idx_lower_body',
    type: 'text',
    typeFull: "text(tokenizer = 'splitByNonAlpha')",
    expression: 'lower(Body)',
    granularity: '8',
  });

  // Fixture factory: a text skip index declared directly on Body.
  const plainBodyTextIndex = () => ({
    name: 'idx_body_text',
    type: 'text',
    typeFull: 'text(tokenizer=splitByNonAlpha)',
    expression: 'Body',
    granularity: '8',
  });

  /**
   * Replaces metadata.getSkipIndices with a mock resolving to the single
   * given index, builds a serializer with 'Body' as the implicit column
   * expression, and returns the result of building `query` with it.
   */
  const serializeWithIndex = async (
    skipIndex: Record<string, string>,
    query: string,
  ) => {
    metadata.getSkipIndices = jest.fn().mockResolvedValue([skipIndex]);

    const serializer = new CustomSchemaSQLSerializerV2({
      metadata,
      databaseName,
      tableName,
      connectionId,
      implicitColumnExpression: 'Body',
    });

    return new SearchQueryBuilder(query, serializer).build();
  };

  it('should use hasAllTokens(lower(Body), lower(...)) when index expression is lower(Body)', async () => {
    const sql = await serializeWithIndex(lowerBodyTextIndex(), 'Foo');

    expect(sql).toBe("((hasAllTokens(lower(Body), lower('Foo'))))");
  });

  it('should use hasAllTokens(lower(Body), lower(...)) for multi-token terms', async () => {
    const sql = await serializeWithIndex(lowerBodyTextIndex(), '"Foo Bar"');

    // Both the token lookup and the LIKE clause are expected in the output.
    expect(sql).toContain("hasAllTokens(lower(Body), lower('Foo Bar'))");
    expect(sql).toContain("(lower(Body) LIKE lower('%Foo Bar%'))");
  });

  it('should handle negated searches with lower(Body) index', async () => {
    const sql = await serializeWithIndex(lowerBodyTextIndex(), '-Foo');

    expect(sql).toBe("((NOT hasAllTokens(lower(Body), lower('Foo'))))");
  });

  it('should NOT use lower() when index is directly on Body', async () => {
    const sql = await serializeWithIndex(plainBodyTextIndex(), 'Foo');

    expect(sql).toBe("((hasAllTokens(Body, 'Foo')))");
  });

  it('should batch tokens with lower() when index is on lower(Body)', async () => {
    const sql = await serializeWithIndex(
      lowerBodyTextIndex(),
      'FOO NOT BAR BAZ',
    );

    const expectedFragments = [
      "hasAllTokens(lower(Body), lower('FOO'))",
      "NOT (hasAllTokens(lower(Body), lower('BAR')))",
      "hasAllTokens(lower(Body), lower('BAZ'))",
    ];
    for (const fragment of expectedFragments) {
      expect(sql).toContain(fragment);
    }
  });
});

describe('useTextIndexForImplicitColumn source preference', () => {
it('Auto preserves the existing detection behavior when a text index is found', async () => {
metadata.getSkipIndices = jest.fn().mockResolvedValue([
Expand Down
45 changes: 35 additions & 10 deletions packages/common-utils/src/queryParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1273,20 +1273,24 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
// - enabled: force hasAllTokens(), even if no text index is detected
// - disabled: skip the text-index branch entirely
let useHasAllTokens = false;
let textIndexHasLower = false;
if (this.useTextIndexForImplicitColumn === UseTextIndex.Enabled) {
useHasAllTokens = true;
} else if (this.useTextIndexForImplicitColumn === UseTextIndex.Auto) {
// Note: We check that enable_full_text_index = 1, otherwise hasAllTokens() errors
const isTextIndexEnabled = await this.enableTextIndexPromise;
const textIndex = isTextIndexEnabled
const textIndexResult = isTextIndexEnabled
? await this.findTextIndex(column)
: undefined;

if (textIndex) {
const tokenizer = parseTokenizerFromTextIndex(textIndex);
if (textIndexResult) {
const tokenizer = parseTokenizerFromTextIndex(
textIndexResult.index,
);
// HDX-3259: Support other tokenizers by overriding tokenizeTerm, termHasSeparators, and batching logic
if (tokenizer?.type === 'splitByNonAlpha') {
useHasAllTokens = true;
textIndexHasLower = textIndexResult.indexHasLower;
}
}
}
Expand All @@ -1295,13 +1299,24 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {
const tokens = this.tokenizeTerm(term);
const hasSeparators = this.termHasSeparators(term);

// When the text index is on lower(column), we must pass lower(column)
// as the first argument and wrap the tokens in lower() to match.
const hasAllTokensColumn = textIndexHasLower
? `lower(${column})`
: column;

// Batch tokens to avoid exceeding hasAllTokens limit (64)
const tokenBatches = chunk(tokens, HAS_ALL_TOKENS_CHUNK_SIZE);
const hasAllTokensExpressions = tokenBatches.map(batch =>
SqlString.format(`hasAllTokens(?, ?)`, [
SqlString.raw(column),
batch.join(' '),
]),
textIndexHasLower
? SqlString.format(`hasAllTokens(?, lower(?))`, [
SqlString.raw(hasAllTokensColumn),
batch.join(' '),
])
: SqlString.format(`hasAllTokens(?, ?)`, [
SqlString.raw(hasAllTokensColumn),
batch.join(' '),
]),
);

if (hasSeparators || tokenBatches.length > 1) {
Expand Down Expand Up @@ -1533,19 +1548,29 @@ export class CustomSchemaSQLSerializerV2 extends SQLSerializer {

/**
 * Looks up a text skip index that covers the given column expression.
 *
 * @param columnExpression - The column expression the search is run against.
 * @returns `undefined` when no skip indices are available or none of type
 *   'text' covers the column (as decided by `indexCoversColumn`). Otherwise
 *   the first matching index plus `indexHasLower`, which is true when the
 *   index expression differs from the column expression after normalization
 *   and contains a `lower(` call.
 */
private async findTextIndex(
  columnExpression: string,
): Promise<{ index: SkipIndexMetadata; indexHasLower: boolean } | undefined> {
  const skipIndices = await this.skipIndicesPromise;

  if (!skipIndices || skipIndices.length === 0) {
    return undefined;
  }

  // Which wrapped expressions count as "covering" the column is decided
  // entirely by indexCoversColumn; this method only classifies the match.
  const idx = skipIndices.find(
    candidate =>
      candidate.type === 'text' &&
      this.indexCoversColumn(candidate.expression, columnExpression),
  );

  if (!idx) {
    return undefined;
  }

  // An index declared on exactly the column expression never needs lower(),
  // even if the column expression itself happens to contain a lower( call.
  const normalizedExpr = normalizeChExpression(idx.expression);
  const normalizedCol = normalizeChExpression(columnExpression);
  // NOTE(review): the lower( check runs on the raw index expression and is
  // case-sensitive - confirm index expressions are always reported lowercase.
  const indexHasLower =
    normalizedExpr !== normalizedCol && /\blower\s*\(/.test(idx.expression);

  return { index: idx, indexHasLower };
}

/**
Expand Down
Loading