Skip to content
Open
Original file line number Diff line number Diff line change
Expand Up @@ -44,26 +44,28 @@ private PredicateEvaluatorProvider() {
/// Builds a [PredicateEvaluator] for a leaf filter on the column backed by `dataSource`. The dictionary is derived
/// via [#getDictionaryUsableForFiltering], which keeps it only when a dict-consuming filter operator (inverted /
/// exact range) will actually run for this predicate type on the column's forward-index encoding. The data type is
/// taken from the data-source metadata. For REGEXP_LIKE, the FST/IFST text index (when present) is consulted here
/// — the upgrade happens only when the dictionary is usable, so no evaluator is built and discarded.
public static PredicateEvaluator getPredicateEvaluator(Predicate predicate, DataSource dataSource,
    QueryContext queryContext) {
  Dictionary dictionary = getDictionaryUsableForFiltering(dataSource, queryContext, predicate);
  DataType dataType = dataSource.getDataSourceMetadata().getDataType();
  // Pass the data source through so the REGEXP_LIKE branch can look up its FST/IFST index.
  return buildEvaluator(predicate, dictionary, dataType, queryContext, dataSource);
}

/// Builds a [PredicateEvaluator] when the value source and `dictionary` are already in sync by construction: when
/// `dictionary` is non-null the source produces dict ids decodable by that dictionary; when `dictionary` is null
/// the source produces raw values. No gating logic runs — the dictionary (if any) is taken as-is, so the caller is
/// responsible for the match. FST/IFST evaluators are not considered here since this overload has no `DataSource`
/// to read text indexes from.
// TODO: Always pass in query context
public static PredicateEvaluator getPredicateEvaluator(Predicate predicate, @Nullable Dictionary dictionary,
    DataType dataType, @Nullable QueryContext queryContext) {
  // A null data source makes the REGEXP_LIKE branch skip the FST/IFST lookup.
  return buildEvaluator(predicate, dictionary, dataType, queryContext, null);
}

private static PredicateEvaluator buildEvaluator(Predicate predicate, @Nullable Dictionary dictionary,
DataType dataType, @Nullable QueryContext queryContext) {
DataType dataType, @Nullable QueryContext queryContext, @Nullable DataSource dataSource) {
try {
if (dictionary != null) {
// dictionary based predicate evaluators
Expand All @@ -83,9 +85,23 @@ private static PredicateEvaluator buildEvaluator(Predicate predicate, @Nullable
case RANGE:
return RangePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RangePredicate) predicate, dictionary,
dataType);
case REGEXP_LIKE:
return RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator((RegexpLikePredicate) predicate,
dictionary, dataType, queryContext);
case REGEXP_LIKE: {
// Prefer FST/IFST text index when present on the data source; otherwise fall back to the generic
// dict-based evaluator (dict-id scan or eager dict iteration).
RegexpLikePredicate regexpLike = (RegexpLikePredicate) predicate;
if (dataSource != null) {
if (regexpLike.isCaseInsensitive() && dataSource.getIFSTIndex() != null) {
return IFSTBasedRegexpPredicateEvaluatorFactory.newIFSTBasedEvaluator(regexpLike,
dataSource.getIFSTIndex(), dictionary);
}
if (!regexpLike.isCaseInsensitive() && dataSource.getFSTIndex() != null) {
return FSTBasedRegexpPredicateEvaluatorFactory.newFSTBasedEvaluator(regexpLike,
dataSource.getFSTIndex(), dictionary);
}
}
return RegexpLikePredicateEvaluatorFactory.newDictionaryBasedEvaluator(regexpLike, dictionary, dataType,
queryContext);
}
default:
throw new UnsupportedOperationException("Unsupported predicate type: " + predicate.getType());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import org.apache.pinot.common.request.context.FunctionContext;
import org.apache.pinot.common.request.context.predicate.JsonMatchPredicate;
import org.apache.pinot.common.request.context.predicate.Predicate;
import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
import org.apache.pinot.common.request.context.predicate.TextMatchPredicate;
import org.apache.pinot.common.request.context.predicate.VectorSimilarityPredicate;
import org.apache.pinot.common.request.context.predicate.VectorSimilarityRadiusPredicate;
Expand All @@ -54,8 +53,6 @@
import org.apache.pinot.core.operator.filter.VectorSearchParams;
import org.apache.pinot.core.operator.filter.VectorSearchStrategy;
import org.apache.pinot.core.operator.filter.VectorSimilarityFilterOperator;
import org.apache.pinot.core.operator.filter.predicate.FSTBasedRegexpPredicateEvaluatorFactory;
import org.apache.pinot.core.operator.filter.predicate.IFSTBasedRegexpPredicateEvaluatorFactory;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluatorProvider;
import org.apache.pinot.core.operator.transform.function.ItemTransformFunction;
Expand Down Expand Up @@ -306,27 +303,10 @@ private BaseFilterOperator constructPhysicalOperator(FilterContext filter, int n
return new TextMatchFilterOperator(textIndexReader, (TextMatchPredicate) predicate, numDocs);
}
case REGEXP_LIKE:
// Check if case-insensitive flag is present
RegexpLikePredicate regexpLikePredicate = (RegexpLikePredicate) predicate;
boolean caseInsensitive = regexpLikePredicate.isCaseInsensitive();
if (caseInsensitive) {
if (dataSource.getIFSTIndex() != null) {
predicateEvaluator =
IFSTBasedRegexpPredicateEvaluatorFactory.newIFSTBasedEvaluator(regexpLikePredicate,
dataSource.getIFSTIndex(), dataSource.getDictionary());
} else {
predicateEvaluator =
PredicateEvaluatorProvider.getPredicateEvaluator(predicate, dataSource, _queryContext);
}
} else {
if (dataSource.getFSTIndex() != null) {
predicateEvaluator = FSTBasedRegexpPredicateEvaluatorFactory.newFSTBasedEvaluator(regexpLikePredicate,
dataSource.getFSTIndex(), dataSource.getDictionary());
} else {
predicateEvaluator =
PredicateEvaluatorProvider.getPredicateEvaluator(predicate, dataSource, _queryContext);
}
}
// PredicateEvaluatorProvider handles FST/IFST upgrade internally when the dictionary is usable for
// filtering and a matching text index exists on the data source.
predicateEvaluator =
PredicateEvaluatorProvider.getPredicateEvaluator(predicate, dataSource, _queryContext);
_predicateEvaluators.add(Pair.of(predicate, predicateEvaluator));
return FilterOperatorUtils.getLeafFilterOperator(_queryContext, predicateEvaluator, dataSource, numDocs);
case JSON_MATCH:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,44 @@

import java.lang.reflect.Method;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.pinot.common.request.context.ExpressionContext;
import org.apache.pinot.common.request.context.FilterContext;
import org.apache.pinot.common.request.context.predicate.Predicate;
import org.apache.pinot.common.request.context.predicate.RegexpLikePredicate;
import org.apache.pinot.core.common.BlockDocIdIterator;
import org.apache.pinot.core.common.BlockDocIdSet;
import org.apache.pinot.core.operator.blocks.FilterBlock;
import org.apache.pinot.core.operator.filter.BaseFilterOperator;
import org.apache.pinot.core.operator.filter.predicate.BaseDictIdBasedRegexpLikePredicateEvaluator;
import org.apache.pinot.core.operator.filter.predicate.PredicateEvaluator;
import org.apache.pinot.core.query.request.context.QueryContext;
import org.apache.pinot.segment.local.upsert.UpsertUtils;
import org.apache.pinot.segment.spi.Constants;
import org.apache.pinot.segment.spi.IndexSegment;
import org.apache.pinot.segment.spi.SegmentContext;
import org.apache.pinot.segment.spi.SegmentMetadata;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.segment.spi.datasource.DataSourceMetadata;
import org.apache.pinot.segment.spi.index.creator.VectorIndexConfig;
import org.apache.pinot.segment.spi.index.mutable.ThreadSafeMutableRoaringBitmap;
import org.apache.pinot.segment.spi.index.reader.Dictionary;
import org.apache.pinot.segment.spi.index.reader.ForwardIndexReader;
import org.apache.pinot.segment.spi.index.reader.InvertedIndexReader;
import org.apache.pinot.segment.spi.index.reader.TextIndexReader;
import org.apache.pinot.spi.config.table.FieldConfig;
import org.apache.pinot.spi.data.DimensionFieldSpec;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.mockito.Mockito;
import org.mockito.stubbing.Answer;
import org.roaringbitmap.buffer.ImmutableRoaringBitmap;
import org.testng.annotations.Test;

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;


public class FilterPlanNodeTest {
Expand Down Expand Up @@ -133,4 +153,122 @@ private int getNumberOfFilteredDocs(SegmentContext segmentContext, QueryContext
}
return numDocsFiltered;
}

/**
 * Case-insensitive REGEXP_LIKE on a column that has an IFST index, a dictionary and an inverted index (forward
 * index not dictionary-encoded) must yield a dictionary-based, dict-id based regexp evaluator.
 */
@Test
public void regexpLikeUsesIFSTEvaluatorWhenIFSTAndInvertedAvailable()
    throws Exception {
  PredicateEvaluator predicateEvaluator = runRegexpLikeAndGetEvaluator(
      /* caseInsensitive */ true,
      /* hasIFST */ true,
      /* hasFST */ false,
      /* hasDictionary */ true,
      /* forwardDictEncoded */ false,
      /* hasInverted */ true);
  assertTrue(predicateEvaluator.isDictionaryBased());
  assertTrue(BaseDictIdBasedRegexpLikePredicateEvaluator.class.isInstance(predicateEvaluator));
}

/**
 * Case-insensitive REGEXP_LIKE on a column that has an IFST index and a dictionary, but neither an inverted index
 * nor a dictionary-encoded forward index, must yield an evaluator that is not dictionary-based.
 */
@Test
public void regexpLikeFallsBackToRawWhenIFSTPresentButNoDictConsumer()
    throws Exception {
  PredicateEvaluator predicateEvaluator = runRegexpLikeAndGetEvaluator(
      /* caseInsensitive */ true,
      /* hasIFST */ true,
      /* hasFST */ false,
      /* hasDictionary */ true,
      /* forwardDictEncoded */ false,
      /* hasInverted */ false);
  assertFalse(predicateEvaluator.isDictionaryBased());
}

/**
 * Case-sensitive REGEXP_LIKE on a column that has an FST index, a dictionary and an inverted index (forward index
 * not dictionary-encoded) must yield a dictionary-based, dict-id based regexp evaluator.
 */
@Test
public void regexpLikeUsesFSTEvaluatorWhenFSTAndInvertedAvailable()
    throws Exception {
  PredicateEvaluator predicateEvaluator = runRegexpLikeAndGetEvaluator(
      /* caseInsensitive */ false,
      /* hasIFST */ false,
      /* hasFST */ true,
      /* hasDictionary */ true,
      /* forwardDictEncoded */ false,
      /* hasInverted */ true);
  assertTrue(predicateEvaluator.isDictionaryBased());
  assertTrue(BaseDictIdBasedRegexpLikePredicateEvaluator.class.isInstance(predicateEvaluator));
}

/**
 * Case-sensitive REGEXP_LIKE on a column that has an FST index and a dictionary, but neither an inverted index nor
 * a dictionary-encoded forward index, must yield an evaluator that is not dictionary-based.
 */
@Test
public void regexpLikeFallsBackToRawWhenFSTPresentButNoDictConsumer()
    throws Exception {
  PredicateEvaluator predicateEvaluator = runRegexpLikeAndGetEvaluator(
      /* caseInsensitive */ false,
      /* hasIFST */ false,
      /* hasFST */ true,
      /* hasDictionary */ true,
      /* forwardDictEncoded */ false,
      /* hasInverted */ false);
  assertFalse(predicateEvaluator.isDictionaryBased());
}

/**
 * Case-insensitive REGEXP_LIKE on a column that has an IFST index, a dictionary and a dictionary-encoded forward
 * index (no inverted index) must yield a dictionary-based evaluator.
 */
@Test
public void regexpLikeUsesIFSTEvaluatorWhenIFSTAndDictEncodedForward()
    throws Exception {
  PredicateEvaluator predicateEvaluator = runRegexpLikeAndGetEvaluator(
      /* caseInsensitive */ true,
      /* hasIFST */ true,
      /* hasFST */ false,
      /* hasDictionary */ true,
      /* forwardDictEncoded */ true,
      /* hasInverted */ false);
  assertTrue(predicateEvaluator.isDictionaryBased());
}

/**
 * Plans a single REGEXP_LIKE filter on a mocked STRING column and returns the predicate evaluator that
 * {@link FilterPlanNode} recorded for it.
 *
 * @param caseInsensitive whether the predicate is built with the {@code "i"} match flag
 * @param hasIFST whether the mocked data source exposes an IFST index
 * @param hasFST whether the mocked data source exposes an FST index
 * @param hasDictionary whether the mocked data source exposes a dictionary
 * @param forwardDictEncoded whether the mocked forward index reports itself as dictionary-encoded
 * @param hasInverted whether the mocked data source exposes an inverted index
 * @return the evaluator recorded for the predicate
 * @throws AssertionError if planning recorded no evaluator; the failure thrown by the plan node (if any) is
 *     attached as the cause
 */
private PredicateEvaluator runRegexpLikeAndGetEvaluator(boolean caseInsensitive, boolean hasIFST, boolean hasFST,
    boolean hasDictionary, boolean forwardDictEncoded, boolean hasInverted)
    throws Exception {
  String column = "col";
  DataSource dataSource =
      mockStringDataSource(column, hasIFST, hasFST, hasDictionary, forwardDictEncoded, hasInverted);
  RegexpLikePredicate predicate = caseInsensitive
      ? new RegexpLikePredicate(ExpressionContext.forIdentifier(column), "pat", "i")
      : new RegexpLikePredicate(ExpressionContext.forIdentifier(column), "pat");
  FilterContext filterContext = FilterContext.forPredicate(predicate);

  IndexSegment segment = mock(IndexSegment.class);
  SegmentMetadata segmentMetadata = mock(SegmentMetadata.class);
  when(segmentMetadata.getTotalDocs()).thenReturn(1);
  when(segment.getSegmentMetadata()).thenReturn(segmentMetadata);
  when(segment.getDataSource(Mockito.eq(column), Mockito.any())).thenReturn(dataSource);

  QueryContext queryContext = mock(QueryContext.class);
  when(queryContext.getFilter()).thenReturn(filterContext);
  when(queryContext.isIndexUseAllowed(Mockito.any(DataSource.class), Mockito.any(FieldConfig.IndexType.class)))
      .thenReturn(true);

  SegmentContext segmentContext = new SegmentContext(segment);

  FilterPlanNode planNode = new FilterPlanNode(segmentContext, queryContext);
  // Best effort: the plan node records the evaluator before it builds the leaf filter operator, so a failure while
  // building the operator over the mocks does not matter here. Keep the failure so it can be reported if nothing
  // was recorded.
  Exception runFailure = null;
  try {
    planNode.run();
  } catch (Exception e) {
    runFailure = e;
  }

  if (planNode.getPredicateEvaluators().isEmpty()) {
    throw new AssertionError("FilterPlanNode did not record a predicate evaluator for REGEXP_LIKE", runFailure);
  }
  Pair<Predicate, PredicateEvaluator> pair = planNode.getPredicateEvaluators().get(0);
  return pair.getRight();
}

/**
 * Creates a mocked, unsorted STRING {@link DataSource} for {@code column}. The flags select which of the
 * dictionary, inverted index, IFST index and FST index are exposed (absent ones are stubbed to {@code null}) and
 * whether the forward index reports itself as dictionary-encoded. The range index is always {@code null}.
 */
@SuppressWarnings({"rawtypes", "unchecked"})
private static DataSource mockStringDataSource(String column, boolean hasIFST, boolean hasFST,
    boolean hasDictionary, boolean forwardDictEncoded, boolean hasInverted) {
  // Column metadata: unsorted STRING dimension.
  DataSourceMetadata columnMetadata = mock(DataSourceMetadata.class);
  when(columnMetadata.getDataType()).thenReturn(DataType.STRING);
  when(columnMetadata.isSorted()).thenReturn(false);
  when(columnMetadata.getFieldSpec()).thenReturn(new DimensionFieldSpec(column, DataType.STRING, true));

  DataSource mockedSource = mock(DataSource.class);
  when(mockedSource.getDataSourceMetadata()).thenReturn(columnMetadata);
  when(mockedSource.getColumnName()).thenReturn(column);

  // Forward index: STRING stored type, dictionary encoding controlled by the caller.
  ForwardIndexReader forwardReader = mock(ForwardIndexReader.class);
  when(forwardReader.isDictionaryEncoded()).thenReturn(forwardDictEncoded);
  when(forwardReader.getStoredType()).thenReturn(DataType.STRING);
  when(mockedSource.getForwardIndex()).thenReturn(forwardReader);

  // Dictionary: empty when present, otherwise null.
  Dictionary columnDictionary = null;
  if (hasDictionary) {
    columnDictionary = mock(Dictionary.class);
    when(columnDictionary.length()).thenReturn(0);
  }
  when(mockedSource.getDictionary()).thenReturn(columnDictionary);

  // Secondary indexes: each one is either a mock or null.
  InvertedIndexReader invertedIndex = null;
  if (hasInverted) {
    invertedIndex = mock(InvertedIndexReader.class);
  }
  TextIndexReader ifstIndex = null;
  if (hasIFST) {
    ifstIndex = mockTextIndexReader();
  }
  TextIndexReader fstIndex = null;
  if (hasFST) {
    fstIndex = mockTextIndexReader();
  }
  when(mockedSource.getInvertedIndex()).thenReturn(invertedIndex);
  when(mockedSource.getRangeIndex()).thenReturn(null);
  when(mockedSource.getIFSTIndex()).thenReturn(ifstIndex);
  when(mockedSource.getFSTIndex()).thenReturn(fstIndex);

  return mockedSource;
}

/**
 * Creates a mocked {@link TextIndexReader} whose {@code getDictIds} returns an empty bitmap for any string.
 */
private static TextIndexReader mockTextIndexReader() {
  ImmutableRoaringBitmap noMatchingDictIds = ImmutableRoaringBitmap.bitmapOf();
  TextIndexReader textIndexReader = mock(TextIndexReader.class);
  when(textIndexReader.getDictIds(Mockito.anyString())).thenReturn(noMatchingDictIds);
  return textIndexReader;
}
}