Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ API Changes

New Features
---------------------
* GITHUB#15737: Store pre-aggregated sum and value count in DocValuesSkipper (Ankit Jain)

* GITHUB#15740: Add NumericFieldStats utility for retrieving global numeric field statistics
(min, max, doc count) from index metadata structures. Migrate SortedNumericDocValuesRangeQuery
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -898,6 +898,39 @@ public int docCount() {
return field.docCount;
}

/**
 * Returns the low word of the pre-aggregated sum for the given level.
 *
 * <p>The {@code level} argument is not consulted; the call is forwarded to the no-argument
 * {@code sumLow()}.
 */
@Override
public long sumLow(int level) {
  return this.sumLow();
}

/**
 * Returns the high word of the pre-aggregated sum for the given level.
 *
 * <p>The {@code level} argument is not consulted; the call is forwarded to the no-argument
 * {@code sumHigh()}.
 */
@Override
public long sumHigh(int level) {
  return this.sumHigh();
}

/**
 * Returns the pre-aggregated value count for the given level.
 *
 * <p>The {@code level} argument is not consulted; the call is forwarded to the no-argument
 * {@code valueCount()}.
 */
@Override
public long valueCount(int level) {
  return this.valueCount();
}

/**
 * Returns the low word of the pre-aggregated sum.
 *
 * <p>SimpleText doesn't store pre-aggregated sums, so this is always {@code 0}.
 */
@Override
public long sumLow() {
  return 0L;
}

/**
 * Returns the high word of the pre-aggregated sum.
 *
 * <p>SimpleText doesn't store pre-aggregated sums, so this is always {@code 0}.
 */
@Override
public long sumHigh() {
  return 0L;
}

/**
 * Returns the pre-aggregated number of values.
 *
 * <p>SimpleText doesn't store pre-aggregated value counts, so this is always {@code 0}.
 */
@Override
public long valueCount() {
  return 0L;
}

@Override
public int minDocID(int level) {
if (doc == -1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,18 @@ private static class SkipAccumulator {
int docCount;
long minValue;
long maxValue;
long sumHigh;
long sumLow;
long valueCount;

/**
 * Creates an empty accumulator whose minimum doc ID is {@code docID}.
 *
 * <p>The value bounds start at sentinels so that the first accumulated value replaces both of
 * them; the document count, the value count and both words of the 128-bit sum start at zero.
 */
SkipAccumulator(int docID) {
  this.minDocID = docID;

  // Sentinels: any real value is <= Long.MAX_VALUE and >= Long.MIN_VALUE.
  this.minValue = Long.MAX_VALUE;
  this.maxValue = Long.MIN_VALUE;

  // Counters.
  this.docCount = 0;
  this.valueCount = 0L;

  // 128-bit running sum, split into two 64-bit words.
  this.sumLow = 0L;
  this.sumHigh = 0L;
}

boolean isDone(int skipIndexIntervalSize, int valueCount, long nextValue, int nextDoc) {
Expand All @@ -219,6 +225,20 @@ boolean isDone(int skipIndexIntervalSize, int valueCount, long nextValue, int ne
/**
 * Folds a single value into this accumulator.
 *
 * <p>Updates the min/max bounds, adds {@code value} (as a signed 64-bit quantity) to the 128-bit
 * running sum held in ({@code sumHigh}, {@code sumLow}) and increments {@code valueCount}.
 */
void accumulate(long value) {
  minValue = Math.min(minValue, value);
  maxValue = Math.max(maxValue, value);

  final long previousLow = sumLow;
  sumLow = previousLow + value;
  // The low word wrapped around iff, read as an unsigned number, it became smaller.
  final boolean wrapped = Long.compareUnsigned(sumLow, previousLow) < 0;
  if (value >= 0) {
    // Non-negative addend: a wrap of the low word is a carry into the high word.
    sumHigh += wrapped ? 1 : 0;
  } else if (wrapped == false) {
    // Negative addend: its sign extension contributes -1 to the high word, which is only
    // cancelled out when the low word wraps (i.e. produces a carry).
    sumHigh--;
  }

  valueCount++;
}

void accumulate(SkipAccumulator other) {
Expand All @@ -227,6 +247,16 @@ void accumulate(SkipAccumulator other) {
minValue = Math.min(minValue, other.minValue);
maxValue = Math.max(maxValue, other.maxValue);
docCount += other.docCount;
// 128-bit addition: add (other.sumHigh, other.sumLow) to (sumHigh, sumLow)
long newLow = sumLow + other.sumLow;
if (Long.compareUnsigned(newLow, sumLow) < 0
|| Long.compareUnsigned(newLow, other.sumLow) < 0) {
// unsigned overflow means carry
sumHigh++;
}
sumLow = newLow;
sumHigh += other.sumHigh;
valueCount += other.valueCount;
}

void nextDoc(int docID) {
Expand All @@ -251,6 +281,9 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
long globalMaxValue = Long.MIN_VALUE;
long globalMinValue = Long.MAX_VALUE;
int globalDocCount = 0;
long globalSumHigh = 0;
long globalSumLow = 0;
long globalValueCount = 0;
int maxDocId = -1;
final List<SkipAccumulator> accumulators = new ArrayList<>();
SkipAccumulator accumulator = null;
Expand All @@ -262,6 +295,15 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
// 128-bit addition for global sum
long newLow = globalSumLow + accumulator.sumLow;
if (Long.compareUnsigned(newLow, globalSumLow) < 0
|| Long.compareUnsigned(newLow, accumulator.sumLow) < 0) {
globalSumHigh++;
}
globalSumLow = newLow;
globalSumHigh += accumulator.sumHigh;
globalValueCount += accumulator.valueCount;
maxDocId = accumulator.maxDocID;
accumulator = null;
if (accumulators.size() == maxAccumulators) {
Expand All @@ -284,6 +326,15 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
globalMaxValue = Math.max(globalMaxValue, accumulator.maxValue);
globalMinValue = Math.min(globalMinValue, accumulator.minValue);
globalDocCount += accumulator.docCount;
// 128-bit addition for global sum
long newLow = globalSumLow + accumulator.sumLow;
if (Long.compareUnsigned(newLow, globalSumLow) < 0
|| Long.compareUnsigned(newLow, accumulator.sumLow) < 0) {
globalSumHigh++;
}
globalSumLow = newLow;
globalSumHigh += accumulator.sumHigh;
globalValueCount += accumulator.valueCount;
maxDocId = accumulator.maxDocID;
writeLevels(accumulators);
}
Expand All @@ -295,6 +346,9 @@ private void writeSkipIndex(FieldInfo field, DocValuesProducer valuesProducer)
assert globalDocCount <= maxDocId + 1;
meta.writeInt(globalDocCount);
meta.writeInt(maxDocId);
meta.writeLong(globalSumHigh);
meta.writeLong(globalSumLow);
meta.writeLong(globalValueCount);
}

private void writeLevels(List<SkipAccumulator> accumulators) throws IOException {
Expand All @@ -319,6 +373,9 @@ private void writeLevels(List<SkipAccumulator> accumulators) throws IOException
data.writeLong(accumulator.maxValue);
data.writeLong(accumulator.minValue);
data.writeInt(accumulator.docCount);
data.writeLong(accumulator.sumHigh);
data.writeLong(accumulator.sumLow);
data.writeLong(accumulator.valueCount);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final String META_CODEC = "Lucene90DocValuesMetadata";
static final String META_EXTENSION = "dvm";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final int VERSION_SUM_AND_VALUE_COUNT = 1;
static final int VERSION_CURRENT = VERSION_SUM_AND_VALUE_COUNT;

// indicates docvalues type
static final byte NUMERIC = 0;
Expand All @@ -196,12 +197,19 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti

// number of documents in an interval
private static final int DEFAULT_SKIP_INDEX_INTERVAL_SIZE = 4096;
// bytes on an interval:
// bytes on an interval for VERSION_START (v0):
// * 1 byte : number of levels
// * 16 bytes: min / max value,
// * 8 bytes: min / max docID
// * 4 bytes: number of documents
private static final long SKIP_INDEX_INTERVAL_BYTES = 29L;
static final long SKIP_INDEX_INTERVAL_BYTES_V0 = 29L;
// bytes on an interval for VERSION_SUM_AND_VALUE_COUNT (v1):
// * 1 byte : number of levels
// * 16 bytes: min / max value,
// * 8 bytes: min / max docID
// * 4 bytes: number of documents
// * 24 bytes: sum (high + low) and value count
static final long SKIP_INDEX_INTERVAL_BYTES_V1 = 53L;
// number of intervals represented as a shift to create a new level, this is 1 << 3 == 8
// intervals.
static final int SKIP_INDEX_LEVEL_SHIFT = 3;
Expand All @@ -211,19 +219,24 @@ public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOExcepti
static final int SKIP_INDEX_MAX_LEVEL = 4;
// number of bytes to skip when skipping a level. It does not take into account the
// current interval that is being read.
static final long[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL = new long[SKIP_INDEX_MAX_LEVEL];
static final long[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V0 = new long[SKIP_INDEX_MAX_LEVEL];
static final long[] SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V1 = new long[SKIP_INDEX_MAX_LEVEL];

static {
computeJumpLengths(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V0, SKIP_INDEX_INTERVAL_BYTES_V0);
computeJumpLengths(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V1, SKIP_INDEX_INTERVAL_BYTES_V1);
}

private static void computeJumpLengths(long[] jumpLengths, long intervalBytes) {
// Size of the interval minus read bytes (1 byte for level and 4 bytes for maxDocID)
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[0] = SKIP_INDEX_INTERVAL_BYTES - 5L;
jumpLengths[0] = intervalBytes - 5L;
for (int level = 1; level < SKIP_INDEX_MAX_LEVEL; level++) {
// jump from previous level
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] = SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level - 1];
jumpLengths[level] = jumpLengths[level - 1];
// nodes added by new level
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] +=
(1 << (level * SKIP_INDEX_LEVEL_SHIFT)) * SKIP_INDEX_INTERVAL_BYTES;
jumpLengths[level] += (1 << (level * SKIP_INDEX_LEVEL_SHIFT)) * intervalBytes;
// remove the byte levels added in the previous level
SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level] -= (1 << ((level - 1) * SKIP_INDEX_LEVEL_SHIFT));
jumpLengths[level] -= (1 << ((level - 1) * SKIP_INDEX_LEVEL_SHIFT));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
*/
package org.apache.lucene.codecs.lucene90;

import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_JUMP_LENGTH_PER_LEVEL;
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V0;
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V1;
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.SKIP_INDEX_MAX_LEVEL;
import static org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat.TERMS_DICT_BLOCK_LZ4_SHIFT;

Expand Down Expand Up @@ -219,8 +220,21 @@ private DocValuesSkipperEntry readDocValueSkipperMeta(IndexInput meta) throws IO
long minValue = meta.readLong();
int docCount = meta.readInt();
int maxDocID = meta.readInt();
long sumHigh;
long sumLow;
long valueCount;
if (version >= Lucene90DocValuesFormat.VERSION_SUM_AND_VALUE_COUNT) {
sumHigh = meta.readLong();
sumLow = meta.readLong();
valueCount = meta.readLong();
} else {
sumHigh = 0;
sumLow = 0;
valueCount = 0;
}

return new DocValuesSkipperEntry(offset, length, minValue, maxValue, docCount, maxDocID);
return new DocValuesSkipperEntry(
offset, length, minValue, maxValue, docCount, maxDocID, sumHigh, sumLow, valueCount);
}

private void readNumeric(IndexInput meta, NumericEntry entry) throws IOException {
Expand Down Expand Up @@ -353,7 +367,15 @@ public void close() throws IOException {
}

/**
 * Per-field skipper metadata, built by {@code readDocValueSkipperMeta} from the meta input.
 *
 * <p>{@code offset} and {@code length} delimit the slice of the data input that holds the skip
 * index; {@code minValue}, {@code maxValue} and {@code docCount} back the skipper's no-argument
 * accessors; {@code maxDocId} is read from the meta input alongside them. {@code sumHigh},
 * {@code sumLow} and {@code valueCount} are read from the meta input only when the format version
 * is at least {@code VERSION_SUM_AND_VALUE_COUNT}; for older versions they are set to {@code 0}.
 */
// NOTE(review): the single-line component list directly below the record header looks like the
// pre-change form of the multi-line list that follows it - confirm which one is current.
private record DocValuesSkipperEntry(
long offset, long length, long minValue, long maxValue, int docCount, int maxDocId) {}
long offset,
long length,
long minValue,
long maxValue,
int docCount,
int maxDocId,
long sumHigh,
long sumLow,
long valueCount) {}

private static class NumericEntry {
long[] table;
Expand Down Expand Up @@ -1865,6 +1887,12 @@ public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
final DocValuesSkipperEntry entry = skippers.get(field.number);

final IndexInput input = data.slice("doc value skipper", entry.offset, entry.length);
final boolean hasSumAndValueCount =
version >= Lucene90DocValuesFormat.VERSION_SUM_AND_VALUE_COUNT;
final long[] jumpLengths =
hasSumAndValueCount
? SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V1
: SKIP_INDEX_JUMP_LENGTH_PER_LEVEL_V0;
// TODO: should we write to disk the actual max level for this segment?
return new DocValuesSkipper() {
final int[] minDocID = new int[SKIP_INDEX_MAX_LEVEL];
Expand All @@ -1879,6 +1907,9 @@ public DocValuesSkipper getSkipper(FieldInfo field) throws IOException {
final long[] minValue = new long[SKIP_INDEX_MAX_LEVEL];
final long[] maxValue = new long[SKIP_INDEX_MAX_LEVEL];
final int[] docCount = new int[SKIP_INDEX_MAX_LEVEL];
final long[] sumHigh = new long[SKIP_INDEX_MAX_LEVEL];
final long[] sumLow = new long[SKIP_INDEX_MAX_LEVEL];
final long[] valueCount = new long[SKIP_INDEX_MAX_LEVEL];
int levels = 1;

@Override
Expand All @@ -1900,14 +1931,19 @@ public void advance(int target) throws IOException {
// check if current interval is competitive or we can jump to the next position
for (int level = levels - 1; level >= 0; level--) {
if ((maxDocID[level] = input.readInt()) < target) {
input.skipBytes(SKIP_INDEX_JUMP_LENGTH_PER_LEVEL[level]); // the jump for the level
input.skipBytes(jumpLengths[level]); // the jump for the level
valid = false;
break;
}
minDocID[level] = input.readInt();
maxValue[level] = input.readLong();
minValue[level] = input.readLong();
docCount[level] = input.readInt();
if (hasSumAndValueCount) {
sumHigh[level] = input.readLong();
sumLow[level] = input.readLong();
valueCount[level] = input.readLong();
}
}
if (valid) {
// adjust levels
Expand Down Expand Up @@ -1950,6 +1986,21 @@ public int docCount(int level) {
return docCount[level];
}

/**
 * Returns the low word of the sum held for {@code level}.
 *
 * <p>The per-level array is only filled while advancing when the segment carries sums and value
 * counts; otherwise it keeps its initial {@code 0}.
 */
@Override
public long sumLow(int level) {
  return this.sumLow[level];
}

/**
 * Returns the high word of the sum held for {@code level}.
 *
 * <p>The per-level array is only filled while advancing when the segment carries sums and value
 * counts; otherwise it keeps its initial {@code 0}.
 */
@Override
public long sumHigh(int level) {
  return this.sumHigh[level];
}

/**
 * Returns the value count held for {@code level}.
 *
 * <p>The per-level array is only filled while advancing when the segment carries sums and value
 * counts; otherwise it keeps its initial {@code 0}.
 */
@Override
public long valueCount(int level) {
  return this.valueCount[level];
}

@Override
public long minValue() {
return entry.minValue;
Expand All @@ -1964,6 +2015,21 @@ public long maxValue() {
public int docCount() {
return entry.docCount;
}

/** Returns the low word of the field-wide sum recorded in the skipper's metadata entry. */
@Override
public long sumLow() {
  return entry.sumLow();
}

/** Returns the high word of the field-wide sum recorded in the skipper's metadata entry. */
@Override
public long sumHigh() {
  return entry.sumHigh();
}

/** Returns the field-wide value count recorded in the skipper's metadata entry. */
@Override
public long valueCount() {
  return entry.valueCount();
}
};
}
}
Loading
Loading