apache · DomGarguilo · Oct 7, 2025 · Oct 10, 2025 · Nov 19, 2025 · Dec 2, 2025
diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
@@ -1198,6 +1198,12 @@ public enum Property {
       "The maximum amount of memory that will be used to cache results of a client query/scan. "
           + "Once this limit is reached, the buffered data is sent to the client.",
       "1.3.5"),
+  TABLE_SCAN_BATCH_DUPLICATE_MAX_MULTIPLIER("table.scan.batch.duplicate.max.multiplier", "3",
+      PropertyType.COUNT,
+      "When a scan batch would end on a duplicate key, allow the batch to grow by this multiplier "
+          + "of the scan batch size and table scan max memory to avoid splitting duplicate keys. "
+          + "If the duplicate run still exceeds this limit, the scan fails to avoid dropping keys.",
+      "2.1.5"),
   TABLE_SHUFFLE_SOURCES("table.shuffle.sources", "false", PropertyType.BOOLEAN,
       "Shuffle the opening order for Rfiles to reduce thread contention on file open operations.",
       "2.1.5"),

diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Batch.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Batch.java
@@ -27,12 +27,15 @@ final class Batch {
   private final List<KVEntry> results;
   private final Key continueKey;
   private final long numBytes;
+  private final int duplicatesToSkip;
 
-  Batch(boolean skipContinueKey, List<KVEntry> results, Key continueKey, long numBytes) {
+  Batch(boolean skipContinueKey, List<KVEntry> results, Key continueKey, long numBytes,
+      int duplicatesToSkip) {
     this.skipContinueKey = skipContinueKey;
     this.results = results;
     this.continueKey = continueKey;
     this.numBytes = numBytes;
+    this.duplicatesToSkip = duplicatesToSkip;
   }
 
   public boolean isSkipContinueKey() {
@@ -50,4 +53,8 @@ public Key getContinueKey() {
   public long getNumBytes() {
     return numBytes;
   }
+
+  public int getDuplicatesToSkip() {
+    return duplicatesToSkip;
+  }
 }
diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Scanner.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/Scanner.java
@@ -62,6 +62,7 @@ public class Scanner {
   private final AtomicBoolean interruptFlag;
 
   private boolean readInProgress = false;
+  private int duplicatesToSkip = 0;
 
   Scanner(TabletBase tablet, Range range, ScanParameters scanParams, AtomicBoolean interruptFlag) {
     this.tablet = tablet;
@@ -138,7 +139,8 @@ private Pair<ScanBatch,ScanDataSource> readInternal() throws IOException, Tablet
         iter = new SourceSwitchingIterator(dataSource, false);
       }
 
-      results = tablet.nextBatch(iter, range, scanParams);
+      results = tablet.nextBatch(iter, range, scanParams, duplicatesToSkip);
+      duplicatesToSkip = 0;
 
       if (results.getResults() == null) {
         range = null;
@@ -148,6 +150,7 @@ private Pair<ScanBatch,ScanDataSource> readInternal() throws IOException, Tablet
       } else {
         range = new Range(results.getContinueKey(), !results.isSkipContinueKey(), range.getEndKey(),
             range.isEndKeyInclusive());
+        duplicatesToSkip = results.getDuplicatesToSkip();
         return new Pair<>(new ScanBatch(results.getResults(), true), dataSource);
       }
 

diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/TabletBase.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/TabletBase.java
@@ -60,6 +60,7 @@
 import org.apache.accumulo.server.conf.TableConfiguration;
 import org.apache.accumulo.server.fs.TooManyFilesException;
 import org.apache.accumulo.tserver.InMemoryMap;
+import org.apache.accumulo.tserver.MemKey;
 import org.apache.accumulo.tserver.TabletHostingServer;
 import org.apache.accumulo.tserver.TabletServerResourceManager;
 import org.apache.accumulo.tserver.metrics.TabletServerScanMetrics;
@@ -276,8 +277,8 @@ void recordScanTrace(Span span, List<KVEntry> batch, ScanParameters scanParamete
     }
   }
 
-  Batch nextBatch(SortedKeyValueIterator<Key,Value> iter, Range range, ScanParameters scanParams)
-      throws IOException {
+  Batch nextBatch(SortedKeyValueIterator<Key,Value> iter, Range range, ScanParameters scanParams,
+      int duplicatesToSkip) throws IOException {
 
     // log.info("In nextBatch..");
 
@@ -297,9 +298,33 @@ Batch nextBatch(SortedKeyValueIterator<Key,Value> iter, Range range, ScanParamet
     long resultBytes = 0L;
 
     long maxResultsSize = getTableConfiguration().getAsBytes(Property.TABLE_SCAN_MAXMEM);
+    int duplicateBatchMultiplier =
+        getTableConfiguration().getCount(Property.TABLE_SCAN_BATCH_DUPLICATE_MAX_MULTIPLIER);
+    if (duplicateBatchMultiplier < 1) {
+      duplicateBatchMultiplier = 1;
+    }
+    long maxResultsSizeWithDuplicates = maxResultsSize;
+    long maxEntriesWithDuplicates = scanParams.getMaxEntries();
+    if (duplicateBatchMultiplier > 1) {
+      try {
+        maxResultsSizeWithDuplicates =
+            Math.multiplyExact(maxResultsSize, (long) duplicateBatchMultiplier);
+      } catch (ArithmeticException e) {
+        maxResultsSizeWithDuplicates = Long.MAX_VALUE;
+        // TODO maybe log that this happened? Deduped somehow?
+      }
+      try {
+        maxEntriesWithDuplicates =
+            Math.multiplyExact(scanParams.getMaxEntries(), (long) duplicateBatchMultiplier);
+      } catch (ArithmeticException e) {
+        maxEntriesWithDuplicates = Long.MAX_VALUE;
+        // TODO maybe log that this happened? Deduped somehow?
+      }
+    }
 
     Key continueKey = null;
-    boolean skipContinueKey = false;
+    boolean skipContinueKey = true;
+    boolean resumeOnSameKey = false;
 
     YieldCallback<Key> yield = new YieldCallback<>();
 
@@ -314,32 +339,73 @@ Batch nextBatch(SortedKeyValueIterator<Key,Value> iter, Range range, ScanParamet
       iter.seek(range, LocalityGroupUtil.families(scanParams.getColumnSet()), true);
     }
 
+    skipReturnedDuplicates(iter, duplicatesToSkip, range);
+
+    Key rangeStartKey = range.getStartKey();
+    Key currentKey = null;
+    boolean resumingOnSameKey =
+        iter.hasTop() && rangeStartKey != null && rangeStartKey.equals(iter.getTopKey());
+    int previousDuplicates = resumingOnSameKey ? duplicatesToSkip : 0;
+    int duplicatesReturnedForCurrentKey = 0;
+    Key cutKey = null;
+    boolean cutPending = false;
+
     while (iter.hasTop()) {
       if (yield.hasYielded()) {
         throw new IOException(
             "Coding error: hasTop returned true but has yielded at " + yield.getPositionAndReset());
       }
       value = iter.getTopValue();
       key = iter.getTopKey();
+      if (cutPending && !key.equals(cutKey)) {
+        continueKey = copyResumeKey(cutKey);
+        resumeOnSameKey = true;
+        skipContinueKey = false;
+        break;
+      }
+      if (!key.equals(currentKey)) {
+        currentKey = copyResumeKey(key);
+        if (resumingOnSameKey && key.equals(rangeStartKey)) {
+          duplicatesReturnedForCurrentKey = previousDuplicates;
+        } else {
+          duplicatesReturnedForCurrentKey = 0;
+          resumingOnSameKey = false;
+        }
+      }
 
       KVEntry kvEntry = new KVEntry(key, value); // copies key and value
       results.add(kvEntry);
       resultSize += kvEntry.estimateMemoryUsed();
       resultBytes += kvEntry.numBytes();
 
+      duplicatesReturnedForCurrentKey++;
+
+      if (cutPending && (resultSize >= maxResultsSizeWithDuplicates
+          || results.size() >= maxEntriesWithDuplicates)) {
+        throw new IllegalStateException("Duplicate key run exceeded scan batch growth limit for "
+            + cutKey + ". Increase " + Property.TABLE_SCAN_BATCH_DUPLICATE_MAX_MULTIPLIER.getKey()
+            + " or reduce duplicates for this key.");
+      }
+
       boolean timesUp = batchTimeOut > 0 && (System.nanoTime() - startNanos) >= timeToRun;
 
       if (resultSize >= maxResultsSize || results.size() >= scanParams.getMaxEntries() || timesUp) {
-        continueKey = new Key(key);
-        skipContinueKey = true;
-        break;
+        if (!cutPending) {
+          cutPending = true;
+          cutKey = currentKey;
+        } else if (timesUp) {
+          throw new IllegalStateException("Duplicate key run exceeded scan batch timeout for "
+              + cutKey + ". Increase " + Property.TABLE_SCAN_BATCH_DUPLICATE_MAX_MULTIPLIER.getKey()
+              + " or batch timeout, or reduce duplicates for this key.");
+        }
       }
 
       iter.next();
     }
 
     if (yield.hasYielded()) {
-      continueKey = new Key(yield.getPositionAndReset());
+      continueKey = copyResumeKey(yield.getPositionAndReset());
+      resumeOnSameKey = false;
       skipContinueKey = true;
       if (!range.contains(continueKey)) {
         throw new IOException("Underlying iterator yielded to a position outside of its range: "
@@ -362,7 +428,9 @@ Batch nextBatch(SortedKeyValueIterator<Key,Value> iter, Range range, ScanParamet
       }
     }
 
-    return new Batch(skipContinueKey, results, continueKey, resultBytes);
+    int duplicatesToSkipForNextBatch = resumeOnSameKey ? duplicatesReturnedForCurrentKey : 0;
+    return new Batch(skipContinueKey, results, continueKey, resultBytes,
+        duplicatesToSkipForNextBatch);
   }
 
   private Tablet.LookupResult lookup(SortedKeyValueIterator<Key,Value> mmfi, List<Range> ranges,
@@ -515,7 +583,8 @@ private void handleTabletClosedDuringScan(List<KVEntry> results, Tablet.LookupRe
 
   private void addUnfinishedRange(Tablet.LookupResult lookupResult, Range range, Key key) {
     if (range.getEndKey() == null || key.compareTo(range.getEndKey()) < 0) {
-      Range nlur = new Range(new Key(key), false, range.getEndKey(), range.isEndKeyInclusive());
+      Key copy = copyResumeKey(key);
+      Range nlur = new Range(copy, false, range.getEndKey(), range.isEndKeyInclusive());
       lookupResult.unfinishedRanges.add(nlur);
     }
   }
@@ -526,4 +595,30 @@ public synchronized void updateQueryStats(int size, long numBytes) {
     this.queryResultBytes.addAndGet(numBytes);
     this.server.getScanMetrics().incrementQueryResultBytes(numBytes);
   }
+
+  /**
+   * Copies a key used as a scan resume position, keeping a MemKey source as a MemKey copy.
+   */
+  private Key copyResumeKey(Key key) {
+    MemKey memSource = key instanceof MemKey ? (MemKey) key : null;
+    return memSource == null ? new Key(key) : new MemKey(memSource, memSource.getKVCount());
+  }
+
+  /**
+   * Advances {@code iter} past at most {@code duplicatesToSkip} leading entries whose key equals
+   * the start key of {@code range}. Nothing is skipped when the count is not positive, or when
+   * the start key is exclusive or absent.
+   */
+  private void skipReturnedDuplicates(SortedKeyValueIterator<Key,Value> iter, int duplicatesToSkip,
+      Range range) throws IOException {
+    if (duplicatesToSkip <= 0 || !range.isStartKeyInclusive() || range.getStartKey() == null) {
+      return;
+    }
+    Key resumeKey = range.getStartKey();
+    int remaining = duplicatesToSkip;
+    while (remaining > 0 && iter.hasTop() && iter.getTopKey().equals(resumeKey)) {
+      iter.next();
+      remaining--;
+    }
+  }
 }