apache · adriangb · May 25, 2026 · May 25, 2026 · May 25, 2026 · May 25, 2026
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -30,10 +30,12 @@ use crate::geospatial::statistics::GeospatialStatistics;
 use crate::schema::types::ColumnDescPtr;
 use crate::util::bit_util::num_required_bits;
 use crate::util::interner::{Interner, Storage};
+use arrow_array::types::ByteArrayType;
 use arrow_array::{
     Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
-    LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
+    GenericByteArray, LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
 };
+use arrow_buffer::{ArrowNativeType, Buffer};
 use arrow_schema::DataType;
 
 macro_rules! downcast_dict_impl {
@@ -481,6 +483,90 @@ impl ColumnValueEncoder for ByteArrayEncoder {
         Ok(())
     }
 
+    fn count_values_within_byte_budget_gather(
+        values: &Self::Values,
+        indices: &[usize],
+        byte_budget: usize,
+    ) -> Option<usize> {
+        // `ByteArrayEncoder` only ever writes via `write_gather`, so this
+        // is the relevant method.
+        //
+        // Two-stage walk for the simple offset-buffer byte array types:
+        //   1. If indices are contiguous, compute the total payload in
+        //      O(1) via a single subtraction on the offsets buffer.
+        //      When the total fits the budget — the overwhelmingly
+        //      common "small values" case — return immediately.
+        //   2. Otherwise, walk per-value byte sizes from the offsets
+        //      buffer (still cheap, no slice/UTF-8 construction) and
+        //      exit at the first value that pushes the cumulative sum
+        //      past the budget. This bounds skewed distributions: an
+        //      outlier value is caught wherever it lands in the chunk.
+        let count = match values.data_type() {
+            DataType::Utf8 => count_within_budget_offsets(
+                values.as_any().downcast_ref::<StringArray>().unwrap(),
+                indices,
+                byte_budget,
+            ),
+            DataType::LargeUtf8 => count_within_budget_offsets(
+                values.as_any().downcast_ref::<LargeStringArray>().unwrap(),
+                indices,
+                byte_budget,
+            ),
+            DataType::Binary => count_within_budget_offsets(
+                values.as_any().downcast_ref::<BinaryArray>().unwrap(),
+                indices,
+                byte_budget,
+            ),
+            DataType::LargeBinary => count_within_budget_offsets(
+                values.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
+                indices,
+                byte_budget,
+            ),
+            // View arrays carry each value's length in the low 32 bits of
+            // its u128 view word, so lengths are scannable without touching
+            // any data buffer — and the common small-value case skips even
+            // that scan via an O(1) conservative bound.
+            DataType::Utf8View => {
+                let array = values.as_any().downcast_ref::<StringViewArray>().unwrap();
+                count_within_budget_views(
+                    array.views(),
+                    indices,
+                    byte_budget,
+                    max_view_value_len(array.data_buffers()),
+                )
+            }
+            DataType::BinaryView => {
+                let array = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
+                count_within_budget_views(
+                    array.views(),
+                    indices,
+                    byte_budget,
+                    max_view_value_len(array.data_buffers()),
+                )
+            }
+            // For arrow Dictionary input, treat every chunk as fitting
+            // and stay on the batched path. The arrow array being
+            // Dictionary-encoded in the first place implies its values
+            // are small enough that dedup is worthwhile, which is the
+            // opposite of the "5 MiB blob per row" case this fix
+            // targets. Doing a per-value walk through dict keys (each
+            // value lookup is keys[i] → values[key] → slice) on every
+            // chunk costs ~+30-80% vs `main` after writer-dict spill,
+            // and there is essentially nothing to bound.
+            DataType::Dictionary(_, _) => indices.len(),
+            // FixedSizeBinary falls through to the per-value walk via
+            // `ArrayAccessor::value`.
+            _ => downcast_op!(
+                values.data_type(),
+                values,
+                count_within_budget_accessor,
+                indices,
+                byte_budget
+            ),
+        };
+        Some(count)
+    }
+
     fn num_values(&self) -> usize {
         match &self.dict_encoder {
             Some(encoder) => encoder.indices.len(),
@@ -593,6 +679,142 @@ where
     }
 }
 
+/// Cumulative-scan fallback used for byte array types that don't expose
+/// a single contiguous offsets buffer — view arrays, dictionary arrays,
+/// fixed-size binary. Returns `indices.len()` if every value fits the
+/// budget, otherwise the smallest `k ≥ 1` whose first `k` values' encoded
+/// size first exceeds `byte_budget` — i.e. the boundary value is included
+/// so the caller's page-flush check trips immediately on this mini-batch,
+/// without leaving a sliver to glue onto the next page.
+///
+/// Free function so it can be used with `downcast_op!`.
+fn count_within_budget_accessor<T>(values: T, indices: &[usize], byte_budget: usize) -> usize
+where
+    T: ArrayAccessor + Copy,
+    T::Item: AsRef<[u8]>,
+{
+    let mut cum: usize = 0;
+    for (i, idx) in indices.iter().enumerate() {
+        let value_len = values.value(*idx).as_ref().len() + std::mem::size_of::<u32>();
+        cum = cum.saturating_add(value_len);
+        if cum > byte_budget {
+            return i + 1;
+        }
+    }
+    indices.len()
+}
+
+/// Upper bound on any single value's byte length in a view array.
+///
+/// An inline view stores at most 12 bytes; an
+/// out-of-line view's data is a contiguous slice of exactly one data
+/// buffer, so it cannot be longer than the largest data buffer. This is a
+/// loose bound (a value is usually far smaller than a whole buffer) but it
+/// is O(number of buffers) and always sound.
+fn max_view_value_len(buffers: &[Buffer]) -> usize {
+    /// Bytes that fit inline in a u128 view word (the rest is len + prefix).
+    const MAX_INLINE_VIEW_LEN: usize = 12;
+    buffers
+        .iter()
+        .map(|b| b.len())
+        .max()
+        .unwrap_or(0)
+        .max(MAX_INLINE_VIEW_LEN)
+}
+
+/// Two-stage budget count for view arrays (`Utf8View`, `BinaryView`).
+///
+///   1. View arrays have no prefix-sum offsets buffer, so the exact O(1)
+///      span subtraction `count_within_budget_offsets` uses is unavailable.
+///      But a *conservative* O(1) bound is: every value is at most
+///      `max_value_len` bytes, so the whole chunk fits the budget when
+///      `n * (max_value_len + 4) <= byte_budget`. This skips the per-value
+///      walk for the common small-value case — what view arrays are built
+///      for, and exactly the case where there is nothing to bound.
+///   2. Otherwise scan per-value lengths from the low 32 bits of each u128
+///      view word (no data-buffer dereference) and stop at the first value
+///      that pushes the cumulative sum past the budget.
+fn count_within_budget_views(
+    views: &[u128],
+    indices: &[usize],
+    byte_budget: usize,
+    max_value_len: usize,
+) -> usize {
+    // Stage 1: O(1) conservative upper bound.
+    let per_value = max_value_len + std::mem::size_of::<u32>();
+    if indices.len().saturating_mul(per_value) <= byte_budget {
+        return indices.len();
+    }
+    // Stage 2: exact per-value scan.
+    let mut cum: usize = 0;
+    for (i, idx) in indices.iter().enumerate() {
+        let len = (views[*idx] as u32) as usize;
+        cum = cum.saturating_add(len + std::mem::size_of::<u32>());
+        if cum > byte_budget {
+            return i + 1;
+        }
+    }
+    indices.len()
+}
+
+/// Two-stage fast path for `GenericByteArray<O>`
+/// (Utf8/LargeUtf8/Binary/LargeBinary).
+///
+/// `indices` are assumed sorted ascending — they always are here, since
+/// they come from `non_null_indices`, which is built in array order.
+///
+///   1. The span `offsets[last+1] - offsets[first]` is an O(1) upper
+///      bound on the chunk's payload: it covers every array position in
+///      `[first, last]`, a superset of `indices`. For a non-null chunk
+///      `indices` *is* that whole range; for a chunk drawn from a
+///      nullable column the skipped positions are nulls, whose offset
+///      delta is zero, so the span still equals the exact payload.
+///      Either way, if the upper bound fits the budget every value
+///      fits — return `indices.len()` with no per-value work. This
+///      covers the overwhelmingly common "small values" case for both
+///      non-null *and* nullable columns.
+///   2. Otherwise the chunk is genuinely near the budget: walk per-index
+///      lengths from the offsets buffer directly (no slice/UTF-8
+///      construction) and stop at the first value that pushes the
+///      cumulative sum past the budget.
+fn count_within_budget_offsets<T: ByteArrayType>(
+    values: &GenericByteArray<T>,
+    indices: &[usize],
+    byte_budget: usize,
+) -> usize {
+    if indices.is_empty() {
+        return 0;
+    }
+    let n = indices.len();
+    let first = indices[0];
+    let last = indices[n - 1];
+    let offsets = values.value_offsets();
+    let prefix_overhead = std::mem::size_of::<u32>();
+
+    // Stage 1: O(1) span upper bound. Skips Stage 2 in the common case —
+    // including nullable columns, whose `indices` are sparse. The earlier
+    // `last - first + 1 == n` contiguity gate forced every nullable chunk
+    // onto the O(n) Stage 2 walk even though the span check is valid for
+    // any sorted index set.
+    if last >= first {
+        let payload = (offsets[last + 1] - offsets[first]).as_usize();
+        if payload + n * prefix_overhead <= byte_budget {
+            return n;
+        }
+    }
+
+    // Stage 2: scan per-index lengths from the offsets buffer.
+    let mut cum: usize = 0;
+    for (i, idx) in indices.iter().enumerate() {
+        let len = (offsets[idx + 1] - offsets[*idx]).as_usize() + prefix_overhead;
+        cum = cum.saturating_add(len);
+        if cum > byte_budget {
+            return i + 1;
+        }
+    }
+    n
+}
+
 /// Computes the min and max for the provided array and indices
 ///
 /// This is a free function so it can be used with `downcast_op!`