Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 223 additions & 1 deletion parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,12 @@ use crate::geospatial::statistics::GeospatialStatistics;
use crate::schema::types::ColumnDescPtr;
use crate::util::bit_util::num_required_bits;
use crate::util::interner::{Interner, Storage};
use arrow_array::types::ByteArrayType;
use arrow_array::{
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray,
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
GenericByteArray, LargeBinaryArray, LargeStringArray, StringArray, StringViewArray,
};
use arrow_buffer::{ArrowNativeType, Buffer};
use arrow_schema::DataType;

macro_rules! downcast_dict_impl {
Expand Down Expand Up @@ -481,6 +483,90 @@ impl ColumnValueEncoder for ByteArrayEncoder {
Ok(())
}

fn count_values_within_byte_budget_gather(
values: &Self::Values,
indices: &[usize],
byte_budget: usize,
) -> Option<usize> {
// `ByteArrayEncoder` only ever writes via `write_gather`, so this
// is the relevant method.
//
// Two-stage walk for the simple offset-buffer byte array types:
// 1. If indices are contiguous, compute the total payload in
// O(1) via a single subtraction on the offsets buffer.
// When the total fits the budget — the overwhelmingly
// common "small values" case — return immediately.
// 2. Otherwise, walk per-value byte sizes from the offsets
// buffer (still cheap, no slice/UTF-8 construction) and
// exit at the first value that pushes the cumulative sum
// past the budget. This bounds skewed distributions: an
// outlier value is caught wherever it lands in the chunk.
let count = match values.data_type() {
DataType::Utf8 => count_within_budget_offsets(
values.as_any().downcast_ref::<StringArray>().unwrap(),
indices,
byte_budget,
),
DataType::LargeUtf8 => count_within_budget_offsets(
values.as_any().downcast_ref::<LargeStringArray>().unwrap(),
indices,
byte_budget,
),
DataType::Binary => count_within_budget_offsets(
values.as_any().downcast_ref::<BinaryArray>().unwrap(),
indices,
byte_budget,
),
DataType::LargeBinary => count_within_budget_offsets(
values.as_any().downcast_ref::<LargeBinaryArray>().unwrap(),
indices,
byte_budget,
),
// View arrays carry each value's length in the low 32 bits of
// its u128 view word, so lengths are scannable without touching
// any data buffer — and the common small-value case skips even
// that scan via an O(1) conservative bound.
DataType::Utf8View => {
let array = values.as_any().downcast_ref::<StringViewArray>().unwrap();
count_within_budget_views(
array.views(),
indices,
byte_budget,
max_view_value_len(array.data_buffers()),
)
}
DataType::BinaryView => {
let array = values.as_any().downcast_ref::<BinaryViewArray>().unwrap();
count_within_budget_views(
array.views(),
indices,
byte_budget,
max_view_value_len(array.data_buffers()),
)
}
// For arrow Dictionary input, treat every chunk as fitting
// and stay on the batched path. The arrow array being
// Dictionary-encoded in the first place implies its values
// are small enough that dedup is worthwhile, which is the
// opposite of the "5 MiB blob per row" case this fix
// targets. Doing a per-value walk through dict keys (each
// value lookup is keys[i] → values[key] → slice) on every
// chunk costs ~+30-80% vs `main` after writer-dict spill,
// and there is essentially nothing to bound.
DataType::Dictionary(_, _) => indices.len(),
// FixedSizeBinary falls through to the per-value walk via
// `ArrayAccessor::value`.
_ => downcast_op!(
values.data_type(),
values,
count_within_budget_accessor,
indices,
byte_budget
),
};
Some(count)
}

fn num_values(&self) -> usize {
match &self.dict_encoder {
Some(encoder) => encoder.indices.len(),
Expand Down Expand Up @@ -593,6 +679,142 @@ where
}
}

/// Cumulative-scan fallback used for byte array types that don't expose
/// a single contiguous offsets buffer — view arrays, dictionary arrays,
/// fixed-size binary. Returns `indices.len()` if every value fits the
/// budget, otherwise the smallest `k ≥ 1` whose first `k` values' encoded
/// size first exceeds `byte_budget` — i.e. the boundary value is included
/// so the caller's page-flush check trips immediately on this mini-batch,
/// without leaving a sliver to glue onto the next page.
///
/// Free function so it can be used with `downcast_op!`.
fn count_within_budget_accessor<T>(values: T, indices: &[usize], byte_budget: usize) -> usize
where
T: ArrayAccessor + Copy,
T::Item: AsRef<[u8]>,
{
let mut cum: usize = 0;
for (i, idx) in indices.iter().enumerate() {
let value_len = values.value(*idx).as_ref().len() + std::mem::size_of::<u32>();
cum = cum.saturating_add(value_len);
if cum > byte_budget {
return i + 1;
}
}
indices.len()
}

/// Upper bound on any single value's byte length in a view array.
///
/// An inline view stores at most 12 bytes; an
/// out-of-line view's data is a contiguous slice of exactly one data
/// buffer, so it cannot be longer than the largest data buffer. This is a
/// loose bound (a value is usually far smaller than a whole buffer) but it
/// is O(number of buffers) and always sound.
fn max_view_value_len(buffers: &[Buffer]) -> usize {
/// Bytes that fit inline in a u128 view word (the rest is len + prefix).
const MAX_INLINE_VIEW_LEN: usize = 12;
buffers
.iter()
.map(|b| b.len())
.max()
.unwrap_or(0)
.max(MAX_INLINE_VIEW_LEN)
}

/// Two-stage budget count for view arrays (`Utf8View`, `BinaryView`).
///
/// 1. View arrays have no prefix-sum offsets buffer, so the exact O(1)
/// span subtraction `count_within_budget_offsets` uses is unavailable.
/// But a *conservative* O(1) bound is: every value is at most
/// `max_value_len` bytes, so the whole chunk fits the budget when
/// `n * (max_value_len + 4) <= byte_budget`. This skips the per-value
/// walk for the common small-value case — what view arrays are built
/// for, and exactly the case where there is nothing to bound.
/// 2. Otherwise scan per-value lengths from the low 32 bits of each u128
/// view word (no data-buffer dereference) and stop at the first value
/// that pushes the cumulative sum past the budget.
fn count_within_budget_views(
views: &[u128],
indices: &[usize],
byte_budget: usize,
max_value_len: usize,
) -> usize {
// Stage 1: O(1) conservative upper bound.
let per_value = max_value_len + std::mem::size_of::<u32>();
if indices.len().saturating_mul(per_value) <= byte_budget {
return indices.len();
}
// Stage 2: exact per-value scan.
let mut cum: usize = 0;
for (i, idx) in indices.iter().enumerate() {
let len = (views[*idx] as u32) as usize;
cum = cum.saturating_add(len + std::mem::size_of::<u32>());
if cum > byte_budget {
return i + 1;
}
}
indices.len()
}

/// Two-stage fast path for `GenericByteArray<O>`
/// (Utf8/LargeUtf8/Binary/LargeBinary).
///
/// `indices` are assumed sorted ascending — they always are here, since
/// they come from `non_null_indices`, which is built in array order.
///
/// 1. The span `offsets[last+1] - offsets[first]` is an O(1) upper
/// bound on the chunk's payload: it covers every array position in
/// `[first, last]`, a superset of `indices`. For a non-null chunk
/// `indices` *is* that whole range; for a chunk drawn from a
/// nullable column the skipped positions are nulls, whose offset
/// delta is zero, so the span still equals the exact payload.
/// Either way, if the upper bound fits the budget every value
/// fits — return `indices.len()` with no per-value work. This
/// covers the overwhelmingly common "small values" case for both
/// non-null *and* nullable columns.
/// 2. Otherwise the chunk is genuinely near the budget: walk per-index
/// lengths from the offsets buffer directly (no slice/UTF-8
/// construction) and stop at the first value that pushes the
/// cumulative sum past the budget.
fn count_within_budget_offsets<T: ByteArrayType>(
values: &GenericByteArray<T>,
indices: &[usize],
byte_budget: usize,
) -> usize {
if indices.is_empty() {
return 0;
}
let n = indices.len();
let first = indices[0];
let last = indices[n - 1];
let offsets = values.value_offsets();
let prefix_overhead = std::mem::size_of::<u32>();

// Stage 1: O(1) span upper bound. Skips Stage 2 in the common case —
// including nullable columns, whose `indices` are sparse. The earlier
// `last - first + 1 == n` contiguity gate forced every nullable chunk
// onto the O(n) Stage 2 walk even though the span check is valid for
// any sorted index set.
if last >= first {
let payload = (offsets[last + 1] - offsets[first]).as_usize();
if payload + n * prefix_overhead <= byte_budget {
return n;
}
}

// Stage 2: scan per-index lengths from the offsets buffer.
let mut cum: usize = 0;
for (i, idx) in indices.iter().enumerate() {
let len = (offsets[idx + 1] - offsets[*idx]).as_usize() + prefix_overhead;
cum = cum.saturating_add(len);
if cum > byte_budget {
return i + 1;
}
}
n
}

/// Computes the min and max for the provided array and indices
///
/// This is a free function so it can be used with `downcast_op!`
Expand Down
Loading
Loading