Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -169,5 +169,9 @@ harness = false
name = "take_fsl"
harness = false

[[bench]]
name = "listview_rebuild"
harness = false

[package.metadata.cargo-machete]
ignored = ["getrandom_v03"]
73 changes: 73 additions & 0 deletions vortex-array/benches/listview_rebuild.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Benchmarks for ListView rebuild with primitive and VarBinView elements.

#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

use divan::Bencher;
use vortex_array::IntoArray;
use vortex_array::arrays::ListViewArray;
use vortex_array::arrays::ListViewRebuildMode;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::VarBinViewArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;

fn main() {
divan::main();
}

const NUM_LISTS: &[usize] = &[100, 1_000, 10_000];

fn make_primitive_listview(num_lists: usize) -> ListViewArray {
let element_count = num_lists * 4;
let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array();
let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * 2) as u32).collect();
let sizes: Buffer<u32> = std::iter::repeat_n(4u32, num_lists).collect();
ListViewArray::new(
elements,
offsets.into_array(),
sizes.into_array(),
Validity::NonNullable,
)
}

fn make_varbinview_listview(num_lists: usize) -> ListViewArray {
let element_count = num_lists * 4;
let strings: Vec<String> = (0..element_count)
.map(|i| {
if i % 3 == 0 {
format!("long-string-value-{i:06}")
} else {
format!("s{i}")
}
})
.collect();
let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array();
let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * 2) as u32).collect();
let sizes: Buffer<u32> = std::iter::repeat_n(4u32, num_lists).collect();
ListViewArray::new(
elements,
offsets.into_array(),
sizes.into_array(),
Validity::NonNullable,
)
}

#[divan::bench(args = NUM_LISTS)]
fn rebuild_primitive(bencher: Bencher, num_lists: usize) {
let listview = make_primitive_listview(num_lists);
bencher
.with_inputs(|| &listview)
.bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
}

#[divan::bench(args = NUM_LISTS)]
fn rebuild_varbinview(bencher: Bencher, num_lists: usize) {
let listview = make_varbinview_listview(num_lists);
bencher
.with_inputs(|| &listview)
.bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
}
32 changes: 11 additions & 21 deletions vortex-array/src/arrays/listview/rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use crate::Array;
use crate::IntoArray;
use crate::ToCanonical;
use crate::arrays::ListViewArray;
use crate::builders::builder_with_capacity;
use crate::compute;
use crate::vtable::ValidityHelper;

Expand Down Expand Up @@ -103,18 +102,14 @@ impl ListViewArray {
})
}

// TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements
// instead of using a builder.
/// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece
/// by piece.
///
/// Collects all element indices into a flat `BufferMut<u64>` and performs a single bulk
/// `take` to produce the new elements array.
fn naive_rebuild<O: IntegerPType, NewOffset: IntegerPType, S: IntegerPType>(
&self,
) -> VortexResult<ListViewArray> {
let element_dtype = self
.dtype()
.as_list_element_opt()
.vortex_expect("somehow had a canonical list that was not a list");

let offsets_canonical = self.offsets().to_primitive();
let offsets_slice = offsets_canonical.as_slice::<O>();
let sizes_canonical = self.sizes().to_primitive();
Expand All @@ -129,17 +124,8 @@ impl ListViewArray {
// null lists to 0.
let mut new_sizes = BufferMut::<S>::with_capacity(len);

// Canonicalize the elements up front as we will be slicing the elements quite a lot.
let elements_canonical = self
.elements()
.to_canonical()
.vortex_expect("canonicalize elements for rebuild")
.into_array();

// Note that we do not know what the exact capacity should be of the new elements since
// there could be overlaps in the existing `ListViewArray`.
let mut new_elements_builder =
builder_with_capacity(element_dtype.as_ref(), self.elements().len());
// Collect all element indices for a single bulk take.
let mut take_indices = BufferMut::<u64>::with_capacity(self.elements().len());

let mut n_elements = NewOffset::zero();
for index in 0..len {
Expand All @@ -159,14 +145,18 @@ impl ListViewArray {

new_offsets.push(n_elements);
new_sizes.push(size);
new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?);
take_indices.extend(start as u64..stop as u64);

n_elements += num_traits::cast(size).vortex_expect("Cast failed");
}

let offsets = new_offsets.into_array();
let sizes = new_sizes.into_array();
let elements = new_elements_builder.finish();
let elements = self
.elements()
.take(take_indices.into_array())?
.to_canonical()?
.into_array();
Comment on lines +155 to +159
Copy link
Contributor

@connortsui20 connortsui20 Feb 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case that we read ListViewArray from disk and we want to export to datafusion (basically if we want to export as a ListArray to Arrow), I believe that not executing here might be more beneficial.

If we want to push down a filter/take into a ListView from Datafusion/arrow, then we definitely do not want to execute here since it would be better to combine the filter/take with this take here.

I also think that this is the more common (perhaps only?) path that this naive_rebuild exists in.

cc @joseph-isaacs

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, not sure what the caller expects to get here. I think it makes sense to keep it lazy and leave canonicalization up to the caller, wdyt @joseph-isaacs ?


debug_assert_eq!(
n_elements.as_(),
Expand Down
Loading