Skip to content
Open
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 43 additions & 10 deletions vortex-array/benches/cast_primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,16 @@ fn main() {
divan::main();
}

const N: usize = 100_000;
// Sizes used for the cast benches below. Kept small enough to fit in L2 so
// the kernel cost shows up clearly rather than being hidden by DRAM bandwidth.
const SIZES: &[usize] = &[65_536];

#[divan::bench]
fn cast_u16_to_u32(bencher: Bencher) {
#[divan::bench(args = SIZES)]
fn cast_u16_to_u32(bencher: Bencher, n: usize) {
let mut rng = StdRng::seed_from_u64(42);
#[expect(clippy::cast_possible_truncation)]
let arr = PrimitiveArray::from_option_iter((0..N).map(|i| {
if rng.random_bool(0.5) {
None
} else {
Some(i as u16)
}
let arr = PrimitiveArray::from_option_iter((0..n).map(|i| {
#[expect(clippy::cast_possible_truncation)]
rng.random_bool(0.5).then(|| i as u16)
}))
.into_array();
// Pre-compute min/max so values_fit_in is a cache hit during the benchmark.
Expand All @@ -46,3 +44,38 @@ fn cast_u16_to_u32(bencher: Bencher) {
.execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
});
}

/// Benchmarks the narrowing cast `u32` → `u8` on a nullable array of `n` elements.
///
/// About 70% of the positions are valid, and every valid value is drawn from
/// `0..u8::MAX`, so the cast never overflows. No min/max statistics are pre-computed
/// here, so the cast is expected to take the fallible, masked per-lane path; the
/// measurement therefore reflects the overhead of the checked conversion itself.
#[divan::bench(args = SIZES)]
fn cast_u32_to_u8(bencher: Bencher, n: usize) {
    // Fixed seed keeps the generated input identical across runs.
    let mut rng = StdRng::seed_from_u64(42);
    let values = (0..n).map(|_| {
        if rng.random_bool(0.7) {
            Some(rng.random_range(0..u8::MAX) as u32)
        } else {
            None
        }
    });
    let input = PrimitiveArray::from_option_iter(values).into_array();

    bencher.with_inputs(|| input.clone()).bench_refs(|array| {
        #[expect(clippy::unwrap_used)]
        let narrowed = array
            .cast(DType::Primitive(PType::U8, Nullability::Nullable))
            .unwrap();
        narrowed.execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
    });
}

/// Benchmarks the sign-changing cast `i32` → `u32` on a nullable array of `n` elements.
///
/// About 70% of the positions are valid and every valid value is drawn from
/// `0..i32::MAX`, i.e. non-negative, so the cast succeeds while still paying for the
/// per-lane range check.
///
/// NOTE(review): the bench input is a clone of `input`, which stays alive for the whole
/// run; presumably the clone shares the underlying buffer, in which case the same-width
/// in-place path of the cast would never be exercised here — confirm.
#[divan::bench(args = SIZES)]
fn cast_i32_to_u32(bencher: Bencher, n: usize) {
    // Fixed seed keeps the generated input identical across runs.
    let mut rng = StdRng::seed_from_u64(42);
    let values = (0..n).map(|_| {
        if rng.random_bool(0.7) {
            Some(rng.random_range(0..i32::MAX))
        } else {
            None
        }
    });
    let input = PrimitiveArray::from_option_iter(values).into_array();

    bencher.with_inputs(|| input.clone()).bench_refs(|array| {
        #[expect(clippy::unwrap_used)]
        let unsigned = array
            .cast(DType::Primitive(PType::U32, Nullability::Nullable))
            .unwrap();
        unsigned.execute::<Canonical>(&mut LEGACY_SESSION.create_execution_ctx())
    });
}
142 changes: 87 additions & 55 deletions vortex-array/src/arrays/primitive/compute/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ use num_traits::AsPrimitive;
use num_traits::NumCast;
use vortex_buffer::Buffer;
use vortex_buffer::BufferMut;
use vortex_buffer::lane_kernels::IndexedSinkExt;
use vortex_buffer::lane_kernels::IndexedSourceExt;
use vortex_buffer::lane_kernels::ReinterpretSink;
use vortex_error::VortexResult;
use vortex_error::vortex_bail;
use vortex_error::vortex_err;
Expand Down Expand Up @@ -102,9 +105,7 @@ impl CastKernel for Primitive {
}
}

/// Cast values from `F` to `T`. For infallible casts this is a pure pass; for fallible casts
/// each valid value goes through a checked `NumCast::from` and the kernel bails if any of them
/// overflow `T`. Invalid positions use the wrapping `as` cast since their values are masked out.
/// Cast Primitive values from `F` to `T`.
fn cast_values<F, T>(
array: ArrayView<'_, Primitive>,
new_validity: Validity,
Expand All @@ -114,53 +115,101 @@ where
F: NativePType + AsPrimitive<T>,
T: NativePType,
{
let values = array.as_slice::<F>();

// Fast path: statically infallible, or cached min/max prove every valid value fits in `T`.
// The cached check never triggers a stats computation — if the bounds aren't already known
// we fall through to the per-lane loop below.
if values_always_fit(F::PTYPE, T::PTYPE) || values_fit_in(array, T::PTYPE, ctx, false) {
return Ok(PrimitiveArray::new(cast::<F, T>(values), new_validity).into_array());
}

// TODO(joe): if the values source and target have the same bit-width we can
// mutate in place.

// Fallible: invalid lanes are pre-multiplied to zero so the checked cast always succeeds for
// them; valid lanes go through `NumCast::from` and the whole cast bails on the first overflow.
let mask = array.validity()?.execute_mask(array.len(), ctx)?;
let overflow = || {
vortex_err!(
Compute: "Cannot cast {} to {} — value exceeds target range",
F::PTYPE, T::PTYPE,
)
};
let buffer: Buffer<T> = match &mask {
Mask::AllTrue(_) => BufferMut::try_from_trusted_len_iter(

// Returns `true` if every value of `from` is representable in `to` without loss.
fn casts_losslessly_to(from: PType, to: PType) -> bool {
from.least_supertype(to) == Some(to)
}

// Skip the fallible kernel when type widening or (cached) min/max prove every value fits.
let target_dtype = DType::Primitive(T::PTYPE, Nullability::NonNullable);
let infallible = casts_losslessly_to(F::PTYPE, T::PTYPE)
|| cached_values_fit_in(array, &target_dtype) == Some(true);

let len = array.len();

// If F and T have the same byte width, try to take unique ownership of the buffer.
let same_bit_width = F::PTYPE.byte_width() == T::PTYPE.byte_width();
let owned: Option<BufferMut<F>> = if same_bit_width {
array.into_owned().try_into_buffer_mut::<F>().ok()
} else {
None
};
let values: &[F] = array.as_slice::<F>();

if infallible {
return match owned {
Some(mut buf) => {
ReinterpretSink::<F, T>::new(buf.as_mut_slice()).map_into_in_place(|v: F| v.as_());
// SAFETY: same size + alignment for NativePType
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
Ok(PrimitiveArray::new(result.freeze(), new_validity).into_array())
}
None => {
let mut buffer = BufferMut::<T>::with_capacity(len);
values.map_into(&mut buffer.spare_capacity_mut()[..len], |v| v.as_());
// SAFETY: map_into initializes every lane.
unsafe { buffer.set_len(len) };
Ok(PrimitiveArray::new(buffer.freeze(), new_validity).into_array())
}
};
}

let mask = array.validity()?.execute_mask(len, ctx)?;

let buffer: Buffer<T> = match (&mask, owned) {
(Mask::AllTrue(_), Some(mut buf)) => {
ReinterpretSink::<F, T>::new(buf.as_mut_slice())
.try_map_in_place(|v: F| <T as NumCast>::from(v))
.map_err(|_| overflow())?;
// SAFETY: same size + alignment for NativePType
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
result.freeze()
}
(Mask::AllTrue(_), None) => {
let mut buffer = BufferMut::<T>::with_capacity(len);
values
.iter()
.map(|&v| <T as NumCast>::from(v).ok_or_else(overflow)),
)?
.freeze(),
Mask::AllFalse(_) => BufferMut::<T>::zeroed(values.len()).freeze(),
Mask::Values(m) => BufferMut::try_from_trusted_len_iter(
values.iter().zip(m.bit_buffer().iter()).map(|(&v, valid)| {
let factor = if valid { F::one() } else { F::zero() };
<T as NumCast>::from(v * factor).ok_or_else(overflow)
}),
)?
.freeze(),
.try_map_into(&mut buffer.spare_capacity_mut()[..len], |v| {
<T as NumCast>::from(v)
})
.map_err(|_| overflow())?;
// SAFETY: initialized every lane.
unsafe { buffer.set_len(len) };
buffer.freeze()
}
(Mask::AllFalse(_), _) => BufferMut::<T>::zeroed(len).freeze(),
(Mask::Values(m), Some(mut buf)) => {
ReinterpretSink::<F, T>::new(buf.as_mut_slice())
.try_map_masked_in_place(m.bit_buffer(), |v: F| <T as NumCast>::from(v))
.map_err(|_| overflow())?;
// SAFETY: same size + alignment for NativePType
let result: BufferMut<T> = unsafe { buf.transmute::<T>() };
result.freeze()
}
(Mask::Values(m), None) => {
let mut buffer = BufferMut::<T>::with_capacity(len);
values
.try_map_masked_into(
m.bit_buffer(),
&mut buffer.spare_capacity_mut()[..len],
|v| <T as NumCast>::from(v),
)
.map_err(|_| overflow())?;
// SAFETY: initialized every lane.
unsafe { buffer.set_len(len) };
buffer.freeze()
}
};

Ok(PrimitiveArray::new(buffer, new_validity).into_array())
}

/// Out-of-range values at invalid positions are truncated/wrapped by `as`, which is fine because
/// they are masked out by validity.
fn cast<F: NativePType + AsPrimitive<T>, T: NativePType>(array: &[F]) -> Buffer<T> {
BufferMut::from_trusted_len_iter(array.iter().map(|&src| src.as_())).freeze()
}

fn reinterpret(
array: ArrayView<'_, Primitive>,
new_ptype: PType,
Expand All @@ -178,23 +227,6 @@ fn reinterpret(
.into_array()
}

/// Reports whether a cast from `src` to `target` can never produce an out-of-range
/// result, judged from the two types alone.
///
/// Precision may still be lost (e.g. large integers cast to `f32`). The rules are:
/// - identical types always fit;
/// - integer → integer fits only when the target is strictly wider and the source is
///   unsigned or the target is signed;
/// - float → float fits only when the target is strictly wider;
/// - integer → float fits only for `F32` and `F64` targets;
/// - everything else does not.
fn values_always_fit(src: PType, target: PType) -> bool {
    if src == target {
        return true;
    }

    let widens = target.byte_width() > src.byte_width();

    if src.is_int() {
        if target.is_int() {
            // A wider target still cannot hold negative values unless it is signed.
            return widens && (src.is_unsigned_int() || target.is_signed_int());
        }
        return matches!(target, PType::F32 | PType::F64);
    }

    src.is_float() && target.is_float() && widens
}

/// Returns `true` if all valid values in `array` are representable as `target_ptype`.
///
/// Cached min/max statistics are consulted first. If either bound is missing, the function either
Expand Down
5 changes: 5 additions & 0 deletions vortex-buffer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ workspace = true
[dev-dependencies]
divan = { workspace = true }
num-traits = { workspace = true }
rand = { workspace = true }
rstest = { workspace = true }

[[bench]]
Expand All @@ -48,3 +49,7 @@ harness = false
[[bench]]
name = "vortex_bitbuffer"
harness = false

[[bench]]
name = "lane_kernels"
harness = false
Loading
Loading