736 changes: 436 additions & 300 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -26,6 +26,7 @@ members = [
"vortex-duckdb",
"vortex-cuda",
"vortex-cuda/cub",
"vortex-cuda/gpu-scan-cli",
"vortex-cuda/macros",
"vortex-cuda/nvcomp",
"vortex-cxx",
2 changes: 1 addition & 1 deletion _typos.toml
@@ -1,5 +1,5 @@
[default]
extend-ignore-identifiers-re = ["FoR", "typ"]
extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"]
# We support a few common special comments to tell the checker to ignore sections of code
extend-ignore-re = [
"(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line
5 changes: 5 additions & 0 deletions vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -89,6 +89,11 @@ impl VTable for PrimitiveVTable {

let ptype = PType::try_from(dtype)?;

vortex_ensure!(
buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
"Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
);

if buffer.len() != ptype.byte_width() * len {
vortex_bail!(
"Buffer length {} does not match expected length {} for {}, {}",
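
The guard makes misalignment a construction-time error rather than a latent hazard when reinterpreting bytes as wider types. A minimal sketch of the condition it enforces (Buffer::slice and the 1-byte offset are illustrative assumptions, not part of this diff):

use vortex_buffer::{Alignment, Buffer};

// An 8-byte ptype such as u64 needs 8-byte alignment; a buffer whose
// start lands at an odd offset now fails construction with
// "Misaligned buffer cannot be used to build PrimitiveArray of u64".
let bytes: Buffer<u8> = Buffer::from(vec![0u8; 64]);
let misaligned = bytes.slice(1..33); // hypothetical: 1-byte offset breaks u64 alignment
assert!(!misaligned.is_aligned_to(Alignment::new(8)));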
2 changes: 2 additions & 0 deletions vortex-btrblocks/public-api.lock
@@ -222,6 +222,8 @@ impl vortex_btrblocks::BtrBlocksCompressorBuilder

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator<Item = vortex_btrblocks::FloatCode>) -> Self

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator<Item = vortex_btrblocks::IntCode>) -> Self
9 changes: 9 additions & 0 deletions vortex-btrblocks/src/builder.rs
@@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
}

impl BtrBlocksCompressorBuilder {
/// Create a new builder with no encodings enabled.
pub fn empty() -> Self {
Self {
int_schemes: Default::default(),
float_schemes: Default::default(),
string_schemes: Default::default(),
}
}

/// Excludes the specified integer compression schemes.
pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
let codes: HashSet<_> = codes.into_iter().collect();
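
A hedged sketch of where empty() fits relative to the existing subtractive API (Default enables every scheme and is pruned via the exclude_* methods; any methods for re-enabling schemes on an empty builder are outside this diff):

use vortex_btrblocks::BtrBlocksCompressorBuilder;

// Start from no encodings rather than pruning the full default set;
// useful as an uncompressed baseline in benchmarks or as a base for
// opt-in configuration.
let baseline = BtrBlocksCompressorBuilder::empty().build();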
7 changes: 6 additions & 1 deletion vortex-cuda/Cargo.toml
@@ -18,6 +18,7 @@ workspace = true

[features]
default = []
tracing = ["dep:tracing"]
_test-harness = []
unstable_encodings = ["vortex-zstd/unstable_encodings"]

@@ -31,7 +32,11 @@ fastlanes = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
paste = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true, features = ["fs"] }
tracing = { workspace = true, features = [
"std",
"attributes",
], optional = true }
vortex-alp = { workspace = true }
vortex-array = { workspace = true }
vortex-buffer = { workspace = true }
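
With the dependency now optional behind the "tracing" feature, instrumentation inside vortex-cuda is presumably compiled conditionally. A generic sketch of that pattern (the function and messages are illustrative, not code from this PR):

// Consumers opt in via: vortex-cuda = { workspace = true, features = ["tracing"] }

// The span attribute and the log line only compile when the feature is
// enabled; otherwise the function carries no tracing machinery at all.
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
fn launch_kernel(len: usize) {
    #[cfg(feature = "tracing")]
    tracing::debug!(len, "launching CUDA kernel");
    // ... launch logic ...
}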
78 changes: 14 additions & 64 deletions vortex-cuda/benches/bitpacked_cuda.rs
@@ -6,27 +6,24 @@
#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

mod common;

use std::mem::size_of;
use std::ops::Add;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::DeviceRepr;
use cudarc::driver::PushKernelArg;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity::NonNullable;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaDeviceBuffer;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda::bitpacked_cuda_kernel;
use vortex_cuda::bitpacked_cuda_launch_config;
use vortex_cuda::launch_cuda_kernel_with_config;
use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_dtype::NativePType;
@@ -35,6 +32,8 @@ use vortex_fastlanes::BitPackedArray;
use vortex_fastlanes::unpack_iter::BitPacked;
use vortex_session::VortexSession;

use crate::common::TimedLaunchStrategy;

const N_ROWS: usize = 100_000_000;

/// Create a bit-packed array with the given bit width
@@ -56,54 +55,6 @@ where
.vortex_expect("failed to create BitPacked array")
}

/// Launch the bit unpacking kernel and return elapsed GPU time
fn launch_bitunpack_kernel_timed_typed<T>(
bitpacked_array: &BitPackedArray,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration>
where
T: BitPacked + DeviceRepr,
T::Physical: DeviceRepr,
{
let packed = bitpacked_array.packed().clone();
let bit_width = bitpacked_array.bit_width();
let len = bitpacked_array.len();

// Move packed data to device if not already there
let device_input = if packed.is_on_device() {
packed
} else {
block_on(cuda_ctx.move_to_device(packed)?).vortex_expect("failed to move to device")
};

// Allocate output buffer
let output_slice = cuda_ctx
.device_alloc::<T>(len.next_multiple_of(1024))
.vortex_expect("failed to allocate output");
let output_buf = CudaDeviceBuffer::new(output_slice);

// Get device views
let input_view = device_input
.cuda_view::<T::Physical>()
.vortex_expect("failed to get input view");
let output_view = output_buf.as_view::<T>();

let output_width = size_of::<T>() * 8;
let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?;
let mut launch_builder = cuda_ctx.launch_builder(&cuda_function);

launch_builder.arg(&input_view);
launch_builder.arg(&output_view);

let config = bitpacked_cuda_launch_config(output_width, len)?;

// Launch kernel
let events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_BLOCKING_SYNC)?;

events.duration()
}

/// Generic benchmark function for a specific type and bit width
fn benchmark_bitunpack_typed<T>(c: &mut Criterion, bit_width: u8, type_name: &str)
where
@@ -123,19 +74,18 @@ where
&array,
|b, array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut total_time = Duration::ZERO;
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));

for _ in 0..iters {
let kernel_time =
launch_bitunpack_kernel_timed_typed::<T>(array, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
}

total_time
Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);
39 changes: 39 additions & 0 deletions vortex-cuda/benches/common/mod.rs
@@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;

use cudarc::driver::sys::CUevent_flags;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use vortex_cuda::CudaKernelEvents;
use vortex_cuda::LaunchStrategy;
use vortex_error::VortexResult;

#[derive(Debug, Default)]
pub struct TimedLaunchStrategy {
total_time_ns: Arc<AtomicU64>,
}

impl TimedLaunchStrategy {
pub fn get(&self) -> &Arc<AtomicU64> {
&self.total_time_ns
}
}

impl LaunchStrategy for TimedLaunchStrategy {
fn event_flags(&self) -> CUevent_flags {
// using blocking_sync to make sure all events flush before we complete.
CU_EVENT_BLOCKING_SYNC
}

fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> {
// NOTE: as long as the duration < 584 years this cast is safe.
let elapsed_nanos = events.duration()?.as_nanos() as u64;
self.total_time_ns
.fetch_add(elapsed_nanos, Ordering::Relaxed);

Ok(())
}
}
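
For reference, the two benchmarks in this PR wire the strategy in the same way (condensed from their diffs): install it on the execution context, run kernels through execute_cuda, then read back the accumulated GPU time.

// `array` stands in for the BitPackedArray / DateTimePartsArray under test.
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
    .vortex_expect("failed to create execution context")
    .with_launch_strategy(Arc::new(timed));

block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
let gpu_time = Duration::from_nanos(timer.load(Ordering::Relaxed));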
102 changes: 15 additions & 87 deletions vortex-cuda/benches/date_time_parts_cuda.rs
Original file line number Diff line number Diff line change
@@ -6,34 +6,36 @@
#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

mod common;

use std::mem::size_of;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::IntoArray;
use vortex_array::ToCanonical;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_datetime_parts::DateTimePartsArray;
use vortex_dtype::DType;
use vortex_dtype::Nullability;
use vortex_dtype::PType;
use vortex_dtype::datetime::TimeUnit;
use vortex_dtype::datetime::Timestamp;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;

use crate::common::TimedLaunchStrategy;

fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray {
let days: Vec<i16> = (0..len).map(|i| (i / 1000) as i16).collect();
let days_arr = PrimitiveArray::new(Buffer::from(days), Validity::NonNullable).into_array();
@@ -46,80 +48,6 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray
.vortex_expect("Failed to create DateTimePartsArray")
}

/// Launches DateTimeParts decode kernel and returns elapsed GPU time.
fn launch_datetimeparts_kernel_timed(
dtp_array: &DateTimePartsArray,
time_unit: TimeUnit,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
let days_prim = dtp_array.days().to_primitive();

// TODO(0ax1): figure out how to represent constant array in CUDA kernels
let seconds_prim = dtp_array.seconds().to_primitive();
let subseconds_prim = dtp_array.subseconds().to_primitive();

let output_len = dtp_array.len();

let divisor: i64 = match time_unit {
TimeUnit::Nanoseconds => 1_000_000_000,
TimeUnit::Microseconds => 1_000_000,
TimeUnit::Milliseconds => 1_000,
TimeUnit::Seconds => 1,
TimeUnit::Days => unreachable!("Days not supported for DateTimeParts"),
};

let days_device = block_on(
cuda_ctx
.copy_to_device(days_prim.as_slice::<i16>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy days to device");

let seconds_device = block_on(
cuda_ctx
.copy_to_device(seconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy seconds to device");

let subseconds_device = block_on(
cuda_ctx
.copy_to_device(subseconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy subseconds to device");

// Allocate output buffer
let output_device = block_on(cuda_ctx.copy_to_device(vec![0i64; output_len]).unwrap())
.vortex_expect("failed to allocate output buffer");

let days_view = days_device
.cuda_view::<i16>()
.vortex_expect("failed to get days view");
let seconds_view = seconds_device
.cuda_view::<i8>()
.vortex_expect("failed to get seconds view");
let subseconds_view = subseconds_device
.cuda_view::<i8>()
.vortex_expect("failed to get subseconds view");
let output_view = output_device
.cuda_view::<i64>()
.vortex_expect("failed to get output view");

let array_len_u64 = output_len as u64;

let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "date_time_parts",
ptypes: &[PType::I16, PType::I8, PType::I8],
launch_args: [days_view, seconds_view, subseconds_view, divisor, output_view, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: output_len
);

events.duration()
}

fn benchmark_datetimeparts(c: &mut Criterion) {
let mut group = c.benchmark_group("datetimeparts_cuda");
group.sample_size(10);
@@ -139,19 +67,19 @@ fn benchmark_datetimeparts(c: &mut Criterion) {
&dtp_array,
|b, dtp_array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut total_time = Duration::ZERO;
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));

for _ in 0..iters {
let kernel_time =
launch_datetimeparts_kernel_timed(dtp_array, time_unit, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
// block on immediately here
block_on(dtp_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
}

total_time
Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);