736 changes: 436 additions & 300 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -26,6 +26,7 @@ members = [
"vortex-duckdb",
"vortex-cuda",
"vortex-cuda/cub",
"vortex-cuda/gpu-scan-cli",
"vortex-cuda/macros",
"vortex-cuda/nvcomp",
"vortex-cxx",
2 changes: 1 addition & 1 deletion _typos.toml
@@ -1,5 +1,5 @@
[default]
extend-ignore-identifiers-re = ["FoR", "typ"]
extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ"]
# We support a few common special comments to tell the checker to ignore sections of code
extend-ignore-re = [
"(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line
5 changes: 5 additions & 0 deletions vortex-array/src/arrays/primitive/vtable/mod.rs
@@ -89,6 +89,11 @@ impl VTable for PrimitiveVTable {

let ptype = PType::try_from(dtype)?;

vortex_ensure!(
buffer.is_aligned_to(Alignment::new(ptype.byte_width())),
"Misaligned buffer cannot be used to build PrimitiveArray of {ptype}"
);

if buffer.len() != ptype.byte_width() * len {
vortex_bail!(
"Buffer length {} does not match expected length {} for {}, {}",
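
The guard makes misalignment a construction-time error rather than a latent hazard when reinterpreting bytes as wider types. A minimal sketch of the condition it enforces (Buffer::slice and the 1-byte offset are illustrative assumptions, not part of this diff):

use vortex_buffer::{Alignment, Buffer};

// An 8-byte ptype such as u64 needs 8-byte alignment; a buffer whose
// start lands at an odd offset now fails construction with
// "Misaligned buffer cannot be used to build PrimitiveArray of u64".
let bytes: Buffer<u8> = Buffer::from(vec![0u8; 64]);
let misaligned = bytes.slice(1..33); // hypothetical: 1-byte offset breaks u64 alignment
assert!(!misaligned.is_aligned_to(Alignment::new(8)));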
2 changes: 2 additions & 0 deletions vortex-btrblocks/public-api.lock
@@ -222,6 +222,8 @@ impl vortex_btrblocks::BtrBlocksCompressorBuilder

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::build(self) -> vortex_btrblocks::BtrBlocksCompressor

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::empty() -> Self

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_float(self, codes: impl core::iter::traits::collect::IntoIterator<Item = vortex_btrblocks::FloatCode>) -> Self

pub fn vortex_btrblocks::BtrBlocksCompressorBuilder::exclude_int(self, codes: impl core::iter::traits::collect::IntoIterator<Item = vortex_btrblocks::IntCode>) -> Self
9 changes: 9 additions & 0 deletions vortex-btrblocks/src/builder.rs
@@ -71,6 +71,15 @@ impl Default for BtrBlocksCompressorBuilder {
}

impl BtrBlocksCompressorBuilder {
/// Create a new builder with no encodings enabled.
pub fn empty() -> Self {
Self {
int_schemes: Default::default(),
float_schemes: Default::default(),
string_schemes: Default::default(),
}
}

/// Excludes the specified integer compression schemes.
pub fn exclude_int(mut self, codes: impl IntoIterator<Item = IntCode>) -> Self {
let codes: HashSet<_> = codes.into_iter().collect();
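
A hedged sketch of where empty() fits relative to the existing subtractive API (Default enables every scheme and is pruned via the exclude_* methods; any methods for re-enabling schemes on an empty builder are outside this diff):

use vortex_btrblocks::BtrBlocksCompressorBuilder;

// Start from no encodings rather than pruning the full default set;
// useful as an uncompressed baseline in benchmarks or as a base for
// opt-in configuration.
let baseline = BtrBlocksCompressorBuilder::empty().build();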
7 changes: 6 additions & 1 deletion vortex-cuda/Cargo.toml
@@ -18,6 +18,7 @@ workspace = true

[features]
default = []
tracing = ["dep:tracing"]
_test-harness = []
unstable_encodings = ["vortex-zstd/unstable_encodings"]

@@ -31,7 +32,11 @@ fastlanes = { workspace = true }
futures = { workspace = true, features = ["executor"] }
kanal = { workspace = true }
paste = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true, features = ["fs"] }
tracing = { workspace = true, features = [
"std",
"attributes",
], optional = true }
vortex-alp = { workspace = true }
vortex-array = { workspace = true }
vortex-buffer = { workspace = true }
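
With the dependency now optional behind the "tracing" feature, instrumentation inside vortex-cuda is presumably compiled conditionally. A generic sketch of that pattern (the function and messages are illustrative, not code from this PR):

// Consumers opt in via: vortex-cuda = { workspace = true, features = ["tracing"] }

// The span attribute and the log line only compile when the feature is
// enabled; otherwise the function carries no tracing machinery at all.
#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
fn launch_kernel(len: usize) {
    #[cfg(feature = "tracing")]
    tracing::debug!(len, "launching CUDA kernel");
    // ... launch logic ...
}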
78 changes: 14 additions & 64 deletions vortex-cuda/benches/bitpacked_cuda.rs
@@ -6,27 +6,24 @@
#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

mod common;

use std::mem::size_of;
use std::ops::Add;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::DeviceRepr;
use cudarc::driver::PushKernelArg;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity::NonNullable;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaDeviceBuffer;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda::bitpacked_cuda_kernel;
use vortex_cuda::bitpacked_cuda_launch_config;
use vortex_cuda::launch_cuda_kernel_with_config;
use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_dtype::NativePType;
@@ -35,6 +32,8 @@ use vortex_fastlanes::BitPackedArray;
use vortex_fastlanes::unpack_iter::BitPacked;
use vortex_session::VortexSession;

use crate::common::TimedLaunchStrategy;

const N_ROWS: usize = 100_000_000;

/// Create a bit-packed array with the given bit width
@@ -56,54 +55,6 @@ where
.vortex_expect("failed to create BitPacked array")
}

/// Launch the bit unpacking kernel and return elapsed GPU time
fn launch_bitunpack_kernel_timed_typed<T>(
bitpacked_array: &BitPackedArray,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration>
where
T: BitPacked + DeviceRepr,
T::Physical: DeviceRepr,
{
let packed = bitpacked_array.packed().clone();
let bit_width = bitpacked_array.bit_width();
let len = bitpacked_array.len();

// Move packed data to device if not already there
let device_input = if packed.is_on_device() {
packed
} else {
block_on(cuda_ctx.move_to_device(packed)?).vortex_expect("failed to move to device")
};

// Allocate output buffer
let output_slice = cuda_ctx
.device_alloc::<T>(len.next_multiple_of(1024))
.vortex_expect("failed to allocate output");
let output_buf = CudaDeviceBuffer::new(output_slice);

// Get device views
let input_view = device_input
.cuda_view::<T::Physical>()
.vortex_expect("failed to get input view");
let output_view = output_buf.as_view::<T>();

let output_width = size_of::<T>() * 8;
let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, cuda_ctx)?;
let mut launch_builder = cuda_ctx.launch_builder(&cuda_function);

launch_builder.arg(&input_view);
launch_builder.arg(&output_view);

let config = bitpacked_cuda_launch_config(output_width, len)?;

// Launch kernel
let events =
launch_cuda_kernel_with_config(&mut launch_builder, config, CU_EVENT_BLOCKING_SYNC)?;

events.duration()
}

/// Generic benchmark function for a specific type and bit width
fn benchmark_bitunpack_typed<T>(c: &mut Criterion, bit_width: u8, type_name: &str)
where
@@ -123,19 +74,18 @@ where
&array,
|b, array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut total_time = Duration::ZERO;
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));

for _ in 0..iters {
let kernel_time =
launch_bitunpack_kernel_timed_typed::<T>(array, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
}

total_time
Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);
39 changes: 39 additions & 0 deletions vortex-cuda/benches/common/mod.rs
@@ -0,0 +1,39 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;

use cudarc::driver::sys::CUevent_flags;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use vortex_cuda::CudaKernelEvents;
use vortex_cuda::LaunchStrategy;
use vortex_error::VortexResult;

#[derive(Debug, Default)]
pub struct TimedLaunchStrategy {
total_time_ns: Arc<AtomicU64>,
}

impl TimedLaunchStrategy {
pub fn get(&self) -> &Arc<AtomicU64> {
&self.total_time_ns
}
}

impl LaunchStrategy for TimedLaunchStrategy {
fn event_flags(&self) -> CUevent_flags {
// using blocking_sync to make sure all events flush before we complete.
CU_EVENT_BLOCKING_SYNC
}

fn on_complete(&self, events: &CudaKernelEvents, _len: usize) -> VortexResult<()> {
// NOTE: as long as the duration < 584 years this cast is safe.
let elapsed_nanos = events.duration()?.as_nanos() as u64;
self.total_time_ns
.fetch_add(elapsed_nanos, Ordering::Relaxed);

Ok(())
}
}
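
For reference, the two benchmarks in this PR wire the strategy in the same way (condensed from their diffs): install it on the execution context, run kernels through execute_cuda, then read back the accumulated GPU time.

// `array` stands in for the BitPackedArray / DateTimePartsArray under test.
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
    .vortex_expect("failed to create execution context")
    .with_launch_strategy(Arc::new(timed));

block_on(array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
let gpu_time = Duration::from_nanos(timer.load(Ordering::Relaxed));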
102 changes: 15 additions & 87 deletions vortex-cuda/benches/date_time_parts_cuda.rs
Original file line number Diff line number Diff line change
@@ -6,34 +6,36 @@
#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

mod common;

use std::mem::size_of;
use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use cudarc::driver::sys::CUevent_flags::CU_EVENT_BLOCKING_SYNC;
use futures::executor::block_on;
use vortex_array::IntoArray;
use vortex_array::ToCanonical;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_cuda::CudaBufferExt;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda::executor::CudaArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_datetime_parts::DateTimePartsArray;
use vortex_dtype::DType;
use vortex_dtype::Nullability;
use vortex_dtype::PType;
use vortex_dtype::datetime::TimeUnit;
use vortex_dtype::datetime::Timestamp;
use vortex_error::VortexExpect;
use vortex_session::VortexSession;

use crate::common::TimedLaunchStrategy;

fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray {
let days: Vec<i16> = (0..len).map(|i| (i / 1000) as i16).collect();
let days_arr = PrimitiveArray::new(Buffer::from(days), Validity::NonNullable).into_array();
@@ -46,80 +48,6 @@ fn make_datetimeparts_array(len: usize, time_unit: TimeUnit) -> DateTimePartsArray
.vortex_expect("Failed to create DateTimePartsArray")
}

/// Launches DateTimeParts decode kernel and returns elapsed GPU time.
fn launch_datetimeparts_kernel_timed(
dtp_array: &DateTimePartsArray,
time_unit: TimeUnit,
cuda_ctx: &mut CudaExecutionCtx,
) -> vortex_error::VortexResult<Duration> {
let days_prim = dtp_array.days().to_primitive();

// TODO(0ax1): figure out how to represent constant array in CUDA kernels
let seconds_prim = dtp_array.seconds().to_primitive();
let subseconds_prim = dtp_array.subseconds().to_primitive();

let output_len = dtp_array.len();

let divisor: i64 = match time_unit {
TimeUnit::Nanoseconds => 1_000_000_000,
TimeUnit::Microseconds => 1_000_000,
TimeUnit::Milliseconds => 1_000,
TimeUnit::Seconds => 1,
TimeUnit::Days => unreachable!("Days not supported for DateTimeParts"),
};

let days_device = block_on(
cuda_ctx
.copy_to_device(days_prim.as_slice::<i16>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy days to device");

let seconds_device = block_on(
cuda_ctx
.copy_to_device(seconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy seconds to device");

let subseconds_device = block_on(
cuda_ctx
.copy_to_device(subseconds_prim.as_slice::<i8>().to_vec())
.unwrap(),
)
.vortex_expect("failed to copy subseconds to device");

// Allocate output buffer
let output_device = block_on(cuda_ctx.copy_to_device(vec![0i64; output_len]).unwrap())
.vortex_expect("failed to allocate output buffer");

let days_view = days_device
.cuda_view::<i16>()
.vortex_expect("failed to get days view");
let seconds_view = seconds_device
.cuda_view::<i8>()
.vortex_expect("failed to get seconds view");
let subseconds_view = subseconds_device
.cuda_view::<i8>()
.vortex_expect("failed to get subseconds view");
let output_view = output_device
.cuda_view::<i64>()
.vortex_expect("failed to get output view");

let array_len_u64 = output_len as u64;

let events = vortex_cuda::launch_cuda_kernel!(
execution_ctx: cuda_ctx,
module: "date_time_parts",
ptypes: &[PType::I16, PType::I8, PType::I8],
launch_args: [days_view, seconds_view, subseconds_view, divisor, output_view, array_len_u64],
event_recording: CU_EVENT_BLOCKING_SYNC,
array_len: output_len
);

events.duration()
}

fn benchmark_datetimeparts(c: &mut Criterion) {
let mut group = c.benchmark_group("datetimeparts_cuda");
group.sample_size(10);
@@ -139,19 +67,19 @@ fn benchmark_datetimeparts(c: &mut Criterion) {
&dtp_array,
|b, dtp_array| {
b.iter_custom(|iters| {
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context");
let timed = TimedLaunchStrategy::default();
let timer = Arc::clone(timed.get());

let mut total_time = Duration::ZERO;
let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));

for _ in 0..iters {
let kernel_time =
launch_datetimeparts_kernel_timed(dtp_array, time_unit, &mut cuda_ctx)
.vortex_expect("kernel launch failed");
total_time += kernel_time;
// block on immediately here
block_on(dtp_array.to_array().execute_cuda(&mut cuda_ctx)).unwrap();
}

total_time
Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);