Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
122 changes: 44 additions & 78 deletions vortex-cuda/cub/kernels/filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
// CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering.

#include <cub/cub.cuh>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <cuda_runtime.h>
#include <stdint.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

// i256 type
typedef struct {
Expand All @@ -17,11 +17,10 @@ typedef struct {

// Bit extraction functor for TransformInputIterator
struct BitExtractor {
const uint8_t* packed;
const uint8_t *packed;
uint64_t bit_offset;

__host__ __device__ inline
uint8_t operator()(int64_t idx) const {
__host__ __device__ inline uint8_t operator()(int64_t idx) const {
uint64_t actual_bit = bit_offset + static_cast<uint64_t>(idx);
uint64_t byte_idx = actual_bit / 8;
uint32_t bit_idx = actual_bit % 8;
Expand All @@ -30,31 +29,23 @@ struct BitExtractor {
};

/// Type alias for the packed bit iterator.
using PackedBitIterator = thrust::transform_iterator<
BitExtractor,
thrust::counting_iterator<int64_t>
>;
using PackedBitIterator = thrust::transform_iterator<BitExtractor, thrust::counting_iterator<int64_t>>;

// CUB DeviceSelect::Flagged - Query temp storage size
template<typename T>
static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) {
template <typename T>
static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
size_t bytes = 0;
cudaError_t err = cub::DeviceSelect::Flagged(
nullptr, bytes,
static_cast<const T*>(nullptr),
static_cast<const uint8_t*>(nullptr),
static_cast<T*>(nullptr),
static_cast<int64_t*>(nullptr),
num_items
);
nullptr, bytes, static_cast<const T *>(nullptr), static_cast<const uint8_t *>(nullptr),
static_cast<T *>(nullptr), static_cast<int64_t *>(nullptr), num_items);
*temp_bytes = bytes;
return err;
}

#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}
#define DEFINE_TEMP_SIZE(suffix, Type) \
extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) { \
return filter_temp_size_impl<Type>(temp_bytes, n); \
}

DEFINE_TEMP_SIZE(u8, uint8_t)
DEFINE_TEMP_SIZE(i8, int8_t)
Expand All @@ -69,34 +60,23 @@ DEFINE_TEMP_SIZE(f64, double)
DEFINE_TEMP_SIZE(i128, __int128_t)
DEFINE_TEMP_SIZE(i256, __int256_t)

// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element)
template<typename T>
static cudaError_t filter_bytemask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_flags,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, d_flags, d_out, d_num_selected,
num_items, stream
);
// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per
// element)
template <typename T>
static cudaError_t filter_bytemask_impl(void *d_temp, size_t temp_bytes, const T *d_in,
const uint8_t *d_flags, T *d_out, int64_t *d_num_selected,
int64_t num_items, cudaStream_t stream) {
return cub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items,
stream);
}

#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, const uint8_t* d_flags, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
extern "C" cudaError_t filter_bytemask_##suffix( \
void *d_temp, size_t temp_bytes, const Type *d_in, const uint8_t *d_flags, Type *d_out, \
int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) { \
return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, \
num_items, stream); \
}

DEFINE_FILTER_BYTEMASK(u8, uint8_t)
DEFINE_FILTER_BYTEMASK(i8, int8_t)
Expand All @@ -111,7 +91,8 @@ DEFINE_FILTER_BYTEMASK(f64, double)
DEFINE_FILTER_BYTEMASK(i128, __int128_t)
DEFINE_FILTER_BYTEMASK(i256, __int256_t)

// CUB DeviceSelect::Flagged - Execute filter with bit mask (one bit per element)
// CUB DeviceSelect::Flagged - Execute filter with bit mask (one bit per
// element)
//
// Execute filter is using packed bit mask directly via TransformInputIterator.
//
Expand All @@ -125,41 +106,26 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t)
// d_num_selected: Output count of selected elements
// num_items: Number of input elements
// stream: CUDA stream
template<typename T>
static cudaError_t filter_bitmask_impl(
void* d_temp,
size_t temp_bytes,
const T* d_in,
const uint8_t* d_bitmask,
uint64_t bit_offset,
T* d_out,
int64_t* d_num_selected,
int64_t num_items,
cudaStream_t stream
) {
template <typename T>
static cudaError_t filter_bitmask_impl(void *d_temp, size_t temp_bytes, const T *d_in,
const uint8_t *d_bitmask, uint64_t bit_offset, T *d_out,
int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) {
// Create a transform iterator to read packed bits.
BitExtractor extractor{d_bitmask, bit_offset};
BitExtractor extractor {d_bitmask, bit_offset};
thrust::counting_iterator<int64_t> counting_iter(0);
PackedBitIterator flag_iter(counting_iter, extractor);

return cub::DeviceSelect::Flagged(
d_temp, temp_bytes,
d_in, flag_iter, d_out, d_num_selected,
num_items, stream
);
return cub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, flag_iter, d_out, d_num_selected, num_items,
stream);
}

#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix( \
void* d_temp, size_t temp_bytes, \
const Type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
Type* d_out, int64_t* d_num_selected, \
int64_t num_items, cudaStream_t stream \
) { \
return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \
}
#define DEFINE_FILTER_BITMASK(suffix, Type) \
extern "C" cudaError_t filter_bitmask_##suffix( \
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, one line per arg actually seems more readable here.

void *d_temp, size_t temp_bytes, const Type *d_in, const uint8_t *d_bitmask, uint64_t bit_offset, \
Type *d_out, int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) { \
return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, \
d_num_selected, num_items, stream); \
}

DEFINE_FILTER_BITMASK(u8, uint8_t)
DEFINE_FILTER_BITMASK(i8, int8_t)
Expand Down
61 changes: 23 additions & 38 deletions vortex-cuda/cub/kernels/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,47 +17,40 @@ typedef struct {

// CUDA types - defined as opaque for bindgen
typedef int cudaError_t;
typedef void* cudaStream_t;
typedef void *cudaStream_t;

#ifdef __cplusplus
extern "C" {
#endif

// X-macro table: (suffix, c_type)
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
#define FILTER_TYPE_TABLE(X) \
X(u8, uint8_t) \
X(i8, int8_t) \
X(u16, uint16_t) \
X(i16, int16_t) \
X(u32, uint32_t) \
X(i32, int32_t) \
X(u64, uint64_t) \
X(i64, int64_t) \
X(f32, float) \
X(f64, double) \
X(i128, __int128_t) \
X(i256, __int256_t)

// Filter temp size query functions
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items);
#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items);

FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE)

#undef DECLARE_FILTER_TEMP_SIZE

// Filter execution functions (byte mask - one byte per element)
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_flags, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BYTEMASK(suffix, c_type) \
cudaError_t filter_bytemask_##suffix(void *d_temp, size_t temp_bytes, const c_type *d_in, \
const uint8_t *d_flags, c_type *d_out, int64_t *d_num_selected, \
int64_t num_items, cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)

Expand All @@ -68,18 +61,10 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)
// These functions accept packed bit mask directly, avoiding the need to
// expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator
// to read bits on-the-fly during the filter operation.
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix( \
void* d_temp, \
size_t temp_bytes, \
const c_type* d_in, \
const uint8_t* d_bitmask, \
uint64_t bit_offset, \
c_type* d_out, \
int64_t* d_num_selected, \
int64_t num_items, \
cudaStream_t stream \
);
#define DECLARE_FILTER_BITMASK(suffix, c_type) \
cudaError_t filter_bitmask_##suffix(void *d_temp, size_t temp_bytes, const c_type *d_in, \
const uint8_t *d_bitmask, uint64_t bit_offset, c_type *d_out, \
int64_t *d_num_selected, int64_t num_items, cudaStream_t stream);

FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)

Expand Down
22 changes: 9 additions & 13 deletions vortex-cuda/kernels/src/alp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
// Converts integers to floats by multiplying by precomputed exponent factors.
// Formula: decoded = (float)encoded * f * e
// Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
template<typename EncodedT, typename FloatT>
template <typename EncodedT, typename FloatT>
struct AlpOp {
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10
FloatT f; // F10[exponents.f] - power of 10
FloatT e; // IF10[exponents.e] - inverse power of 10

__device__ inline FloatT operator()(EncodedT value) const {
return static_cast<FloatT>(value) * f * e;
Expand All @@ -19,16 +19,12 @@ struct AlpOp {

// Macro to generate ALP kernel for each type combination.
// Input is integer (encoded), output is float (decoded).
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \
const EncType *__restrict encoded, \
FloatType *__restrict decoded, \
FloatType f, \
FloatType e, \
uint64_t array_len \
) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType>{f, e}); \
}
#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded, \
FloatType *__restrict decoded, FloatType f, \
FloatType e, uint64_t array_len) { \
scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e}); \
}

// f32 variants (ALP for f32 encodes as i32 or i64)
GENERATE_ALP_KERNEL(i32, f32, int32_t, float)
Expand Down
9 changes: 4 additions & 5 deletions vortex-cuda/kernels/src/bit_unpack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,15 @@
/// * `bit_width` - Number of bits with which each value is encoded
template <typename T>
__device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, T *__restrict output_buffer,
unsigned int lane, uint32_t bit_width);
unsigned int lane, uint32_t bit_width);

/// Template specializations for `bitunpack_lane_to_smem` for different integer types.
///
/// Generates template specializations for each supported integer size (8, 16, 32, 64 bits).
#define BIT_UNPACK_LANE(bits) \
#define BIT_UNPACK_LANE(bits) \
template <> \
__device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, \
uint##bits##_t *out, \
unsigned int lane, uint32_t bw) { \
__device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, uint##bits##_t *out, \
unsigned int lane, uint32_t bw) { \
bit_unpack_##bits##_lane(in, out, lane, bw); \
}

Expand Down
2 changes: 1 addition & 1 deletion vortex-cuda/kernels/src/config.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ constexpr uint32_t ELEMENTS_PER_THREAD = 32;
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

#define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len))
#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len))
#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len))
2 changes: 1 addition & 1 deletion vortex-cuda/kernels/src/config_check.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

// Kernel that outputs the config values for verification.
// Output buffer layout: [elements_per_thread, block_dim_x, elements_per_block]
extern "C" __global__ void config_check(uint32_t* output) {
extern "C" __global__ void config_check(uint32_t *output) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
output[0] = ELEMENTS_PER_THREAD;
output[1] = blockDim.x;
Expand Down
21 changes: 7 additions & 14 deletions vortex-cuda/kernels/src/constant_numeric.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,8 @@
#include <cuda_fp16.h>

// Fill an output buffer with a constant value.
template<typename T>
__device__ void constant_fill(
T *__restrict output,
T value,
uint64_t array_len
) {
template <typename T>
__device__ void constant_fill(T *__restrict output, T value, uint64_t array_len) {
const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
const uint64_t startElem = START_ELEM(worker, array_len);
const uint64_t stopElem = STOP_ELEM(worker, array_len);
Expand All @@ -25,13 +21,10 @@ __device__ void constant_fill(
}
}

#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \
extern "C" __global__ void constant_numeric_##suffix( \
Type *__restrict output, \
Type value, \
uint64_t array_len \
) { \
constant_fill(output, value, array_len); \
}
#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \
extern "C" __global__ void constant_numeric_##suffix(Type *__restrict output, Type value, \
uint64_t array_len) { \
constant_fill(output, value, array_len); \
}

FOR_EACH_NUMERIC(GENERATE_CONSTANT_NUMERIC_KERNEL)
Loading
Loading