vortex-data · 0ax1 · Feb 13, 2026 · 0ax1 · Feb 13, 2026
diff --git a/vortex-cuda/kernels/src/.clang-format → vortex-cuda/.clang-format b/vortex-cuda/kernels/src/.clang-format → vortex-cuda/.clang-format
diff --git a/vortex-cuda/cub/kernels/filter.cu b/vortex-cuda/cub/kernels/filter.cu
@@ -4,10 +4,10 @@
 // CUB DeviceSelect::Flagged wrapper for Vortex GPU filtering.
 
 #include <cub/cub.cuh>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <cuda_runtime.h>
 #include <stdint.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
 
 // i256 type
 typedef struct {
@@ -17,11 +17,10 @@ typedef struct {
 
 // Bit extraction functor for TransformInputIterator
 struct BitExtractor {
-    const uint8_t* packed;
+    const uint8_t *packed;
     uint64_t bit_offset;
 
-    __host__ __device__ inline
-    uint8_t operator()(int64_t idx) const {
+    __host__ __device__ inline uint8_t operator()(int64_t idx) const {
         uint64_t actual_bit = bit_offset + static_cast<uint64_t>(idx);
         uint64_t byte_idx = actual_bit / 8;
         uint32_t bit_idx = actual_bit % 8;
@@ -30,31 +29,23 @@ struct BitExtractor {
 };
 
 /// Type alias for the packed bit iterator.
-using PackedBitIterator = thrust::transform_iterator<
-    BitExtractor,
-    thrust::counting_iterator<int64_t>
->;
+using PackedBitIterator = thrust::transform_iterator<BitExtractor, thrust::counting_iterator<int64_t>>;
 
 // CUB DeviceSelect::Flagged - Query temp storage size
-template<typename T>
-static cudaError_t filter_temp_size_impl(size_t* temp_bytes, int64_t num_items) {
+template <typename T>
+static cudaError_t filter_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
     size_t bytes = 0;
     cudaError_t err = cub::DeviceSelect::Flagged(
-        nullptr, bytes,
-        static_cast<const T*>(nullptr),
-        static_cast<const uint8_t*>(nullptr),
-        static_cast<T*>(nullptr),
-        static_cast<int64_t*>(nullptr),
-        num_items
-    );
+        nullptr, bytes, static_cast<const T *>(nullptr), static_cast<const uint8_t *>(nullptr),
+        static_cast<T *>(nullptr), static_cast<int64_t *>(nullptr), num_items);
     *temp_bytes = bytes;
     return err;
 }
 
-#define DEFINE_TEMP_SIZE(suffix, Type) \
-extern "C" cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t n) { \
-    return filter_temp_size_impl<Type>(temp_bytes, n); \
-}
+#define DEFINE_TEMP_SIZE(suffix, Type)                                                                       \
+    extern "C" cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t n) {                        \
+        return filter_temp_size_impl<Type>(temp_bytes, n);                                                   \
+    }
 
 DEFINE_TEMP_SIZE(u8, uint8_t)
 DEFINE_TEMP_SIZE(i8, int8_t)
@@ -69,34 +60,23 @@ DEFINE_TEMP_SIZE(f64, double)
 DEFINE_TEMP_SIZE(i128, __int128_t)
 DEFINE_TEMP_SIZE(i256, __int256_t)
 
-// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per element)
-template<typename T>
-static cudaError_t filter_bytemask_impl(
-    void* d_temp,
-    size_t temp_bytes,
-    const T* d_in,
-    const uint8_t* d_flags,
-    T* d_out,
-    int64_t* d_num_selected,
-    int64_t num_items,
-    cudaStream_t stream
-) {
-    return cub::DeviceSelect::Flagged(
-        d_temp, temp_bytes,
-        d_in, d_flags, d_out, d_num_selected,
-        num_items, stream
-    );
+// CUB DeviceSelect::Flagged - Execute filter with byte mask (one byte per
+// element)
+template <typename T>
+static cudaError_t filter_bytemask_impl(void *d_temp, size_t temp_bytes, const T *d_in,
+                                        const uint8_t *d_flags, T *d_out, int64_t *d_num_selected,
+                                        int64_t num_items, cudaStream_t stream) {
+    return cub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items,
+                                      stream);
 }
 
-#define DEFINE_FILTER_BYTEMASK(suffix, Type) \
-extern "C" cudaError_t filter_bytemask_##suffix( \
-    void* d_temp, size_t temp_bytes, \
-    const Type* d_in, const uint8_t* d_flags, \
-    Type* d_out, int64_t* d_num_selected, \
-    int64_t num_items, cudaStream_t stream \
-) { \
-    return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected, num_items, stream); \
-}
+#define DEFINE_FILTER_BYTEMASK(suffix, Type)                                                                 \
+    extern "C" cudaError_t filter_bytemask_##suffix(                                                         \
+        void *d_temp, size_t temp_bytes, const Type *d_in, const uint8_t *d_flags, Type *d_out,              \
+        int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) {                                   \
+        return filter_bytemask_impl<Type>(d_temp, temp_bytes, d_in, d_flags, d_out, d_num_selected,          \
+                                          num_items, stream);                                                \
+    }
 
 DEFINE_FILTER_BYTEMASK(u8, uint8_t)
 DEFINE_FILTER_BYTEMASK(i8, int8_t)
@@ -111,7 +91,8 @@ DEFINE_FILTER_BYTEMASK(f64, double)
 DEFINE_FILTER_BYTEMASK(i128, __int128_t)
 DEFINE_FILTER_BYTEMASK(i256, __int256_t)
 
-// CUB DeviceSelect::Flagged - Execute filter with bit mask (one bit per element)
+// CUB DeviceSelect::Flagged - Execute filter with bit mask (one bit per
+// element)
 //
 // Execute filter is using packed bit mask directly via TransformInputIterator.
 //
@@ -125,41 +106,26 @@ DEFINE_FILTER_BYTEMASK(i256, __int256_t)
 //   d_num_selected: Output count of selected elements
 //   num_items: Number of input elements
 //   stream: CUDA stream
-template<typename T>
-static cudaError_t filter_bitmask_impl(
-    void* d_temp,
-    size_t temp_bytes,
-    const T* d_in,
-    const uint8_t* d_bitmask,
-    uint64_t bit_offset,
-    T* d_out,
-    int64_t* d_num_selected,
-    int64_t num_items,
-    cudaStream_t stream
-) {
+template <typename T>
+static cudaError_t filter_bitmask_impl(void *d_temp, size_t temp_bytes, const T *d_in,
+                                       const uint8_t *d_bitmask, uint64_t bit_offset, T *d_out,
+                                       int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) {
     // Create a transform iterator to read packed bits.
-    BitExtractor extractor{d_bitmask, bit_offset};
+    BitExtractor extractor {d_bitmask, bit_offset};
     thrust::counting_iterator<int64_t> counting_iter(0);
     PackedBitIterator flag_iter(counting_iter, extractor);
 
-    return cub::DeviceSelect::Flagged(
-        d_temp, temp_bytes,
-        d_in, flag_iter, d_out, d_num_selected,
-        num_items, stream
-    );
+    return cub::DeviceSelect::Flagged(d_temp, temp_bytes, d_in, flag_iter, d_out, d_num_selected, num_items,
+                                      stream);
 }
 
-#define DEFINE_FILTER_BITMASK(suffix, Type) \
-extern "C" cudaError_t filter_bitmask_##suffix( \
-    void* d_temp, size_t temp_bytes, \
-    const Type* d_in, \
-    const uint8_t* d_bitmask, \
-    uint64_t bit_offset, \
-    Type* d_out, int64_t* d_num_selected, \
-    int64_t num_items, cudaStream_t stream \
-) { \
-    return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out, d_num_selected, num_items, stream); \
-}
+#define DEFINE_FILTER_BITMASK(suffix, Type)                                                                  \
+    extern "C" cudaError_t filter_bitmask_##suffix(                                                          \
+        void *d_temp, size_t temp_bytes, const Type *d_in, const uint8_t *d_bitmask, uint64_t bit_offset,    \
+        Type *d_out, int64_t *d_num_selected, int64_t num_items, cudaStream_t stream) {                      \
+        return filter_bitmask_impl<Type>(d_temp, temp_bytes, d_in, d_bitmask, bit_offset, d_out,             \
+                                         d_num_selected, num_items, stream);                                 \
+    }
 
 DEFINE_FILTER_BITMASK(u8, uint8_t)
 DEFINE_FILTER_BITMASK(i8, int8_t)

diff --git a/vortex-cuda/cub/kernels/filter.h b/vortex-cuda/cub/kernels/filter.h
@@ -17,47 +17,40 @@ typedef struct {
 
 // CUDA types - defined as opaque for bindgen
 typedef int cudaError_t;
-typedef void* cudaStream_t;
+typedef void *cudaStream_t;
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 // X-macro table: (suffix, c_type)
-#define FILTER_TYPE_TABLE(X)  \
-    X(u8,  uint8_t)           \
-    X(i8,  int8_t)            \
-    X(u16, uint16_t)          \
-    X(i16, int16_t)           \
-    X(u32, uint32_t)          \
-    X(i32, int32_t)           \
-    X(u64, uint64_t)          \
-    X(i64, int64_t)           \
-    X(f32, float)             \
-    X(f64, double)            \
-    X(i128, __int128_t)       \
+#define FILTER_TYPE_TABLE(X)                                                                                 \
+    X(u8, uint8_t)                                                                                           \
+    X(i8, int8_t)                                                                                            \
+    X(u16, uint16_t)                                                                                         \
+    X(i16, int16_t)                                                                                          \
+    X(u32, uint32_t)                                                                                         \
+    X(i32, int32_t)                                                                                          \
+    X(u64, uint64_t)                                                                                         \
+    X(i64, int64_t)                                                                                          \
+    X(f32, float)                                                                                            \
+    X(f64, double)                                                                                           \
+    X(i128, __int128_t)                                                                                      \
     X(i256, __int256_t)
 
 // Filter temp size query functions
-#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type) \
-    cudaError_t filter_temp_size_##suffix(size_t* temp_bytes, int64_t num_items);
+#define DECLARE_FILTER_TEMP_SIZE(suffix, c_type)                                                             \
+    cudaError_t filter_temp_size_##suffix(size_t *temp_bytes, int64_t num_items);
 
 FILTER_TYPE_TABLE(DECLARE_FILTER_TEMP_SIZE)
 
 #undef DECLARE_FILTER_TEMP_SIZE
 
 // Filter execution functions (byte mask - one byte per element)
-#define DECLARE_FILTER_BYTEMASK(suffix, c_type)  \
-    cudaError_t filter_bytemask_##suffix(        \
-        void* d_temp,                            \
-        size_t temp_bytes,                       \
-        const c_type* d_in,                      \
-        const uint8_t* d_flags,                  \
-        c_type* d_out,                           \
-        int64_t* d_num_selected,                 \
-        int64_t num_items,                       \
-        cudaStream_t stream                      \
-    );
+#define DECLARE_FILTER_BYTEMASK(suffix, c_type)                                                              \
+    cudaError_t filter_bytemask_##suffix(void *d_temp, size_t temp_bytes, const c_type *d_in,                \
+                                         const uint8_t *d_flags, c_type *d_out, int64_t *d_num_selected,     \
+                                         int64_t num_items, cudaStream_t stream);
 
 FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)
 
@@ -68,18 +61,10 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BYTEMASK)
 // These functions accept packed bit mask directly, avoiding the need to
 // expand bits to bytes in a separate kernel. Uses CUB's TransformInputIterator
 // to read bits on-the-fly during the filter operation.
-#define DECLARE_FILTER_BITMASK(suffix, c_type)  \
-    cudaError_t filter_bitmask_##suffix(        \
-        void* d_temp,                           \
-        size_t temp_bytes,                      \
-        const c_type* d_in,                     \
-        const uint8_t* d_bitmask,               \
-        uint64_t bit_offset,                    \
-        c_type* d_out,                          \
-        int64_t* d_num_selected,                \
-        int64_t num_items,                      \
-        cudaStream_t stream                     \
-    );
+#define DECLARE_FILTER_BITMASK(suffix, c_type)                                                               \
+    cudaError_t filter_bitmask_##suffix(void *d_temp, size_t temp_bytes, const c_type *d_in,                 \
+                                        const uint8_t *d_bitmask, uint64_t bit_offset, c_type *d_out,        \
+                                        int64_t *d_num_selected, int64_t num_items, cudaStream_t stream);
 
 FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)
 

diff --git a/vortex-cuda/kernels/src/alp.cu b/vortex-cuda/kernels/src/alp.cu
@@ -7,10 +7,10 @@
 // Converts integers to floats by multiplying by precomputed exponent factors.
 // Formula: decoded = (float)encoded * f * e
 // Where f = F10[exponents.f] and e = IF10[exponents.e] are passed directly.
-template<typename EncodedT, typename FloatT>
+template <typename EncodedT, typename FloatT>
 struct AlpOp {
-    FloatT f;  // F10[exponents.f] - power of 10
-    FloatT e;  // IF10[exponents.e] - inverse power of 10
+    FloatT f; // F10[exponents.f] - power of 10
+    FloatT e; // IF10[exponents.e] - inverse power of 10
 
     __device__ inline FloatT operator()(EncodedT value) const {
         return static_cast<FloatT>(value) * f * e;
@@ -19,16 +19,12 @@ struct AlpOp {
 
 // Macro to generate ALP kernel for each type combination.
 // Input is integer (encoded), output is float (decoded).
-#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType) \
-extern "C" __global__ void alp_##enc_suffix##_##float_suffix( \
-    const EncType *__restrict encoded, \
-    FloatType *__restrict decoded, \
-    FloatType f, \
-    FloatType e, \
-    uint64_t array_len \
-) { \
-    scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType>{f, e}); \
-}
+#define GENERATE_ALP_KERNEL(enc_suffix, float_suffix, EncType, FloatType)                                    \
+    extern "C" __global__ void alp_##enc_suffix##_##float_suffix(const EncType *__restrict encoded,          \
+                                                                 FloatType *__restrict decoded, FloatType f, \
+                                                                 FloatType e, uint64_t array_len) {          \
+        scalar_kernel(encoded, decoded, array_len, AlpOp<EncType, FloatType> {f, e});                        \
+    }
 
 // f32 variants (ALP for f32 encodes as i32 or i64)
 GENERATE_ALP_KERNEL(i32, f32, int32_t, float)

diff --git a/vortex-cuda/kernels/src/bit_unpack.cuh b/vortex-cuda/kernels/src/bit_unpack.cuh
@@ -26,16 +26,15 @@
 /// * `bit_width` - Number of bits with which each value is encoded
 template <typename T>
 __device__ inline void bit_unpack_lane(const T *__restrict packed_chunk, T *__restrict output_buffer,
-                                                       unsigned int lane, uint32_t bit_width);
+                                       unsigned int lane, uint32_t bit_width);
 
 /// Template specializations for `bitunpack_lane_to_smem` for different integer types.
 ///
 /// Generates template specializations for each supported integer size (8, 16, 32, 64 bits).
-#define BIT_UNPACK_LANE(bits)                                                                                 \
+#define BIT_UNPACK_LANE(bits)                                                                                \
     template <>                                                                                              \
-    __device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in,                  \
-                                                                  uint##bits##_t *out,                       \
-                                                                  unsigned int lane, uint32_t bw) {          \
+    __device__ inline void bit_unpack_lane<uint##bits##_t>(const uint##bits##_t *in, uint##bits##_t *out,    \
+                                                           unsigned int lane, uint32_t bw) {                 \
         bit_unpack_##bits##_lane(in, out, lane, bw);                                                         \
     }
 

diff --git a/vortex-cuda/kernels/src/config.cuh b/vortex-cuda/kernels/src/config.cuh
@@ -16,4 +16,4 @@ constexpr uint32_t ELEMENTS_PER_THREAD = 32;
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 
 #define START_ELEM(idx, len) MIN((idx) * ELEMENTS_PER_THREAD, (len))
-#define STOP_ELEM(idx, len) MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len))
+#define STOP_ELEM(idx, len)  MIN(START_ELEM(idx, len) + ELEMENTS_PER_THREAD, (len))
diff --git a/vortex-cuda/kernels/src/config_check.cu b/vortex-cuda/kernels/src/config_check.cu
@@ -13,7 +13,7 @@
 
 // Kernel that outputs the config values for verification.
 // Output buffer layout: [elements_per_thread, block_dim_x, elements_per_block]
-extern "C" __global__ void config_check(uint32_t* output) {
+extern "C" __global__ void config_check(uint32_t *output) {
     if (threadIdx.x == 0 && blockIdx.x == 0) {
         output[0] = ELEMENTS_PER_THREAD;
         output[1] = blockDim.x;

diff --git a/vortex-cuda/kernels/src/constant_numeric.cu b/vortex-cuda/kernels/src/constant_numeric.cu
@@ -6,12 +6,8 @@
 #include <cuda_fp16.h>
 
 // Fill an output buffer with a constant value.
-template<typename T>
-__device__ void constant_fill(
-    T *__restrict output,
-    T value,
-    uint64_t array_len
-) {
+template <typename T>
+__device__ void constant_fill(T *__restrict output, T value, uint64_t array_len) {
     const uint64_t worker = blockIdx.x * blockDim.x + threadIdx.x;
     const uint64_t startElem = START_ELEM(worker, array_len);
     const uint64_t stopElem = STOP_ELEM(worker, array_len);
@@ -25,13 +21,10 @@ __device__ void constant_fill(
     }
 }
 
-#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type) \
-extern "C" __global__ void constant_numeric_##suffix( \
-    Type *__restrict output, \
-    Type value, \
-    uint64_t array_len \
-) { \
-    constant_fill(output, value, array_len); \
-}
+#define GENERATE_CONSTANT_NUMERIC_KERNEL(suffix, Type)                                                       \
+    extern "C" __global__ void constant_numeric_##suffix(Type *__restrict output, Type value,                \
+                                                         uint64_t array_len) {                               \
+        constant_fill(output, value, array_len);                                                             \
+    }
 
 FOR_EACH_NUMERIC(GENERATE_CONSTANT_NUMERIC_KERNEL)