Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
2f86159
Added MNIST example, applied formattin, and fixed some bugs.
RobBa Mar 22, 2026
ce702ff
Enabled CUDA compilation
RobBa Mar 22, 2026
120b36d
Better mem-access for matmul
RobBa Mar 23, 2026
350bc6c
Optimized matmul for better cache alignment/lower cache misses
RobBa Mar 23, 2026
3ed4515
MNIST and gitignore update
RobBa Mar 23, 2026
6bb06f0
Tensor backend for CUDA ready
RobBa Mar 28, 2026
d9483d6
Minor optimization using memcopy
RobBa Mar 29, 2026
6ba1018
Laying foundation for easy transpose
RobBa Mar 29, 2026
4f74be9
Further enabling of resetting dims
RobBa Mar 29, 2026
638d129
Progress on view transpose/lazy transpose
RobBa Mar 29, 2026
5e9b304
Finished implementing transpose logic
RobBa Mar 29, 2026
e6f646c
Some more CUDA and compiler hints, clean up values_t, fix leaky-relu
RobBa Apr 3, 2026
582e299
Sigmoid kernel
RobBa Apr 3, 2026
9d18a0c
Fixed issues with new transposition logic
RobBa May 2, 2026
1d072cb
skeleton for softmax
RobBa May 6, 2026
e42498d
First softmax untested
RobBa May 10, 2026
e214e54
Skeleton of cuda bce loss
RobBa May 10, 2026
563a141
Renaming of cuda namespace
RobBa May 11, 2026
8de5740
Fixed several compile-time and run-time bugs
RobBa May 11, 2026
21483d2
Fixing some cuda errors in tensor.cpp
RobBa May 12, 2026
82b3c38
Fix some more compile time bugs in cuda backend
RobBa May 12, 2026
c23ef9a
Resolved compile time errors
RobBa May 12, 2026
4f0065f
Resolved compile time and runtime issues of unit tests
RobBa May 12, 2026
d224845
Fixed python unit tests
RobBa May 12, 2026
be9b17d
First CUDA unit tests, plus some fixes and checks based on them
RobBa May 12, 2026
b37d353
More unit tests and some unit test cleanup
RobBa May 12, 2026
21b24a8
Fixing Gaussian interface, using expect_near in unit tests instead of…
RobBa May 12, 2026
332fb97
Minor refactor
RobBa May 12, 2026
9c9a977
Remove cuda branch from printing
RobBa May 13, 2026
70a9fab
Fixed CUDA transpose
RobBa May 13, 2026
c4e2514
Implemented naive matmul in CUDA
RobBa May 13, 2026
719ed21
Fixed unit test, made threads per block more generic through device p…
RobBa May 13, 2026
35eb97b
Removed bug from matmul
RobBa May 13, 2026
d157017
Added unit tests for CUDA modules, set up infrastructure for cuda cod…
RobBa May 13, 2026
4d67455
More infra prepared for CUDA
RobBa May 13, 2026
f82f899
Last pieces of infrastructure
RobBa May 13, 2026
95bce13
Added warp reduce kernel code to compute softmax. Needs dimension tha…
RobBa May 14, 2026
138d83b
Added softmax kernel for medium large strides
RobBa May 14, 2026
17d65c8
Fixed indexing bug for crossentropy- and softmax that arises in dimen…
RobBa May 14, 2026
3e96018
Update readme
RobBa May 14, 2026
39a9086
Adjusting spacing
RobBa May 15, 2026
2e13f16
Fix GPU softmax
RobBa May 15, 2026
7091379
Fixed softmax for good
RobBa May 15, 2026
bc5ae6d
Softmax kernels for large case; need debugging
RobBa May 18, 2026
af8b5df
Fixed large softmax kernel
RobBa May 18, 2026
e262f09
Fixed softmax backward indexing to process multidimensional softmax
RobBa May 19, 2026
083dc34
Softmax backward kernel for small stride in CUDA; fix some indentation
RobBa May 19, 2026
b0c6020
Minor fix in kernel, add unit test, update readme. Kernel still needs…
RobBa May 19, 2026
6bb7030
Update unit tests with sharper delta
RobBa May 19, 2026
b029b52
Fix error in small softmax backward kernel
RobBa May 20, 2026
2716fc6
Kernel for large backward softmax
RobBa May 20, 2026
b8e8622
CUDA version of FfLayer with unit tests
RobBa May 20, 2026
a644134
Updated unit tests
RobBa May 20, 2026
cde3a65
Fixed unit tests
RobBa May 20, 2026
54a423c
Clean up unit tests further, align indentation
RobBa May 20, 2026
d7bde01
backward on broadcast add
RobBa May 21, 2026
51ab71f
Fixed sum over dims on CPU for general case and implemented general c…
RobBa May 22, 2026
f5eef20
Implemented backward loss functions, generalized forward CE loss func…
RobBa May 23, 2026
ae003ee
Fixed compile time issues, added unit tests for backward of basic ten…
RobBa May 24, 2026
5cde0c6
More forward loss kernels, prepared unit tests for losses
RobBa May 24, 2026
4290ad9
Forward crossentropy and RMSE losses
RobBa May 25, 2026
01cd521
Preparing infrastructure for forward crossentropy softmax kernel
RobBa May 25, 2026
8045d39
Crossentropy softmax kernel forward, some restructuring of includes
RobBa May 25, 2026
e17cef4
Fix crossentropy forward kernel, simplify CPU version
RobBa May 30, 2026
cffa431
Unit tests for larger input in loss functions, bugfixes in crossentro…
RobBa May 30, 2026
6e005d1
Fixed all forward loss functions and eliminated error of uninitialize…
RobBa May 30, 2026
438dcd8
CUDA versions of optimizers
RobBa May 30, 2026
9e48d7b
Fix bug in loss
RobBa May 31, 2026
df332d0
CUDA Python unit tests
RobBa May 31, 2026
e6c0978
Fixing some subtle bugs, updated some kernels with type traits, clean…
RobBa May 31, 2026
4995c96
Fixed RMSProp and bug in RMSE backward, added unit tests for large lo…
RobBa May 31, 2026
10877a3
Fix bug in CUDA softmax backward, add unit test to capture gap, fix o…
RobBa May 31, 2026
895500a
Fix bug and python unit tests
RobBa May 31, 2026
c24769f
Merge branch 'main' into dev/cuda_and_performance
RobBa May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions .gitignore

This file was deleted.

17 changes: 16 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,22 @@ endif()
add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")

# TODO: add flag for double precision?
option(CUDA "Enable CUDA execution for some faster data structures" ON)

if (CUDA)
include(CheckLanguage)
check_language(CUDA)

if(CMAKE_CUDA_COMPILER)
add_definitions(-D__CUDA)
enable_language(CUDA)

set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
else()
message(WARNING "Could not find CUDA on system. Compiling without CUDA enabled")
endif()
endif()

# include python libs
if(APPLE)
Expand Down
4 changes: 2 additions & 2 deletions examples/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,10 @@ def evaluate(net, x, y_int, batch_size=256):
# setup
net = make_net()
loss_fn = CrossEntropyWithSoftmax()
optim = RmsProp(net.parameters(), 0.00001, 0.95) # lr and decay
optim = RmsProp(net.parameters(), 0.000001, 0.999) # lr and decay

# training loop
n_epochs = 10
n_epochs = 5
for epoch in range(n_epochs):
train_loss = train_epoch(net, loss_fn, optim, x_train, y_train)
val_acc = evaluate(net, x_val, y_val)
Expand Down
21 changes: 16 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,18 @@ For some examples on Python interface, see tests/python.
- Training framework (optimizers, loss functions, layers, and networks)
- **Example code**: Full MNIST dataset training example
- **Python Interface**: Seamless integration via Boost.Python
- **Clean Architecture**: Modular design, ~4K LOC
- **Clean Architecture**: Modular design, maintainable and extensible
- **CI/CD**: Automated testing with GTest and GitHub Actions

## Tech Stack

- C++17/20
- C++17/20/23
- CMake build system
- Boost.Python for Python bindings
- Python 3 for library interface and examples
- Google Test (GTest) and PyTest for unit testing
- GitHub Actions for CI/CD
- CUDA

## Current Status

Expand All @@ -40,7 +41,7 @@ Roadmap:
- [x] Python Binding Unit Tests
- [x] Optimizers and training framework
- [x] MNIST example
- [ ] CUDA mode for operations
- [x] CUDA mode for operations
- [ ] Additional layer types (Conv2D, Dropout, etc.)
- [ ] AlexNet reference implementation
- [ ] Docker deployment example
Expand All @@ -50,9 +51,18 @@ Roadmap:
mkdir build && cd build
cmake ..
make
ctest
```

### Building with CUDA

Project automatically detects whether CUDA is installed, and compiles with it.
If CUDA compilation not desired you can switch it off via

```bash
cmake --DCUDA=Off ..
```


## Running Unit Tests

Compile with building tests enabled:
Expand All @@ -61,7 +71,7 @@ Compile with building tests enabled:
mkdir build && cd build
cmake -DBUILD_TESTS=On ..
make
ctest
ctest .
```

## Required
Expand All @@ -73,6 +83,7 @@ ctest
- numpy 1.26.4
- pytest and GTest for unit tests (we use pytest=9.0.2)
- Google Benchmark for benchmarking
- CUDA (we use CUDA 13.1 on an RTX-5050)

## Troubleshooting

Expand Down
33 changes: 32 additions & 1 deletion src/backend/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ file(GLOB_RECURSE CORE_SOURCES
utility/*.cpp
)

if(CMAKE_CUDA_COMPILER)
file(GLOB_RECURSE CUDA_SOURCES
computational_graph/*.cu
data_modeling/*.cu
module/*.cu
system/*.cu
training/*.cu
utility/*.cu
)
list(APPEND CORE_SOURCES ${CUDA_SOURCES})
endif()

add_library(BackendCore SHARED ${CORE_SOURCES})

target_include_directories(BackendCore PUBLIC
Expand All @@ -15,4 +27,23 @@ target_include_directories(BackendCore PUBLIC

set_target_properties(BackendCore PROPERTIES
LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}" # make sure Python-modules see backend
)
)

if(CMAKE_CUDA_COMPILER)
set_target_properties(BackendCore PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
# nvidia-smi --query-gpu=compute_cap --format=csv,noheader
# I get 12.0, hence 120
#set(CMAKE_CUDA_ARCHITECTURES "75;86;89;100;120")
CMAKE_CUDA_ARCHITECTURES native
)

find_package(CUDAToolkit REQUIRED)
target_include_directories(BackendCore PRIVATE
${CUDAToolkit_INCLUDE_DIRS}
)
target_link_libraries(BackendCore
CUDA::cudart
)
endif()

Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
/**
* @file activation_nodes.cu
* @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
* @brief
* @version 0.1
* @date 2026-03-23
*
* @copyright Copyright (c) 2026
*
*/

#ifndef __CUDA
static_assert(false, "File should not be compiled without CUDA enabled");
#endif // __CUDA

#include "activation_nodes.cuh"
#include "utility/cuda/cuda_common.cuh"

using namespace std;

namespace {
/**
* @brief Relu backward kernel.
*/
__global__ void reluBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const parent, const tensorSize_t size) {
const int gid = blockIdx.x * blockDim.x + threadIdx.x;
if(gid >= size) {
return;
}

res[gid] = parent[gid] > 0 ? upstreamGrad[gid] : 0;
}

/**
* @brief Leaky relu backward kernel.
*/
__global__ void leakyReluBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const parent, const ftype eps, const tensorSize_t size) {
const int gid = blockIdx.x * blockDim.x + threadIdx.x;
if(gid >= size) {
return;
}

res[gid] = parent[gid] > 0 ? upstreamGrad[gid] : eps * upstreamGrad[gid];
}

/**
* @brief Sigmoid backward kernel, optimized by using the forward sigmoid.
*/
__global__ void sigmoidBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const sigmoid, const tensorSize_t size) {
const int gid = blockIdx.x * blockDim.x + threadIdx.x;
if(gid >= size) {
return;
}

ftype si = sigmoid[gid];
res[gid] = si * (1 - si) * upstreamGrad[gid];
}

/**
* @brief Softmax backward kernel. This kernel is different than others since it is warp aligned. The inner loop avoids shared memory bank
* conflicts by broadcasting.
*
* stridesWidthPerBlock is an awkward name. It is the product of number of strides per block (times) stride. We pre-compute it on host.
*/
__global__ void softmaxBackwardKernelOneBlock(ftype* const res, const ftype* const upstreamGrad, const ftype* const softmax,
const tensorSize_t stride, const int stridesWidthPerBlock, const int threadsPerStride, tensorSize_t size) {
const int tid = threadIdx.x;

const int withinStrideOffset = tid % threadsPerStride;
const int strideOffset = (tid / threadsPerStride) * stride;

const int gid = blockIdx.x * stridesWidthPerBlock + strideOffset + withinStrideOffset;
const bool isPadded = (withinStrideOffset >= stride) || (gid >= size); // padded threads only exists to align warps with strides

ftype yi = 0;
const int smemOffset = strideOffset + withinStrideOffset;

extern __shared__ ftype smem[];
if(!isPadded) {
yi = softmax[gid];
smem[smemOffset] = yi;
smem[smemOffset + stridesWidthPerBlock] = upstreamGrad[gid];
}
__syncthreads();

if(isPadded) {
return;
}

ftype grad = 0;
for(int j = 0; j < stride; j++) {
// warp alignment -> smem-reads are broadcasted per warp -> no bank conflicts
ftype yj = smem[strideOffset + j];
ftype gj = smem[strideOffset + j + stridesWidthPerBlock];

auto jacobian = (withinStrideOffset == j) ? yi * (1 - yj) : -yi * yj;
grad += gj * jacobian;
}

res[gid] = grad;
}

/**
* @brief Large softmax pass. Because the stride now does not fit into one block anymore we do a grid-stride loop.
*/
__global__ void softmaxBackwardKernelLargePass(ftype* const res, const ftype* const upstreamGrad, const ftype* const softmax, const int blocksPerStride, const tensorSize_t stride) {
const int strideNumber = blockIdx.x / blocksPerStride;
const int strideOffset = strideNumber * stride;
const int i = (blockIdx.x % blocksPerStride) * blockDim.x + threadIdx.x;
// blockIdx.x % blocksPerStride = block number within this stride

const int tid = threadIdx.x;
const int gid = strideOffset + i;

extern __shared__ ftype smem[];

const bool isNotPadded = i < stride;
const ftype yi = isNotPadded ? softmax[gid] : 0;

ftype grad = 0;
for(int offset = 0; offset < stride; offset += blockDim.x) {
// load into smem
{
const int j = offset + tid;
if(j < stride) {
smem[tid] = softmax[strideOffset + j];
smem[tid + blockDim.x] = upstreamGrad[strideOffset + j];
}
__syncthreads();
}


for(int k = 0; k < blockDim.x; k++) {
const int j = offset + k;
if(j < stride) {
ftype yj = smem[k];
ftype gj = smem[k + blockDim.x];

auto jacobian = (i == j) ? yi * (1 - yj) : -yi * yj;
grad += gj * jacobian;
}
}
__syncthreads();
}

if(isNotPadded) {
res[gid] = grad;
}
}
}

namespace cuda_impl {
void reluBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& parent) {
constexpr int threadsPerBlock = 256;
const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;

reluBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), parent.getData(), res.getSize());
cudaErrchk(cudaDeviceSynchronize());
}

void leakyReluBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& parent, ftype eps) {
constexpr int threadsPerBlock = 256;
const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;

leakyReluBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), parent.getData(), eps, res.getSize());
cudaErrchk(cudaDeviceSynchronize());
}

void sigmoidBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& sigmoid) {
constexpr int threadsPerBlock = 256;
const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;

sigmoidBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), sigmoid.getData(), res.getSize());
cudaErrchk(cudaDeviceSynchronize());
}

/**
* @brief The backward of the softmax. Due to optimization this function distinguishes three cases of stride size, where stride
* is the size of the dimension the softmax operation is applied to. The two cases are a stride either fitting into one block or not.
*/
void softmaxBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& softmax) {
assert(upstreamGrad.getSize() == softmax.getSize());

constexpr int maxThreadsPerBlock = 256;
const int stride = softmax.getDims()[-1];

if(stride < maxThreadsPerBlock) {
const int threadsPerStride = max(1, ((stride + 31) / 32)) * 32; // == warps per stride * 32

// min over maximum possible strides per block and actual number of strides
const int stridesPerBlock = min(maxThreadsPerBlock / threadsPerStride, softmax.getSize() / stride);
const int strideWidthPerBlock = stridesPerBlock * stride; // for smem idx computation

int threadsPerBlock = 1;
while(threadsPerBlock < threadsPerStride * stridesPerBlock) threadsPerBlock <<= 1;
// threadsPerBlock now larger than threadsPerStride * stridesPerBlock
const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;

softmaxBackwardKernelOneBlock<<<blocks, threadsPerBlock, 2 * strideWidthPerBlock * sizeof(ftype)>>>(
res.getData(), upstreamGrad.getData(), softmax.getData(), stride, strideWidthPerBlock, threadsPerStride, softmax.getSize());
}
else {
constexpr int maxThreadsPerBlock = 256;

const int nStrides = softmax.getSize() / stride;
const int threadsPerBlock = maxThreadsPerBlock; // TODO: do that one better, this can result in gross imbalance; also for normal softmax
const int blocksPerStride = (stride + threadsPerBlock - 1) / threadsPerBlock;

softmaxBackwardKernelLargePass<<<blocksPerStride * nStrides, threadsPerBlock, 2 * threadsPerBlock * sizeof(ftype)>>>(
res.getData(), upstreamGrad.getData(), softmax.getData(), blocksPerStride, stride);
}
cudaErrchk(cudaDeviceSynchronize());
}
}
Loading
Loading