RobBa · RobBa · May 31, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 23, 2026
diff --git a/.gitignore b/.gitignore
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -27,7 +27,22 @@ endif()
 add_compile_options("$<$<C_COMPILER_ID:MSVC>:/utf-8>")
 add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")
 
-# TODO: add flag for double precision?
+# User switch for the CUDA backend. When it is ON but no CUDA compiler can be
+# found, the build falls back to a CPU-only configuration with a warning.
+option(CUDA "Enable CUDA execution for some faster data structures" ON)
+
+if(CUDA)
+  include(CheckLanguage)
+  check_language(CUDA)
+
+  if(CMAKE_CUDA_COMPILER)
+    # Directory-wide define: the .cu sources refuse to compile without __CUDA.
+    add_compile_definitions(__CUDA)
+    enable_language(CUDA)
+
+    set(CMAKE_CUDA_STANDARD 20)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+  else()
+    message(WARNING "Could not find CUDA on system. Compiling without CUDA enabled")
+  endif()
+else()
+  # check_language()/enable_language() store CMAKE_CUDA_COMPILER in the cache.
+  # Drop a value left over from an earlier configure so that subdirectories
+  # testing if(CMAKE_CUDA_COMPILER) really skip CUDA after -DCUDA=Off.
+  unset(CMAKE_CUDA_COMPILER CACHE)
+  unset(CMAKE_CUDA_COMPILER)
+endif()
 
 # include python libs
 if(APPLE)

diff --git a/examples/mnist.py b/examples/mnist.py
@@ -125,10 +125,10 @@ def evaluate(net, x, y_int, batch_size=256):
     # setup
     net = make_net()
     loss_fn = CrossEntropyWithSoftmax()
-    optim = RmsProp(net.parameters(), 0.00001, 0.95)  # lr and decay
+    optim = RmsProp(net.parameters(), 0.000001, 0.999)  # lr and decay
 
     # training loop
-    n_epochs = 10
+    n_epochs = 5
     for epoch in range(n_epochs):
         train_loss = train_epoch(net, loss_fn, optim, x_train, y_train)
         val_acc = evaluate(net, x_val, y_val)

diff --git a/readme.md b/readme.md
@@ -20,17 +20,18 @@ For some examples on Python interface, see tests/python.
   - Training framework (optimizers, loss functions, layers, and networks)
 - **Example code**: Full MNIST dataset training example
 - **Python Interface**: Seamless integration via Boost.Python
-- **Clean Architecture**: Modular design, ~4K LOC
+- **Clean Architecture**: Modular design, maintainable and extensible
 - **CI/CD**: Automated testing with GTest and GitHub Actions
 
 ## Tech Stack
 
-- C++17/20
+- C++17/20/23
 - CMake build system
 - Boost.Python for Python bindings
 - Python 3 for library interface and examples
 - Google Test (GTest) and PyTest for unit testing
 - GitHub Actions for CI/CD
+- CUDA
 
 ## Current Status
 
@@ -40,7 +41,7 @@ Roadmap:
 - [x] Python Binding Unit Tests
 - [x] Optimizers and training framework
 - [x] MNIST example
-- [ ] CUDA mode for operations
+- [x] CUDA mode for operations
 - [ ] Additional layer types (Conv2D, Dropout, etc.)
 - [ ] AlexNet reference implementation
 - [ ] Docker deployment example
@@ -50,9 +51,18 @@ Roadmap:
 mkdir build && cd build
 cmake ..
 make
-ctest
 ```
 
+### Building with CUDA
+
+The project automatically detects whether CUDA is installed and, if so, compiles with it.
+If CUDA compilation is not desired, you can switch it off via:
+
+```bash
+cmake -DCUDA=Off ..
+```
+
+
 ## Running Unit Tests
 
 Compile with building tests enabled: 
@@ -61,7 +71,7 @@ Compile with building tests enabled:
 mkdir build && cd build
 cmake -DBUILD_TESTS=On ..
 make
-ctest 
+ctest .
 ```
 
 ## Required
@@ -73,6 +83,7 @@ ctest
 - numpy 1.26.4
 - pytest and GTest for unit tests (we use pytest=9.0.2)
 - Google Benchmark for benchmarking
+- CUDA (we use CUDA 13.1 on an RTX-5050)
 
 ## Troubleshooting
 

diff --git a/src/backend/CMakeLists.txt b/src/backend/CMakeLists.txt
@@ -7,6 +7,18 @@ file(GLOB_RECURSE CORE_SOURCES
     utility/*.cpp
 )
 
+# Add the CUDA translation units only when a CUDA compiler was detected.
+if(CMAKE_CUDA_COMPILER)
+    # CONFIGURE_DEPENDS makes the build re-check the glob, so a newly added
+    # .cu file is picked up without a manual re-configure.
+    file(GLOB_RECURSE CUDA_SOURCES CONFIGURE_DEPENDS
+        computational_graph/*.cu
+        data_modeling/*.cu
+        module/*.cu
+        system/*.cu
+        training/*.cu
+        utility/*.cu
+    )
+    list(APPEND CORE_SOURCES ${CUDA_SOURCES})
+endif()
+
 add_library(BackendCore SHARED ${CORE_SOURCES})
 
 target_include_directories(BackendCore PUBLIC
@@ -15,4 +27,23 @@ target_include_directories(BackendCore PUBLIC
 
 set_target_properties(BackendCore PROPERTIES
     LIBRARY_OUTPUT_DIRECTORY "${PYTHON_MODULE_DIR}" # make sure Python-modules see backend
-)
+)
+
+# CUDA-specific build settings for BackendCore.
+if(CMAKE_CUDA_COMPILER)
+    # CUDA_ARCHITECTURES is the target property; CMAKE_CUDA_ARCHITECTURES is only
+    # the variable that initialises it and has no effect when used as a property.
+    # "native" compiles for the GPU(s) of the build machine (needs CMake >= 3.24).
+    # To pin architectures instead, query the compute capability with
+    #   nvidia-smi --query-gpu=compute_cap --format=csv,noheader
+    # (e.g. 12.0 -> 120) and pass a list such as "75;86;89;100;120".
+    set_target_properties(BackendCore PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON
+        CUDA_ARCHITECTURES native
+    )
+
+    find_package(CUDAToolkit REQUIRED)
+    target_include_directories(BackendCore PRIVATE
+        ${CUDAToolkit_INCLUDE_DIRS}
+    )
+    target_link_libraries(BackendCore
+        CUDA::cudart
+    )
+endif()
+
diff --git a/src/backend/computational_graph/activation_functions/cuda/activation_nodes.cu b/src/backend/computational_graph/activation_functions/cuda/activation_nodes.cu
@@ -0,0 +1,214 @@
+/**
+ * @file activation_nodes.cu
+ * @author Robert Baumgartner (r.baumgartner-1@tudelft.nl)
+ * @brief
+ * @version 0.1
+ * @date 2026-03-23
+ *
+ * @copyright Copyright (c) 2026
+ *
+ */
+
+#ifndef __CUDA
+static_assert(false, "File should not be compiled without CUDA enabled");
+#endif // __CUDA
+
+#include "activation_nodes.cuh"
+#include "utility/cuda/cuda_common.cuh"
+
+using namespace std;
+
+namespace {
+  /**
+   * @brief Element-wise backward pass of relu: one thread per element.
+   *
+   * Writes upstreamGrad[i] to res[i] where parent[i] is positive and 0 elsewhere.
+   * Threads whose index is not below size do nothing.
+   */
+  __global__ void reluBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const parent, const tensorSize_t size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < size) {
+      // the gradient only flows through positions with a positive parent value
+      res[idx] = parent[idx] > 0 ? upstreamGrad[idx] : 0;
+    }
+  }
+
+  /**
+   * @brief Element-wise backward pass of leaky relu: one thread per element.
+   *
+   * Passes upstreamGrad[i] through unchanged where parent[i] is positive and
+   * scales it by eps elsewhere. Threads whose index is not below size do nothing.
+   */
+  __global__ void leakyReluBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const parent, const ftype eps, const tensorSize_t size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < size) {
+      // eps acts as the slope applied to the gradient on the non-positive side
+      res[idx] = parent[idx] > 0 ? upstreamGrad[idx] : eps * upstreamGrad[idx];
+    }
+  }
+
+  /**
+   * @brief Element-wise backward pass of the sigmoid: one thread per element.
+   *
+   * Reuses the already computed forward result s = sigmoid[i] and writes
+   * s * (1 - s) * upstreamGrad[i] to res[i]. Threads whose index is not below
+   * size do nothing.
+   */
+  __global__ void sigmoidBackwardKernel(ftype* const res, const ftype* const upstreamGrad, const ftype* const sigmoid, const tensorSize_t size) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(idx < size) {
+      const ftype s = sigmoid[idx];
+      res[idx] = s * (1 - s) * upstreamGrad[idx];
+    }
+  }
+
+  /**
+   * @brief Softmax backward kernel for strides (softmax rows) that fit into a single block.
+   *
+   * Every stride gets threadsPerStride threads. The host passes a multiple of 32 for it, so a warp never spans two
+   * strides and, in the inner loop, all active threads of a warp read the same shared-memory element (a broadcast
+   * rather than a bank conflict). Threads that do not map to an element are padding; they only take part in the barrier.
+   *
+   * Dynamic shared memory holds 2 * stridesWidthPerBlock elements: first the softmax values of the block's strides,
+   * then the matching upstream gradients.
+   *
+   * @param res Output, gradient with respect to the softmax input.
+   * @param upstreamGrad Gradient with respect to the softmax output.
+   * @param softmax Result of the forward softmax.
+   * @param stride Length of the dimension the softmax was applied to.
+   * @param stridesWidthPerBlock Number of strides handled per block times stride, pre-computed on host.
+   * @param threadsPerStride Threads reserved per stride.
+   * @param size Total number of elements.
+   */
+  __global__ void softmaxBackwardKernelOneBlock(ftype* const res, const ftype* const upstreamGrad, const ftype* const softmax,
+                                                const tensorSize_t stride, const int stridesWidthPerBlock, const int threadsPerStride, tensorSize_t size) {
+    const int tid = threadIdx.x;
+
+    const int withinStrideOffset = tid % threadsPerStride;
+    const int strideOffset = (tid / threadsPerStride) * stride;
+
+    const int gid = blockIdx.x * stridesWidthPerBlock + strideOffset + withinStrideOffset;
+
+    // A thread is padding when
+    //  - it is warp-alignment filler behind the last element of its stride,
+    //  - the block was launched with more threads than strides fit into it: such a thread would point at a stride
+    //    owned by the next block and would index past the shared-memory buffer,
+    //  - it lies behind the end of the tensor.
+    const bool isPadded = (withinStrideOffset >= stride) || (strideOffset >= stridesWidthPerBlock) || (gid >= size);
+
+    ftype yi = 0;
+    const int smemOffset = strideOffset + withinStrideOffset;
+
+    extern __shared__ ftype smem[];
+    if(!isPadded) {
+      yi = softmax[gid];
+      smem[smemOffset] = yi;
+      smem[smemOffset + stridesWidthPerBlock] = upstreamGrad[gid];
+    }
+    // padded threads must reach the barrier too, hence they return only afterwards
+    __syncthreads();
+
+    if(isPadded) {
+      return;
+    }
+
+    // res_i = sum_j g_j * dy_j/dx_i with dy_j/dx_i = y_i * (1 - y_j) for i == j and -y_i * y_j otherwise
+    ftype grad = 0;
+    for(int j = 0; j < stride; j++) {
+      // warp alignment -> smem-reads are broadcasted per warp -> no bank conflicts
+      ftype yj = smem[strideOffset + j];
+      ftype gj = smem[strideOffset + j + stridesWidthPerBlock];
+
+      auto jacobian = (withinStrideOffset == j) ? yi * (1 - yj) : -yi * yj;
+      grad += gj * jacobian;
+    }
+
+    res[gid] = grad;
+  }
+
+  /**
+   * @brief Softmax backward kernel for strides that do not fit into one block.
+   *
+   * Each stride is covered by blocksPerStride consecutive blocks and every thread owns at most one element i of its
+   * stride. Because the gradient of element i needs all elements j of the same stride, each block walks over the
+   * whole stride in tiles of blockDim.x elements: the tile is staged in shared memory by all threads of the block and
+   * then consumed by every thread.
+   *
+   * Dynamic shared memory holds 2 * blockDim.x elements: first the softmax values of the current tile, then the
+   * matching upstream gradients.
+   */
+  __global__ void softmaxBackwardKernelLargePass(ftype* const res, const ftype* const upstreamGrad, const ftype* const softmax, const int blocksPerStride, const tensorSize_t stride) {    
+    const int strideNumber = blockIdx.x / blocksPerStride;
+    const int strideOffset = strideNumber * stride;
+    const int i = (blockIdx.x % blocksPerStride) * blockDim.x + threadIdx.x;
+    // blockIdx.x % blocksPerStride = block number within this stride, so i is the position inside the stride
+
+    const int tid = threadIdx.x;
+    const int gid = strideOffset + i;
+
+    extern __shared__ ftype smem[];
+
+    // threads behind the end of the stride own no element, but still help loading the tiles below
+    const bool isNotPadded = i < stride;
+    const ftype yi = isNotPadded ? softmax[gid] : 0;
+
+    ftype grad = 0;
+    for(int offset = 0; offset < stride; offset += blockDim.x) {
+      // load the tile [offset, offset + blockDim.x) of this stride into smem
+      {
+        const int j = offset + tid;
+        if(j < stride) {
+          smem[tid] = softmax[strideOffset + j];
+          smem[tid + blockDim.x] = upstreamGrad[strideOffset + j];
+        }
+        // tile must be complete before any thread reads it
+        __syncthreads();
+      }
+
+
+      // accumulate the contribution of every element j of the tile: g_j * dy_j/dx_i
+      for(int k = 0; k < blockDim.x; k++) {
+        const int j = offset + k;
+        if(j < stride) {
+          ftype yj = smem[k];
+          ftype gj = smem[k + blockDim.x];
+
+          auto jacobian = (i == j) ? yi * (1 - yj) : -yi * yj;
+          grad += gj * jacobian;
+        }
+      }
+      // all threads must be done reading before the next iteration overwrites the tile
+      __syncthreads();
+    }
+
+    if(isNotPadded) {
+      res[gid] = grad;
+    } 
+  }
+}
+
+namespace cuda_impl {
+  /**
+   * @brief Launches the relu backward kernel with one thread per element and waits for it to finish.
+   *
+   * @param res Receives the resulting gradient.
+   * @param upstreamGrad Gradient arriving from the child node.
+   * @param parent Tensor whose sign decides where the gradient passes through.
+   */
+  void reluBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& parent) {
+    constexpr int threadsPerBlock = 256;
+    if(upstreamGrad.getSize() == 0) {
+      return; // nothing to do; a launch with zero blocks would be an invalid configuration
+    }
+
+    const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;
+
+    reluBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), parent.getData(), res.getSize());
+    cudaErrchk(cudaGetLastError()); // surfaces errors of the launch itself (e.g. invalid configuration)
+    cudaErrchk(cudaDeviceSynchronize());
+  }
+
+  /**
+   * @brief Launches the leaky relu backward kernel with one thread per element and waits for it to finish.
+   *
+   * @param res Receives the resulting gradient.
+   * @param upstreamGrad Gradient arriving from the child node.
+   * @param parent Tensor whose sign decides whether the gradient is passed through or scaled.
+   * @param eps Factor applied to the gradient where parent is not positive.
+   */
+  void leakyReluBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& parent, ftype eps) {
+    constexpr int threadsPerBlock = 256;
+    if(upstreamGrad.getSize() == 0) {
+      return; // nothing to do; a launch with zero blocks would be an invalid configuration
+    }
+
+    const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;
+
+    leakyReluBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), parent.getData(), eps, res.getSize());
+    cudaErrchk(cudaGetLastError()); // surfaces errors of the launch itself (e.g. invalid configuration)
+    cudaErrchk(cudaDeviceSynchronize());
+  }
+
+  /**
+   * @brief Launches the sigmoid backward kernel with one thread per element and waits for it to finish.
+   *
+   * @param res Receives the resulting gradient.
+   * @param upstreamGrad Gradient arriving from the child node.
+   * @param sigmoid Result of the forward sigmoid, reused to form the derivative.
+   */
+  void sigmoidBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& sigmoid) {
+    constexpr int threadsPerBlock = 256;
+    if(upstreamGrad.getSize() == 0) {
+      return; // nothing to do; a launch with zero blocks would be an invalid configuration
+    }
+
+    const int blocks = (upstreamGrad.getSize() + threadsPerBlock - 1) / threadsPerBlock;
+
+    sigmoidBackwardKernel<<<blocks, threadsPerBlock>>>(res.getData(), upstreamGrad.getData(), sigmoid.getData(), res.getSize());
+    cudaErrchk(cudaGetLastError()); // surfaces errors of the launch itself (e.g. invalid configuration)
+    cudaErrchk(cudaDeviceSynchronize());
+  }
+
+  /**
+   * @brief The backward of the softmax. For optimization this function distinguishes two cases of stride size, where
+   * stride is the size of the dimension the softmax operation is applied to: the stride either fits into one block
+   * (several strides may then share a block) or it does not (several blocks then share a stride).
+   *
+   * @param res Receives the gradient with respect to the softmax input.
+   * @param upstreamGrad Gradient with respect to the softmax output; must have the same size as softmax.
+   * @param softmax Result of the forward softmax.
+   */
+  void softmaxBackward(Tensor& res, const Tensor& upstreamGrad, const Tensor& softmax) {
+    assert(upstreamGrad.getSize() == softmax.getSize());
+
+    constexpr int maxThreadsPerBlock = 256;
+    const int stride = softmax.getDims()[-1];
+    if(stride <= 0) {
+      return; // empty softmax dimension: nothing to compute, and it would divide by zero below
+    }
+
+    const int nStrides = softmax.getSize() / stride;
+    if(nStrides <= 0) {
+      return; // empty tensor: a launch with zero blocks would be an invalid configuration
+    }
+
+    if(stride < maxThreadsPerBlock) {
+      const int threadsPerStride = ((stride + 31) / 32) * 32; // == warps per stride * 32
+
+      // min over maximum possible strides per block and actual number of strides
+      const int stridesPerBlock = min(maxThreadsPerBlock / threadsPerStride, nStrides);
+      const int strideWidthPerBlock = stridesPerBlock * stride; // for smem idx computation
+
+      // Exactly the threads the kernel can map to a stride. Rounding up further would create threads that point
+      // into the next block's strides.
+      const int threadsPerBlock = threadsPerStride * stridesPerBlock;
+
+      // One block covers stridesPerBlock strides, i.e. strideWidthPerBlock elements (not threadsPerBlock elements,
+      // because of the padding threads), so the grid has to be sized in strides.
+      const int blocks = (nStrides + stridesPerBlock - 1) / stridesPerBlock;
+
+      softmaxBackwardKernelOneBlock<<<blocks, threadsPerBlock, 2 * strideWidthPerBlock * sizeof(ftype)>>>(
+          res.getData(), upstreamGrad.getData(), softmax.getData(), stride, strideWidthPerBlock, threadsPerStride, softmax.getSize());
+    }
+    else {
+      const int threadsPerBlock = maxThreadsPerBlock; // TODO: do that one better, this can result in gross imbalance; also for normal softmax
+      const int blocksPerStride = (stride + threadsPerBlock - 1) / threadsPerBlock;
+
+      softmaxBackwardKernelLargePass<<<blocksPerStride * nStrides, threadsPerBlock, 2 * threadsPerBlock * sizeof(ftype)>>>(
+                                       res.getData(), upstreamGrad.getData(), softmax.getData(), blocksPerStride, stride);
+    }
+    cudaErrchk(cudaGetLastError()); // surfaces errors of the launch itself (e.g. invalid configuration)
+    cudaErrchk(cudaDeviceSynchronize());
+  }
+}