Rust-GPU · karpenkoag · Dec 22, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -47,6 +47,8 @@ members = [
   "examples/gemm/kernels",
   "examples/i128_demo",
   "examples/i128_demo/kernels",
+  "examples/sdot",
+  "examples/sdot/kernels",
   "examples/sha2_crates_io",
   "examples/sha2_crates_io/kernels",
   "examples/vecadd",

diff --git a/examples/README.md b/examples/README.md
@@ -21,3 +21,6 @@ which runs on CPU or GPU, with the additional option of running OptiX denoising.
 
 The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate
 for CPU rendering and sharing structures.
+
+### [sdot](sdot)
+Example of computes the dot product of two single-precision (f32) vectors
diff --git a/examples/sdot/Cargo.toml b/examples/sdot/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "sdot"
+version = "0.1.0"
+edition = "2024"
+description = "Example of computes the dot product of two f32 vectors"
+
+[dependencies]
+cust = { path = "../../crates/cust" }
+blastoff = { path = "../../crates/blastoff" }
+rand = "0.9.*"
+
+[build-dependencies]
+cuda_builder = { workspace = true, default-features = false }
diff --git a/examples/sdot/build.rs b/examples/sdot/build.rs
@@ -0,0 +1,63 @@
+use std::env;
+use std::path;
+use std::process::{Command, Stdio};
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let kernels_path = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()).join("kernels");
+
+    CudaBuilder::new(kernels_path.as_path())
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+
+    // Generate PTX from native CUDA kernels
+    let cuda_kernel_path = kernels_path.join("cuda/sdot.cu");
+
+    println!("cargo::rerun-if-changed={}", cuda_kernel_path.display());
+
+    let cuda_ptx = out_path.join("kernels_cuda_mangles.ptx");
+    let mut nvcc = Command::new("nvcc");
+    nvcc.arg("--ptx")
+        .args(["--Werror", "all-warnings"])
+        .args(["--output-directory", out_path.as_os_str().to_str().unwrap()])
+        .args(["-o", cuda_ptx.as_os_str().to_str().unwrap()])
+        .arg(cuda_kernel_path.as_path());
+
+    let build = nvcc
+        .stderr(Stdio::inherit())
+        .output()
+        .expect("failed to execute nvcc kernel build");
+
+    assert!(build.status.success());
+
+    // Decodes (demangles) low-level identifiers
+    let cat_out = Command::new("cat")
+        .arg(cuda_ptx)
+        .stdout(Stdio::piped())
+        .stderr(Stdio::inherit())
+        .spawn()
+        .expect("Failed to start cat process")
+        .stdout
+        .expect("Failed to open cat stdout");
+
+    let outputs = std::fs::File::create(out_path.join("kernels_cuda.ptx"))
+        .expect("Can not open output ptc kernel file");
+
+    let filt_out = Command::new("cu++filt")
+        .arg("-p")
+        .stdin(Stdio::from(cat_out))
+        .stdout(Stdio::from(outputs))
+        .stderr(Stdio::inherit())
+        .spawn()
+        .expect("Failed to start cu++filt process")
+        .wait_with_output()
+        .expect("Failed to wait on cu++filt");
+
+    assert!(filt_out.status.success());
+}
diff --git a/examples/sdot/kernels/Cargo.toml b/examples/sdot/kernels/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "sdot_kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../crates/cuda_std" }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/examples/sdot/kernels/cuda/sdot.cu b/examples/sdot/kernels/cuda/sdot.cu
@@ -0,0 +1,36 @@
+// Here we use `unsigned long` to match the Rust version `usize`.
+__global__ void sdot(const float *x, unsigned long x_n, const float *y, unsigned long y_n, float *out)
+{
+
+    extern __shared__ float shared_sum[];
+    unsigned int i;
+
+    unsigned int num_threads = gridDim.x * blockDim.x;
+    unsigned int start_ind = blockDim.x * blockIdx.x;
+    unsigned int tid = threadIdx.x;
+
+    float sum = 0.0f;
+    for (i = start_ind + tid; i < x_n; i += num_threads)
+    {
+        // Rust checks emulation
+        if (i >= y_n)
+            __trap();
+
+        sum += x[i] * y[i];
+    }
+    shared_sum[tid] = sum;
+
+    for (i = blockDim.x >> 1; i > 0; i >>= 1)
+    {
+        __syncthreads();
+        if (tid < i)
+        {
+            shared_sum[tid] += shared_sum[tid + i];
+        }
+    }
+
+    if (tid == 0)
+    {
+        out[blockIdx.x] = shared_sum[tid];
+    }
+}
diff --git a/examples/sdot/kernels/src/lib.rs b/examples/sdot/kernels/src/lib.rs
@@ -0,0 +1,37 @@
+use cuda_std::{kernel, shared, thread};
+
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) {
+    let shared_sum = shared::dynamic_shared_mem::<f32>();
+
+    let num_threads = (thread::grid_dim_x() as usize) * (thread::block_dim_x() as usize);
+    let start_ind = (thread::block_dim_x() as usize) * (thread::block_idx_x() as usize);
+    let tid = thread::thread_idx_x() as usize;
+
+    let mut sum = 0f32;
+    for i in ((start_ind + tid)..x.len()).step_by(num_threads) {
+        sum += x[i] * y[i];
+    }
+    unsafe {
+        *shared_sum.add(tid) = sum;
+    }
+
+    let mut i = (thread::block_dim_x() >> 1) as usize;
+    while i > 0 {
+        thread::sync_threads();
+        if tid < i {
+            unsafe {
+                *shared_sum.add(tid) += *shared_sum.add(tid + i);
+            }
+        }
+
+        i >>= 1;
+    }
+
+    if tid == 0 {
+        unsafe {
+            *out.add(thread::block_idx_x() as usize) = *shared_sum.add(tid);
+        }
+    }
+}