Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ members = [
"examples/gemm/kernels",
"examples/i128_demo",
"examples/i128_demo/kernels",
"examples/sdot",
"examples/sdot/kernels",
"examples/sha2_crates_io",
"examples/sha2_crates_io/kernels",
"examples/vecadd",
Expand Down
3 changes: 3 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ which runs on CPU or GPU, with the additional option of running OptiX denoising.

The Path Tracer uses cuda_builder to compile the core path tracer for the GPU and GPU (hardware raytracing), and uses the core path tracer as a normal crate
for CPU rendering and sharing structures.

### [sdot](sdot)
Example of computes the dot product of two single-precision (f32) vectors
13 changes: 13 additions & 0 deletions examples/sdot/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[package]
name = "sdot"
version = "0.1.0"
edition = "2024"
description = "Example of computes the dot product of two f32 vectors"

[dependencies]
cust = { path = "../../crates/cust" }
blastoff = { path = "../../crates/blastoff" }
rand = "0.9.*"

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
63 changes: 63 additions & 0 deletions examples/sdot/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
use std::env;
use std::path;
use std::process::{Command, Stdio};

use cuda_builder::CudaBuilder;

fn main() {
println!("cargo::rerun-if-changed=build.rs");
println!("cargo::rerun-if-changed=kernels");

let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
let kernels_path = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()).join("kernels");

CudaBuilder::new(kernels_path.as_path())
.copy_to(out_path.join("kernels.ptx"))
.build()
.unwrap();

// Generate PTX from native CUDA kernels
let cuda_kernel_path = kernels_path.join("cuda/sdot.cu");

println!("cargo::rerun-if-changed={}", cuda_kernel_path.display());

let cuda_ptx = out_path.join("kernels_cuda_mangles.ptx");
let mut nvcc = Command::new("nvcc");
nvcc.arg("--ptx")
.args(["--Werror", "all-warnings"])
.args(["--output-directory", out_path.as_os_str().to_str().unwrap()])
.args(["-o", cuda_ptx.as_os_str().to_str().unwrap()])
.arg(cuda_kernel_path.as_path());

let build = nvcc
.stderr(Stdio::inherit())
.output()
.expect("failed to execute nvcc kernel build");

assert!(build.status.success());

// Decodes (demangles) low-level identifiers
let cat_out = Command::new("cat")
.arg(cuda_ptx)
.stdout(Stdio::piped())
.stderr(Stdio::inherit())
.spawn()
.expect("Failed to start cat process")
.stdout
.expect("Failed to open cat stdout");

let outputs = std::fs::File::create(out_path.join("kernels_cuda.ptx"))
.expect("Can not open output ptc kernel file");

let filt_out = Command::new("cu++filt")
.arg("-p")
.stdin(Stdio::from(cat_out))
.stdout(Stdio::from(outputs))
.stderr(Stdio::inherit())
.spawn()
.expect("Failed to start cu++filt process")
.wait_with_output()
.expect("Failed to wait on cu++filt");

assert!(filt_out.status.success());
}
10 changes: 10 additions & 0 deletions examples/sdot/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "sdot_kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
36 changes: 36 additions & 0 deletions examples/sdot/kernels/cuda/sdot.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Here we use `unsigned long` to match the Rust version `usize`.
__global__ void sdot(const float *x, unsigned long x_n, const float *y, unsigned long y_n, float *out)
{

extern __shared__ float shared_sum[];
unsigned int i;

unsigned int num_threads = gridDim.x * blockDim.x;
unsigned int start_ind = blockDim.x * blockIdx.x;
unsigned int tid = threadIdx.x;

float sum = 0.0f;
for (i = start_ind + tid; i < x_n; i += num_threads)
{
// Rust checks emulation
if (i >= y_n)
__trap();

sum += x[i] * y[i];
}
shared_sum[tid] = sum;

for (i = blockDim.x >> 1; i > 0; i >>= 1)
{
__syncthreads();
if (tid < i)
{
shared_sum[tid] += shared_sum[tid + i];
}
}

if (tid == 0)
{
out[blockIdx.x] = shared_sum[tid];
}
}
37 changes: 37 additions & 0 deletions examples/sdot/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
use cuda_std::{kernel, shared, thread};

#[kernel]
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
pub unsafe fn sdot(x: &[f32], y: &[f32], out: *mut f32) {
let shared_sum = shared::dynamic_shared_mem::<f32>();

let num_threads = (thread::grid_dim_x() as usize) * (thread::block_dim_x() as usize);
let start_ind = (thread::block_dim_x() as usize) * (thread::block_idx_x() as usize);
let tid = thread::thread_idx_x() as usize;

let mut sum = 0f32;
for i in ((start_ind + tid)..x.len()).step_by(num_threads) {
sum += x[i] * y[i];
}
unsafe {
*shared_sum.add(tid) = sum;
}

let mut i = (thread::block_dim_x() >> 1) as usize;
while i > 0 {
thread::sync_threads();
if tid < i {
unsafe {
*shared_sum.add(tid) += *shared_sum.add(tid + i);
}
}

i >>= 1;
}

if tid == 0 {
unsafe {
*out.add(thread::block_idx_x() as usize) = *shared_sum.add(tid);
}
}
}
Loading