Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 59 additions & 26 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,14 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS "-O3 -march=native")

option(ENABLE_SANITIZERS "Enable Clang sanitizers" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE" OFF)
option(ENABLE_SYCL_REFERENCE "Enable SYCL kernels in reference backend" OFF)

# Vendor backends
option(ENABLE_ONEMKL_SYCL "Enable oneMKL (SYCL) vendor backend" OFF)
option(ENABLE_ARMPL "Enable ArmPL vendor backend" OFF)
option(ENABLE_ROCSPARSE "Enable rocSPARSE vendor backend" OFF)
option(ENABLE_CUSPARSE "Enable cuSPARSE vendor backend" OFF)
option(ENABLE_AOCLSPARSE "Enable AOCL-Sparse vendor backend" OFF)

# Get includes, which declares the `spblas` library
add_subdirectory(include)
Expand All @@ -20,9 +26,43 @@ endif()
# Download dependencies
include(FetchContent)

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# Initialize backend flags
set(SPBLAS_CPU_BACKEND OFF)
set(SPBLAS_GPU_BACKEND OFF)

if (ENABLE_SYCL_REFERENCE)
if (ENABLE_ONEMKL_SYCL OR ENABLE_ARMPL OR ENABLE_ROCSPARSE OR ENABLE_CUSPARSE OR ENABLE_AOCLSPARSE)
message(FATAL_ERROR "SYCL reference backend cannot be enabled together with vendor backends")
endif()

# Check for SYCL support
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-fsycl" COMPILER_SUPPORTS_SYCL)

FetchContent_Declare(
sycl_thrust
GIT_REPOSITORY https://github.com/SparseBLAS/sycl-thrust.git
GIT_TAG main)
FetchContent_MakeAvailable(sycl_thrust)

if(COMPILER_SUPPORTS_SYCL)
target_compile_options(spblas INTERFACE -fsycl -fsycl-device-code-split=per_kernel)
# target_compile_options(spblas INTERFACE -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --offload-arch=sm_90)

else()
message(FATAL_ERROR "Compiler does not support SYCL (-fsycl flag not available)")
endif()

target_compile_definitions(spblas INTERFACE SPBLAS_ENABLE_SYCL_REFERENCE)
endif()

if (ENABLE_ONEMKL_SYCL)
set(SPBLAS_CPU_BACKEND ON)
set(SPBLAS_GPU_BACKEND ON)
Expand All @@ -47,6 +87,23 @@ if (ENABLE_ARMPL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ARMPL")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

if (ENABLE_AOCLSPARSE)
set(SPBLAS_CPU_BACKEND ON)
if (NOT DEFINED ENV{AOCLSPARSE_DIR})
Expand Down Expand Up @@ -77,23 +134,6 @@ if (ENABLE_AOCLSPARSE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_AOCLSPARSE")
endif()

if (ENABLE_ROCSPARSE)
set(SPBLAS_GPU_BACKEND ON)
project(spblas LANGUAGES HIP)
find_package(hip REQUIRED)
find_package(rocsparse REQUIRED)
target_link_libraries(spblas INTERFACE roc::rocsparse hip::host)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_ROCSPARSE")
set(CMAKE_HIP_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

if (ENABLE_CUSPARSE)
set(SPBLAS_GPU_BACKEND ON)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(spblas INTERFACE CUDA::cudart CUDA::cusparse CUDA::cublas)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPBLAS_ENABLE_CUSPARSE")
endif()

# If no vendor backend is enabled, enable CPU backend for reference implementation
if (NOT ENABLE_ONEMKL_SYCL AND
NOT ENABLE_ARMPL AND
Expand All @@ -108,13 +148,6 @@ if (LOG_LEVEL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLOG_LEVEL=${LOG_LEVEL}") # SPBLAS_DEBUG | SPBLAS_WARNING | SPBLAS_TRACE | SPBLAS_INFO
endif()

# Enable sanitizers
if (ENABLE_SANITIZERS)
set(SANITIZER_FLAGS "-fsanitize=address,undefined")
target_compile_options(spblas INTERFACE ${SANITIZER_FLAGS} -g -O1 -fno-omit-frame-pointer)
target_link_options(spblas INTERFACE ${SANITIZER_FLAGS})
endif()

# mdspan
FetchContent_Declare(
mdspan
Expand Down
4 changes: 4 additions & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,7 @@ if (SPBLAS_GPU_BACKEND)
add_subdirectory(rocsparse)
endif()
endif()

if (ENABLE_SYCL_REFERENCE)
add_subdirectory(sycl_reference)
endif()
1 change: 1 addition & 0 deletions examples/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ function(add_device_example example_name)
endfunction()

add_device_example(device_spmv)
add_device_example(spmm_benchmark)
162 changes: 162 additions & 0 deletions examples/device/spmm_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#include <spblas/spblas.hpp>

#include <thrust/device_vector.h>

#include <cassert>
#include <cmath>
#include <iostream>

#include <fmt/core.h>
#include <fmt/ranges.h>

int main(int argc, char** argv) {
using value_t = float;
using index_t = spblas::index_t;
using offset_t = spblas::offset_t;
namespace md = spblas::__mdspan;

offset_t nnz_row = 100;

index_t m = 100000;
index_t n = 1;
index_t k = 100000;

if (argc >= 2) {
m = std::atoll(argv[1]);
}

if (argc >= 3) {
k = std::atoll(argv[2]);
}

if (argc >= 4) {
n = std::atoll(argv[3]);
}

if (argc >= 5) {
nnz_row = std::atoll(argv[4]);
}

fmt::print("Multiplying {} x {} matrix with {} nnz/row by {} columns.\n", m,
k, nnz_row, n);

offset_t nnz_in = m * nnz_row;

auto&& [values, rowptr, colind, shape, nnz] =
spblas::generate_csr<value_t, index_t, offset_t>(m, k, nnz_in);

// Copy data to the GPU
thrust::device_vector<value_t> d_values(values);
thrust::device_vector<offset_t> d_rowptr(rowptr);
thrust::device_vector<index_t> d_colind(colind);

spblas::csr_view<value_t, index_t, offset_t> a(
d_values.data().get(), d_rowptr.data().get(), d_colind.data().get(),
shape, nnz);

std::vector<value_t> b_values(k * n, 1);
std::vector<value_t> c_values(m * n, 0);

thrust::device_vector<value_t> d_b(b_values);
thrust::device_vector<value_t> d_c(c_values);

md::mdspan b(d_b.data().get(), k, n);
md::mdspan c(d_c.data().get(), m, n);

// Perform computation on the GPU.
spblas::multiply(thrust::device, a, b, c);

// Copy the result back to the CPU.
thrust::copy(d_c.begin(), d_c.end(), c_values.begin());

std::vector<value_t> c_ref(m * n, 0);

spblas::csr_view<value_t, index_t, offset_t> a_view(
values.data(), rowptr.data(), colind.data(), shape, nnz);
md::mdspan b_view(b_values.data(), k, n);
md::mdspan c_view(c_ref.data(), m, n);

// Perform reference computation on CPU.
spblas::multiply(a_view, b_view, c_view);

// Compare results
const float epsilon = 64 * std::numeric_limits<float>::epsilon();
const float abs_th = std::numeric_limits<float>::min();
bool results_match = true;

for (std::size_t i = 0; i < c_ref.size(); ++i) {
float diff = std::abs(c_ref[i] - c_values[i]);
float norm = std::min(std::abs(c_ref[i]) + std::abs(c_values[i]),
std::numeric_limits<float>::max());
float abs_error = std::max(abs_th, epsilon * norm);

if (diff > abs_error) {
results_match = false;
std::cout << "Mismatch at index " << i << ": "
<< "SYCL result = " << c_values[i]
<< ", Reference = " << c_ref[i] << "\n";
break;
}
}

if (results_match) {
fmt::print("OK!\n");
} else {
fmt::print("Error!\n");
return 1;
}

// Warmup: call `SpMM` repeatedly for at least 2 seconds.

double min_warmup_duration = 2;
auto warmup_begin = std::chrono::high_resolution_clock::now();
auto warmup_end = warmup_begin;

while (std::chrono::duration<double>(warmup_end - warmup_begin).count() <
min_warmup_duration) {
spblas::multiply(thrust::device, a, b, c);
warmup_end = std::chrono::high_resolution_clock::now();
}

double gb = 1e-9 * (nnz * sizeof(value_t) + nnz * sizeof(index_t) +
(m + 1) * sizeof(offset_t) + k * n * sizeof(value_t) +
m * n * sizeof(value_t));

double gflops = 1e-9 * 2 * nnz * n;

double max_bw = 456;

std::size_t n_iterations = 10;

std::vector<double> durations;
durations.reserve(n_iterations);

for (std::size_t i = 0; i < n_iterations; i++) {
auto begin = std::chrono::high_resolution_clock::now();
spblas::multiply(thrust::device, a, b, c);
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - begin).count();
double gb_s = gb / duration;
double gflops_s = gflops / duration;

fmt::print("Completed in {} s (achieved {} GB/s)\n", duration, gb_s);
fmt::print("Achieved {} GFLOPs\n", gflops_s);

durations.push_back(duration);
}

fmt::print("Durations: {}\n", durations);

std::sort(durations.begin(), durations.end());

double median_duration = durations[durations.size() / 2];

double median_gb_s = gb / median_duration;
double median_gflops_s = gflops / median_duration;

fmt::print("Median duration {} ({} GB/s) {}% of peak\n", median_duration,
median_gb_s, 100 * (median_gb_s / max_bw));
fmt::print("Median achieved {} GFLOPs\n", median_gflops_s);

return 0;
}
7 changes: 7 additions & 0 deletions examples/sycl_reference/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

function(add_sycl_example example_name)
add_executable(${example_name} ${example_name}.cpp)
target_link_libraries(${example_name} spblas fmt sycl_thrust)
endfunction()

add_sycl_example(sycl_spmm)
Loading
Loading