Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 91 additions & 37 deletions Common/src/linear_algebra/CSysMatrixGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,32 +28,43 @@
#include "../../include/linear_algebra/CSysMatrix.hpp"
#include "../../include/linear_algebra/GPUComms.cuh"

template<typename matrixType, typename vectorType>
__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
{
int row = (blockIdx.x * blockDim.x + threadIdx.x)/32;
int threadNo = threadIdx.x%32;
int activeThreads = nVar * (32/nVar);

int blockRow = (threadNo/nVar)%nVar;

if(row<nPointDomain && threadNo<activeThreads) prod[row * nVar + blockRow] = 0.0;

__syncthreads();

if(row<nPointDomain && threadNo<activeThreads)
{
vectorType res = 0.0;
#include <cusparse.h>
#include <cstdint>
#include <type_traits>

inline void cusparseAssert(cusparseStatus_t code, const char* file, int line, bool abort = true) {
if (code != CUSPARSE_STATUS_SUCCESS) {
fprintf(stderr, "cuSPARSEassert: %s %s %d\n", cusparseGetErrorString(code), file, line);
if (abort) exit(static_cast<int>(code));
}
}

for(int index = d_row_ptr[row] * nVar * nEqn + threadNo; index < d_row_ptr[row+1] * nVar * nEqn; index+=activeThreads)
{
int blockCol = index%nEqn;
int blockNo = index/(nVar * nEqn);
res += matrix[index] * vec[(d_col_ind[blockNo])*nVar + blockCol];
}
#define cusparseErrChk(ans) \
{ \
cusparseAssert((ans), __FILE__, __LINE__); \
}

inline cusparseIndexType_t GetCusparseIndexType() {
if constexpr (sizeof(unsigned long) == 4) {
return CUSPARSE_INDEX_32I;
} else if constexpr (sizeof(unsigned long) == 8) {
return CUSPARSE_INDEX_64I;
} else {
static_assert(sizeof(unsigned long) == 4 || sizeof(unsigned long) == 8,
"cuSPARSE BSR SpMV only supports 32-bit or 64-bit index arrays in this path.");
}
}

atomicAdd(&prod[row * nVar + blockRow], res);
}
template <class ScalarType>
constexpr cudaDataType GetCudaDataType() {
if constexpr (std::is_same<ScalarType, float>::value) {
return CUDA_R_32F;
} else if constexpr (std::is_same<ScalarType, double>::value) {
return CUDA_R_64F;
} else {
static_assert(std::is_same<ScalarType, float>::value || std::is_same<ScalarType, double>::value,
"cuSPARSE BSR SpMV only supports float and double in this path.");
}
}

template<class ScalarType>
Expand All @@ -62,26 +73,69 @@ void CSysMatrix<ScalarType>::HtDTransfer(bool trigger) const
if(trigger) gpuErrChk(cudaMemcpy((void*)(d_matrix), (void*)&matrix[0], (sizeof(ScalarType)*nnz*nVar*nEqn), cudaMemcpyHostToDevice));
}

template<class ScalarType>
template <class ScalarType>
void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
CGeometry* geometry, const CConfig* config) const
{
CGeometry* geometry, const CConfig* config) const {
if (nVar != nEqn) {
SU2_MPI::Error("CUDA CSysMatrix matvec with cuSPARSE BSR requires square blocks.", CURRENT_FUNCTION);
}

ScalarType* d_vec = vec.GetDevicePointer();
ScalarType* d_prod = prod.GetDevicePointer();
ScalarType* d_vec = vec.GetDevicePointer();
ScalarType* d_prod = prod.GetDevicePointer();

vec.HtDTransfer();
prod.GPUSetVal(0.0);
vec.HtDTransfer();

dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1);
int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain);
dim3 gridDim(gridx, 1, 1);
const auto indexType = GetCusparseIndexType();
const auto valueType = GetCudaDataType<ScalarType>();

GPUMatrixVectorProductAdd<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn);
gpuErrChk( cudaPeekAtLastError() );
const std::int64_t blockSize = static_cast<std::int64_t>(nVar);

prod.DtHTransfer();
const std::int64_t brows = static_cast<std::int64_t>(nPointDomain);
const std::int64_t bcols = static_cast<std::int64_t>(nPoint);
const std::int64_t bnnz = static_cast<std::int64_t>(nnz);

const std::int64_t xSize = static_cast<std::int64_t>(nPoint) * blockSize;
const std::int64_t ySize = static_cast<std::int64_t>(nPointDomain) * blockSize;

const ScalarType alpha = 1.0;
const ScalarType beta = 0.0;

cusparseHandle_t handle = nullptr;
cusparseConstSpMatDescr_t matA = nullptr;
cusparseDnVecDescr_t vecX = nullptr;
cusparseDnVecDescr_t vecY = nullptr;

cusparseErrChk(cusparseCreate(&handle));

cusparseErrChk(cusparseCreateConstBsr(&matA, brows, bcols, bnnz, blockSize, blockSize, d_row_ptr, d_col_ind, d_matrix,
indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, valueType, CUSPARSE_ORDER_ROW));

cusparseErrChk(cusparseCreateDnVec(&vecX, xSize, d_vec, valueType));
cusparseErrChk(cusparseCreateDnVec(&vecY, ySize, d_prod, valueType));

size_t bufferSize = 0;
void* dBuffer = nullptr;

cusparseErrChk(cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
valueType, CUSPARSE_SPMV_BSR_ALG1, &bufferSize));

if (bufferSize > 0) {
gpuErrChk(cudaMalloc(&dBuffer, bufferSize));
}

cusparseErrChk(cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, valueType,
CUSPARSE_SPMV_BSR_ALG1, dBuffer));

if (dBuffer != nullptr) {
gpuErrChk(cudaFree(dBuffer));
}

cusparseErrChk(cusparseDestroyDnVec(vecY));
cusparseErrChk(cusparseDestroyDnVec(vecX));
cusparseErrChk(cusparseDestroySpMat(matA));
cusparseErrChk(cusparseDestroy(handle));

prod.DtHTransfer();
}

template class CSysMatrix<su2mixedfloat>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
5 changes: 4 additions & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,13 @@ python = pymod.find_installation()
if get_option('enable-cuda')
add_languages('cuda')
add_global_arguments('-arch=sm_86', language : 'cuda')
cuda_deps = [meson.get_compiler('cuda').find_library('cusparse', required : true)]
else
cuda_deps = []
endif

su2_cpp_args = []
su2_deps = [declare_dependency(include_directories: 'externals/CLI11')]
su2_deps = [declare_dependency(include_directories: 'externals/CLI11')] + cuda_deps

default_warning_flags = []
if build_machine.system() != 'windows'
Expand Down
Loading