-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathzgeqrf_cusolver.cpp
More file actions
71 lines (55 loc) · 2.57 KB
/
zgeqrf_cusolver.cpp
File metadata and controls
71 lines (55 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <common.hpp>
#include <iostream>
int32_t main(int32_t argc, char* argv[]) {
auto cu_err = cudaSetDevice(0);
if (cu_err != cudaSuccess)
{ fprintf(stderr, "%s\n", cudaGetErrorString(cu_err)); return -1; }
cudaStream_t stream;
cudaStreamCreate(&stream);
cusolverDnHandle_t cusolverH;
cusolverDnCreate(&cusolverH);
cusolverDnSetStream(cusolverH, stream);
cusolverDnParams_t params;
cusolverDnCreateParams(¶ms);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
int64_t M = 1 < argc ? std::atoi(argv[1]) : 1024;
int64_t N = 2 < argc ? std::atoi(argv[2]) : 128;
std::vector<std::complex<double>> matA(M * N);
matrix_generator<std::complex<double>>(M, N).generate_block(1., 512, 512, &matA[0], M);
std::complex<double>* dA = nullptr, * dTau = nullptr;
int32_t* info = nullptr;
cudaMalloc((void**)(&dA), M * N * sizeof(std::complex<double>));
cudaMalloc((void**)(&dTau), N * sizeof(std::complex<double>));
cudaMalloc((void**)(&info), sizeof(int32_t));
cudaMemcpy(dA, matA.data(), M * N * sizeof(std::complex<double>), cudaMemcpyHostToDevice);
size_t workspaceInBytesOnDevice, workspaceInBytesOnHost;
cusolverDnXgeqrf_bufferSize(cusolverH, params, M, N, CUDA_C_64F, dA, M, CUDA_C_64F, dTau, CUDA_C_64F, &workspaceInBytesOnDevice, &workspaceInBytesOnHost);
void* hWork = std::malloc(workspaceInBytesOnHost), *dWork;
cudaMalloc(&dWork, workspaceInBytesOnDevice);
cusolverDnXgeqrf(cusolverH, params, M, N, CUDA_C_64F, dA, M, CUDA_C_64F, dTau, CUDA_C_64F, dWork, workspaceInBytesOnDevice, hWork, workspaceInBytesOnHost, info);
cudaDeviceSynchronize();
cudaMemcpy(dA, matA.data(), M * N * sizeof(std::complex<double>), cudaMemcpyHostToDevice);
cudaEventRecord(start, stream);
cusolverDnXgeqrf(cusolverH, params, M, N, CUDA_C_64F, dA, M, CUDA_C_64F, dTau, CUDA_C_64F, dWork, workspaceInBytesOnDevice, hWork, workspaceInBytesOnHost, info);
cudaEventRecord(stop, stream);
cudaDeviceSynchronize();
float milliseconds = 0.0f;
cudaEventElapsedTime(&milliseconds, start, stop);
int64_t qr_flops = (N * N * N * -2 / 3) + (M * N * N * 2);
double gflops = double(qr_flops) * 1.e-6 / milliseconds;
std::cout << "cusolver-ZGEQRF," << M << "," << N << "," << milliseconds << "," << gflops << std::endl;
cudaFree(dA);
cudaFree(dTau);
cudaFree(info);
cudaFree(dWork);
std::free(hWork);
cudaStreamDestroy(stream);
cusolverDnDestroyParams(params);
cusolverDnDestroy(cusolverH);
cu_err = cudaGetLastError();
if (cu_err != cudaSuccess)
std::cerr << cudaGetErrorString(cu_err) << std::endl;
return 0;
}