-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatrix_multiply.cu
More file actions
98 lines (78 loc) · 2.85 KB
/
matrix_multiply.cu
File metadata and controls
98 lines (78 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>
__global__ void matrixMultiply(float *A, float *B, float *C, int N) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < N && col < N) {
float sum = 0.0f;
for (int k = 0; k < N; k++) {
sum += A[row * N + k] * B[k * N + col];
}
C[row * N + col] = sum;
}
}
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int N) {
// Shared memory for tile-based computation
__shared__ float tileA[16][16];
__shared__ float tileB[16][16];
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
int tx = threadIdx.x;
int ty = threadIdx.y;
float sum = 0.0f;
// Process matrix in tiles
for (int tile = 0; tile < (N + 15) / 16; tile++) {
// Load tiles into shared memory
if (row < N && (tile * 16 + tx) < N) {
tileA[ty][tx] = A[row * N + tile * 16 + tx];
} else {
tileA[ty][tx] = 0.0f;
}
if ((tile * 16 + ty) < N && col < N) {
tileB[ty][tx] = B[(tile * 16 + ty) * N + col];
} else {
tileB[ty][tx] = 0.0f;
}
__syncthreads(); // Wait for all threads to load data
// Compute partial dot product
for (int k = 0; k < 16; k++) {
sum += tileA[ty][k] * tileB[k][tx];
}
__syncthreads(); // Wait before next tile
}
// Write result
if (row < N && col < N) {
C[row * N + col] = sum;
}
}
int main() {
const int N = 32; // Matrix size (N x N)
size_t size = N * N * sizeof(float);
// Allocate unified memory
float *A, *B, *C;
cudaMallocManaged(&A, size);
cudaMallocManaged(&B, size);
cudaMallocManaged(&C, size);
// Initialize matrices
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
A[i * N + j] = (i == j) ? 1.0f : 0.0f; // Identity matrix
B[i * N + j] = i * N + j; // Sequential values
}
}
// Configure thread block and grid
dim3 threadsPerBlock(16, 16); // 256 threads per block
dim3 numBlocks((N + 15) / 16, (N + 15) / 16);
std::cout << "Computing " << N << "x" << N << " matrix multiplication..." << std::endl;
std::cout << "Using " << numBlocks.x * numBlocks.y << " blocks with "
<< threadsPerBlock.x * threadsPerBlock.y << " threads each" << std::endl;
std::cout << "Total threads: " << numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y << std::endl;
// Launch kernel
matrixMultiplyShared<<<numBlocks, threadsPerBlock>>>(A, B, C, N);
cudaDeviceSynchronize();
cudaFree(A);
cudaFree(B);
cudaFree(C);
return 0;
}