Cuda-Tutorial/matrix_multiply.cu at main · BuddySirJava/Cuda-Tutorial · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <iostream>
#include <cuda_runtime.h>
#include <cassert>

__global__ void matrixMultiply(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < N; k++) {
            sum += A[row * N + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}

__global__ void matrixMultiplyShared(float *A, float *B, float *C, int N) {
    // Shared memory for tile-based computation
    __shared__ float tileA[16][16];
    __shared__ float tileB[16][16];

    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    float sum = 0.0f;

    // Process matrix in tiles
    for (int tile = 0; tile < (N + 15) / 16; tile++) {
        // Load tiles into shared memory
        if (row < N && (tile * 16 + tx) < N) {
            tileA[ty][tx] = A[row * N + tile * 16 + tx];
        } else {
            tileA[ty][tx] = 0.0f;
        }

        if ((tile * 16 + ty) < N && col < N) {
            tileB[ty][tx] = B[(tile * 16 + ty) * N + col];
        } else {
            tileB[ty][tx] = 0.0f;
        }

        __syncthreads(); // Wait for all threads to load data

        // Compute partial dot product
        for (int k = 0; k < 16; k++) {
            sum += tileA[ty][k] * tileB[k][tx];
        }

        __syncthreads(); // Wait before next tile
    }

    // Write result
    if (row < N && col < N) {
        C[row * N + col] = sum;
    }
}

int main() {
    const int N = 32; // Matrix size (N x N)
    size_t size = N * N * sizeof(float);

    // Allocate unified memory
    float *A, *B, *C;
    cudaMallocManaged(&A, size);
    cudaMallocManaged(&B, size);
    cudaMallocManaged(&C, size);

    // Initialize matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i * N + j] = (i == j) ? 1.0f : 0.0f; // Identity matrix
            B[i * N + j] = i * N + j; // Sequential values
        }
    }

    // Configure thread block and grid
    dim3 threadsPerBlock(16, 16); // 256 threads per block
    dim3 numBlocks((N + 15) / 16, (N + 15) / 16);

    std::cout << "Computing " << N << "x" << N << " matrix multiplication..." << std::endl;
    std::cout << "Using " << numBlocks.x * numBlocks.y << " blocks with "
              << threadsPerBlock.x * threadsPerBlock.y << " threads each" << std::endl;
    std::cout << "Total threads: " << numBlocks.x * numBlocks.y * threadsPerBlock.x * threadsPerBlock.y << std::endl;

    // Launch kernel
    matrixMultiplyShared<<<numBlocks, threadsPerBlock>>>(A, B, C, N);
    cudaDeviceSynchronize();

    cudaFree(A);
    cudaFree(B);
    cudaFree(C);

    return 0;
}