CIS5650-Fall-2024 · manvi27 · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/README.md b/README.md
@@ -3,12 +3,86 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* MANVI AGARWAL
+  * [LinkedIn](https://www.linkedin.com/in/manviagarwal27/)
+* Tested on: Windows 11, AMD Ryzen 5 7640HS @ 4.30GHz 16GB, GeForce RTX 4060 8GB(personal)
 
-### (TODO: Your README)
+### Performance Analysis
+
+This repository compares the implementation of scan or prefix scan algorithm on CPU and GPU. The code in `main.cpp` does correctness check as well as profiles individual implementations to compute the time. The output for the test code is:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  31  35  17  38   8  45   8  15  34  30  36  13  29 ...   7   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.0012ms    (std::chrono Measured)
+    [   0  31  66  83 121 129 174 182 197 231 261 297 310 ... 6271 6278 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.0013ms    (std::chrono Measured)
+    [   0  31  66  83 121 129 174 182 197 231 261 297 310 ... 6206 6242 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.640896ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.16288ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.245184ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.115168ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 16.073ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 1.03667ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   1   1   2   0   3   2   1   0   0   0   1   3 ...   1   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.0012ms    (std::chrono Measured)
+    [   3   1   1   2   3   2   1   1   3   3   2   3   3 ...   3   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.0011ms    (std::chrono Measured)
+    [   3   1   1   2   3   2   1   1   3   3   2   3   3 ...   2   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.0055ms    (std::chrono Measured)
+    [   3   1   1   2   3   2   1   1   3   3   2   3   3 ...   3   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.077728ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.083264ms    (CUDA Measured)
+    passed
+```
+
+## Computation comparison
+
+![](img/ComparisonChart.png)
+
+## Insights
+
+The results favor CPU implementation heavily in terms of timing. There are a couple of reasons for GPU implementation to be much slower than CPU one. All these limitations can be explored to look for optimization for GPU implementation.
+
+Following are some of the reasons for GPU implementation to be much slower:
+
+**1. Expensive memory operations:**
+ For the implementation, I've used global memory and before calling GPU, the data is transferred from CPU memory to global memory which adds to the latency of GPU implementation.
+
+ **2. Small computations and small data set:**
+ Since each thread doesn't have a lot of computation, the time it takes to transfer data from host to device and back ends up surpassing the reduction in time that parallelization of computation attempts to achieve.
+
+**3. Warp Partitioning:** 
+With each iteration, number of threads working reduce but the corresponding warps remain active. This causes divergent wraps and hence the GPU is under-utilized. 
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
 
diff --git a/img/ComparisonChart.png b/img/ComparisonChart.png
diff --git a/src/main.cpp b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 16; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
@@ -24,6 +24,9 @@ namespace StreamCompaction {
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
             // TODO
+            int thid = threadIdx.x + (blockIdx.x*blockDim.x);
+            bools[thid] = (idata[thid] != 0);
+
         }
 
         /**
@@ -33,6 +36,11 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
             // TODO
+            int thid = threadIdx.x + blockIdx.x*blockDim.x;
+            if(bools[thid] == 1)
+            {
+                odata[indices[thid]] = idata[thid];
+            }
         }
 
     }

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -20,6 +20,12 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            for(int i = 0; i<n; i++){
+                if(i == 0)
+                    odata[i] = 0;
+                else
+                    odata[i] = odata[i-1] + idata[i-1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +37,15 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int j = 0;
+            for(int i = 0;i<n;i++)
+            {
+                if(idata[i] != 0)
+                    odata[j++] = idata[i];
+            }
             timer().endCpuTimer();
-            return -1;
+            return j;
+
         }
 
         /**
@@ -43,8 +56,26 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int *temp_array = (int*)malloc(n*sizeof(int));
+            int *scan_array = (int*)malloc(n*sizeof(int));
+            for(int i = 0; i < n; i++)
+                temp_array[i] = (idata[i] != 0);
+            for(int i = 0; i<n; i++){
+                if(i == 0)
+                    scan_array[i] = 0;
+                else
+                    scan_array[i] = temp_array[i-1] + scan_array[i-1];
+            }
+            for(int i = 0; i < n;i++)
+            {
+                if(temp_array[i] == 1)
+                {
+                    odata[scan_array[i]] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return scan_array[n-1];
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
@@ -3,6 +3,7 @@
 #include "common.h"
 #include "efficient.h"
 
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,12 +13,73 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void scan_upstream(int n, int *idata,int offset)
+        {
+            int thid = threadIdx.x + (blockIdx.x * blockDim.x);
+            //load data in the global memory
+            int ai = offset * (2*thid + 1) - 1;
+            int bi = offset * (2*thid + 2) - 1;
+
+            if (ai < n && bi < n)
+            {
+
+                idata[bi] += idata[ai];                           
+            }
+            if(thid == 0)
+                idata[n-1] =0;
+            __syncthreads();
+
+        }
+
+
+
+        __global__ void scan_downstream(int n, int* idata,int offset)
+        {
+            int thid = threadIdx.x + (blockIdx.x * blockDim.x);
+            //load data in the global memory           
+            int ai = offset * (2 * thid + 1) - 1;
+            int bi = offset * (2 * thid + 2) - 1;
+            if ((ai < n) && (bi < n)) {
+                float t = idata[ai];
+                idata[ai] = idata[bi];
+                idata[bi] += t;
+                }
+
+             __syncthreads();
+
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+
+            int *g_idata;
+            int zeropadded_n = pow(2, ilog2ceil(n));
+
+            cudaMalloc((void**)&g_idata,zeropadded_n * sizeof(int));
+
+            cudaMemcpy(g_idata,idata,n*sizeof(int),cudaMemcpyHostToDevice);
+
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = ((zeropadded_n /2) + threadsPerBlock - 1) / threadsPerBlock;
+            int offset = 1;
+            for (int i = 0; i < ilog2ceil(n); i++)
+            {
+                scan_upstream << <blocksPerGrid, threadsPerBlock >> > (zeropadded_n, g_idata,offset);
+                offset *= 2;
+            }
+            cudaDeviceSynchronize();
+            offset = zeropadded_n / 2;
+            for (int i = 0; i < ilog2ceil(n); i++)
+            {
+                scan_downstream << <blocksPerGrid, threadsPerBlock >> > (zeropadded_n, g_idata,offset);
+                offset /= 2;
+
+            }
+            cudaMemcpy(odata,g_idata,sizeof(int)*n,cudaMemcpyDeviceToHost);
+            cudaFree(g_idata);
             timer().endGpuTimer();
         }
 
@@ -33,8 +95,56 @@ namespace StreamCompaction {
         int compact(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+            int *g_bools = 0;
+            int *bools = 0;
+            int* indices;
+            int* g_idata;
+            int *g_odata;
+            int zeropadded_n = pow(2,ilog2ceil(n));
+            printf("zeropadded = %d\n", zeropadded_n);
+            int threadsPerBlock = 256;
+            int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock;
+            int *temp_array = (int*)malloc(sizeof(int)*zeropadded_n);
+            cudaError_t result = cudaMalloc((void**)(&g_bools), zeropadded_n * sizeof(int));
+            if (result != cudaSuccess) {
+                fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result));
+                cudaFree(g_bools);
+                timer().endGpuTimer();
+                return -1;
+            }
+            result = cudaMalloc((void**)(&g_idata), zeropadded_n * sizeof(int));
+            cudaMemset(g_idata, 0, zeropadded_n*sizeof(int));
+            if (result != cudaSuccess) {
+                fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result));
+                cudaFree(g_bools);
+                cudaFree(g_idata);
+                timer().endGpuTimer();
+                return -1;
+            }
+            cudaMemcpy(g_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMalloc((void**)&g_odata,sizeof(int)* zeropadded_n);
+            StreamCompaction::Common::kernMapToBoolean<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n, g_bools, g_idata);
+            bools = (int*)malloc(zeropadded_n*sizeof(int));
+            cudaDeviceSynchronize();
+            cudaMemcpy(bools,g_bools,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost);
+
+            result = cudaMalloc(&indices, zeropadded_n * sizeof(int));
             timer().endGpuTimer();
-            return -1;
+            scan(zeropadded_n, temp_array, bools);
+
+            timer().startGpuTimer();
+            cudaMemcpy(indices, temp_array, zeropadded_n * sizeof(int), cudaMemcpyHostToDevice);
+
+            StreamCompaction::Common::kernScatter<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n, g_odata,g_idata, g_bools, indices);
+            cudaMemcpy(odata,g_odata,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost);
+            cudaFree(g_bools);
+            cudaFree(indices);
+            cudaFree(g_odata);
+            cudaFree(g_idata);
+            free(bools);
+            timer().endGpuTimer();
+
+            return temp_array[zeropadded_n-1];
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#define block_size 256
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -12,13 +14,56 @@ namespace StreamCompaction {
             return timer;
         }
         // TODO: __global__
+        __global__ void scan_global(int n, int *odata, int *idata, int *temp,int offset,int pout)
+        {
+            int thid = threadIdx.x + (blockIdx.x * blockDim.x);
+            // Load input into global memory.
+            // This is exclusive scan, so shift right by one
+            // and set first element to 0
+            int pin = 1 - pout;
+            if (thid >= offset)
+                temp[pout * n + thid] = temp[pin * n + thid - offset] + temp[pin* n + thid];
+            else
+                temp[pout * n + thid] = temp[pin * n + thid];
+            __syncthreads();
+            odata[thid] = temp[pout * n + thid]; // write output
+        }
 
+        __global__ void shiftInput(int* idata, int* shifted_input)
+        {
+            int thid = threadIdx.x + (blockIdx.x * blockDim.x);
+            shifted_input[thid] = (thid > 0) ? idata[thid - 1] : 0;
+            __syncthreads();
+        }
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+            int *g_odata,*g_idata,*temp;
+            int zeropadded_n = pow(2, ilog2ceil(n));
+            cudaError_t result = cudaMalloc((void**)&g_idata, zeropadded_n * sizeof(int));
+            result = cudaMalloc((void**)&g_odata,zeropadded_n*sizeof(int));
+            result = cudaMalloc((void**)&temp,2 * zeropadded_n * sizeof(int));
+            cudaMemcpy(g_idata,idata,sizeof(int)*n,cudaMemcpyHostToDevice);
+            int threadsPerBlock = 1024;
+            int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock;
+
+            int offset = 1;
+            int pout = 0;
+            shiftInput<<<blocksPerGrid,threadsPerBlock>>>(g_idata, temp);
+            for (int i = 0; i < ilog2ceil(n); i++) {
+                pout = 1 - pout;
+                scan_global<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n,g_odata,g_idata,temp,offset,pout);
+                offset *= 2;
+            }
+
+            cudaMemcpy(odata,g_odata,sizeof(int)*n,cudaMemcpyDeviceToHost);
+            for (int i = 0; i < 257; i++)
+            {
+                //printf("%d %d\n", idata[i], odata[i]);
+            }
             timer().endGpuTimer();
         }
     }