Skip to content
Open
86 changes: 80 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,86 @@ CUDA Stream Compaction

**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**

* (TODO) YOUR NAME HERE
* (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
* MANVI AGARWAL
* [LinkedIn](https://www.linkedin.com/in/manviagarwal27/)
* Tested on: Windows 11, AMD Ryzen 5 7640HS @ 4.30GHz 16GB, GeForce RTX 4060 8GB(personal)

### (TODO: Your README)
### Performance Analysis

This repository compares the implementation of scan or prefix scan algorithm on CPU and GPU. The code in `main.cpp` does correctness check as well as profiles individual implementations to compute the time. The output for the test code is:

```
****************
** SCAN TESTS **
****************
[ 31 35 17 38 8 45 8 15 34 30 36 13 29 ... 7 0 ]
==== cpu scan, power-of-two ====
elapsed time: 0.0012ms (std::chrono Measured)
[ 0 31 66 83 121 129 174 182 197 231 261 297 310 ... 6271 6278 ]
==== cpu scan, non-power-of-two ====
elapsed time: 0.0013ms (std::chrono Measured)
[ 0 31 66 83 121 129 174 182 197 231 261 297 310 ... 6206 6242 ]
passed
==== naive scan, power-of-two ====
elapsed time: 0.640896ms (CUDA Measured)
passed
==== naive scan, non-power-of-two ====
elapsed time: 0.16288ms (CUDA Measured)
passed
==== work-efficient scan, power-of-two ====
elapsed time: 0.245184ms (CUDA Measured)
passed
==== work-efficient scan, non-power-of-two ====
elapsed time: 0.115168ms (CUDA Measured)
passed
==== thrust scan, power-of-two ====
elapsed time: 16.073ms (CUDA Measured)
passed
==== thrust scan, non-power-of-two ====
elapsed time: 1.03667ms (CUDA Measured)
passed

*****************************
** STREAM COMPACTION TESTS **
*****************************
[ 3 1 1 2 0 3 2 1 0 0 0 1 3 ... 1 0 ]
==== cpu compact without scan, power-of-two ====
elapsed time: 0.0012ms (std::chrono Measured)
[ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 3 1 ]
passed
==== cpu compact without scan, non-power-of-two ====
elapsed time: 0.0011ms (std::chrono Measured)
[ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 2 2 ]
passed
==== cpu compact with scan ====
elapsed time: 0.0055ms (std::chrono Measured)
[ 3 1 1 2 3 2 1 1 3 3 2 3 3 ... 3 1 ]
passed
==== work-efficient compact, power-of-two ====
elapsed time: 0.077728ms (CUDA Measured)
passed
==== work-efficient compact, non-power-of-two ====
elapsed time: 0.083264ms (CUDA Measured)
passed
```

## Computation comparison

![](img/ComparisonChart.png)

## Insights

The results favor CPU implementation heavily in terms of timing. There are a couple of reasons for GPU implementation to be much slower than CPU one. All these limitations can be explored to look for optimization for GPU implementation.

Following are some of the reasons for GPU implementation to be much slower:

**1. Expensive memory operations:**
For the implementation, I've used global memory and before calling GPU, the data is transferred from CPU memory to global memory which adds to the latency of GPU implementation.

**2. Small computations and small data set:**
Since each thread doesn't have a lot of computation, the time it takes to transfer data from host to device and back ends up surpassing the reduction in time that parallelization of computation attempts to achieve.

**3. Warp Partitioning:**
With each iteration, number of threads working reduce but the corresponding warps remain active. This causes divergent wraps and hence the GPU is under-utilized.

Include analysis, etc. (Remember, this is public, so don't put
anything here that you don't want to share with the world.)

Binary file added img/ComparisonChart.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

const int SIZE = 1 << 8; // feel free to change the size of array
const int SIZE = 1 << 16; // feel free to change the size of array
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
Expand Down
8 changes: 8 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
int thid = threadIdx.x + (blockIdx.x*blockDim.x);
bools[thid] = (idata[thid] != 0);

}

/**
Expand All @@ -33,6 +36,11 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
int thid = threadIdx.x + blockIdx.x*blockDim.x;
if(bools[thid] == 1)
{
odata[indices[thid]] = idata[thid];
}
}

}
Expand Down
35 changes: 33 additions & 2 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
for(int i = 0; i<n; i++){
if(i == 0)
odata[i] = 0;
else
odata[i] = odata[i-1] + idata[i-1];
}
timer().endCpuTimer();
}

Expand All @@ -31,8 +37,15 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int j = 0;
for(int i = 0;i<n;i++)
{
if(idata[i] != 0)
odata[j++] = idata[i];
}
timer().endCpuTimer();
return -1;
return j;

}

/**
Expand All @@ -43,8 +56,26 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
// TODO
int *temp_array = (int*)malloc(n*sizeof(int));
int *scan_array = (int*)malloc(n*sizeof(int));
for(int i = 0; i < n; i++)
temp_array[i] = (idata[i] != 0);
for(int i = 0; i<n; i++){
if(i == 0)
scan_array[i] = 0;
else
scan_array[i] = temp_array[i-1] + scan_array[i-1];
}
for(int i = 0; i < n;i++)
{
if(temp_array[i] == 1)
{
odata[scan_array[i]] = idata[i];
}
}

timer().endCpuTimer();
return -1;
return scan_array[n-1];
}
}
}
114 changes: 112 additions & 2 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "common.h"
#include "efficient.h"


namespace StreamCompaction {
namespace Efficient {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,12 +13,73 @@ namespace StreamCompaction {
return timer;
}

__global__ void scan_upstream(int n, int *idata,int offset)
{
int thid = threadIdx.x + (blockIdx.x * blockDim.x);
//load data in the global memory
int ai = offset * (2*thid + 1) - 1;
int bi = offset * (2*thid + 2) - 1;

if (ai < n && bi < n)
{

idata[bi] += idata[ai];
}
if(thid == 0)
idata[n-1] =0;
__syncthreads();

}



__global__ void scan_downstream(int n, int* idata,int offset)
{
int thid = threadIdx.x + (blockIdx.x * blockDim.x);
//load data in the global memory
int ai = offset * (2 * thid + 1) - 1;
int bi = offset * (2 * thid + 2) - 1;
if ((ai < n) && (bi < n)) {
float t = idata[ai];
idata[ai] = idata[bi];
idata[bi] += t;
}

__syncthreads();

}

/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO

int *g_idata;
int zeropadded_n = pow(2, ilog2ceil(n));

cudaMalloc((void**)&g_idata,zeropadded_n * sizeof(int));

cudaMemcpy(g_idata,idata,n*sizeof(int),cudaMemcpyHostToDevice);

int threadsPerBlock = 1024;
int blocksPerGrid = ((zeropadded_n /2) + threadsPerBlock - 1) / threadsPerBlock;
int offset = 1;
for (int i = 0; i < ilog2ceil(n); i++)
{
scan_upstream << <blocksPerGrid, threadsPerBlock >> > (zeropadded_n, g_idata,offset);
offset *= 2;
}
cudaDeviceSynchronize();
offset = zeropadded_n / 2;
for (int i = 0; i < ilog2ceil(n); i++)
{
scan_downstream << <blocksPerGrid, threadsPerBlock >> > (zeropadded_n, g_idata,offset);
offset /= 2;

}
cudaMemcpy(odata,g_idata,sizeof(int)*n,cudaMemcpyDeviceToHost);
cudaFree(g_idata);
timer().endGpuTimer();
}

Expand All @@ -33,8 +95,56 @@ namespace StreamCompaction {
int compact(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
int *g_bools = 0;
int *bools = 0;
int* indices;
int* g_idata;
int *g_odata;
int zeropadded_n = pow(2,ilog2ceil(n));
printf("zeropadded = %d\n", zeropadded_n);
int threadsPerBlock = 256;
int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock;
int *temp_array = (int*)malloc(sizeof(int)*zeropadded_n);
cudaError_t result = cudaMalloc((void**)(&g_bools), zeropadded_n * sizeof(int));
if (result != cudaSuccess) {
fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result));
cudaFree(g_bools);
timer().endGpuTimer();
return -1;
}
result = cudaMalloc((void**)(&g_idata), zeropadded_n * sizeof(int));
cudaMemset(g_idata, 0, zeropadded_n*sizeof(int));
if (result != cudaSuccess) {
fprintf(stderr, "Mem alloc failed: %s\n", cudaGetErrorString(result));
cudaFree(g_bools);
cudaFree(g_idata);
timer().endGpuTimer();
return -1;
}
cudaMemcpy(g_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc((void**)&g_odata,sizeof(int)* zeropadded_n);
StreamCompaction::Common::kernMapToBoolean<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n, g_bools, g_idata);
bools = (int*)malloc(zeropadded_n*sizeof(int));
cudaDeviceSynchronize();
cudaMemcpy(bools,g_bools,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost);

result = cudaMalloc(&indices, zeropadded_n * sizeof(int));
timer().endGpuTimer();
return -1;
scan(zeropadded_n, temp_array, bools);

timer().startGpuTimer();
cudaMemcpy(indices, temp_array, zeropadded_n * sizeof(int), cudaMemcpyHostToDevice);

StreamCompaction::Common::kernScatter<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n, g_odata,g_idata, g_bools, indices);
cudaMemcpy(odata,g_odata,zeropadded_n*sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(g_bools);
cudaFree(indices);
cudaFree(g_odata);
cudaFree(g_idata);
free(bools);
timer().endGpuTimer();

return temp_array[zeropadded_n-1];
}
}
}
45 changes: 45 additions & 0 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include "common.h"
#include "naive.h"

#define block_size 256

namespace StreamCompaction {
namespace Naive {
using StreamCompaction::Common::PerformanceTimer;
Expand All @@ -12,13 +14,56 @@ namespace StreamCompaction {
return timer;
}
// TODO: __global__
__global__ void scan_global(int n, int *odata, int *idata, int *temp,int offset,int pout)
{
int thid = threadIdx.x + (blockIdx.x * blockDim.x);
// Load input into global memory.
// This is exclusive scan, so shift right by one
// and set first element to 0
int pin = 1 - pout;
if (thid >= offset)
temp[pout * n + thid] = temp[pin * n + thid - offset] + temp[pin* n + thid];
else
temp[pout * n + thid] = temp[pin * n + thid];
__syncthreads();
odata[thid] = temp[pout * n + thid]; // write output
}

__global__ void shiftInput(int* idata, int* shifted_input)
{
int thid = threadIdx.x + (blockIdx.x * blockDim.x);
shifted_input[thid] = (thid > 0) ? idata[thid - 1] : 0;
__syncthreads();
}
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
timer().startGpuTimer();
// TODO
int *g_odata,*g_idata,*temp;
int zeropadded_n = pow(2, ilog2ceil(n));
cudaError_t result = cudaMalloc((void**)&g_idata, zeropadded_n * sizeof(int));
result = cudaMalloc((void**)&g_odata,zeropadded_n*sizeof(int));
result = cudaMalloc((void**)&temp,2 * zeropadded_n * sizeof(int));
cudaMemcpy(g_idata,idata,sizeof(int)*n,cudaMemcpyHostToDevice);
int threadsPerBlock = 1024;
int blocksPerGrid = (zeropadded_n + threadsPerBlock - 1) / threadsPerBlock;

int offset = 1;
int pout = 0;
shiftInput<<<blocksPerGrid,threadsPerBlock>>>(g_idata, temp);
for (int i = 0; i < ilog2ceil(n); i++) {
pout = 1 - pout;
scan_global<<<blocksPerGrid,threadsPerBlock>>>(zeropadded_n,g_odata,g_idata,temp,offset,pout);
offset *= 2;
}

cudaMemcpy(odata,g_odata,sizeof(int)*n,cudaMemcpyDeviceToHost);
for (int i = 0; i < 257; i++)
{
//printf("%d %d\n", idata[i], odata[i]);
}
timer().endGpuTimer();
}
}
Expand Down
Loading