Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,11 @@ tmp/
src/cuda/GPU_Microbenchmark/ubench/**/*
!src/cuda/GPU_Microbenchmark/ubench/**/*/
!src/cuda/GPU_Microbenchmark/ubench/**/*.*
!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
!src/cuda/GPU_Microbenchmark/ubench/**/Makefile

# Ignore compiled CUDA binaries
*.fatbin

# Ignore VPI symlinks created by the build (src/cuda/HPC/Makefile, `vpi` target),
# but keep the background-subtractor source that lives in the repository.
# Git cannot re-include a file whose parent directory is excluded, so the
# directory is re-included first and its contents are then ignored again
# before the single file is whitelisted.
src/cuda/HPC/vpi/*
!src/cuda/HPC/vpi/vpi_background_subtractor/
src/cuda/HPC/vpi/vpi_background_subtractor/*
!src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,9 @@
path = src/cuda/pytorch_examples
url = https://github.com/accel-sim/pytorch_examples.git
branch = inference_accelsim_v2
[submodule "src/cuda/HPC/external/cugraph"]
path = src/cuda/HPC/external/cugraph
url = https://github.com/rapidsai/cugraph.git
[submodule "src/cuda/HPC/external/newton"]
path = src/cuda/HPC/external/newton
url = https://github.com/newton-physics/newton.git
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,16 @@ To compile everything above for AccelWattch:
```
make accelwattch -C ./src
```

## H100 Benchmark Suite

The H100 suite contains 15 modern GPU workloads from H100 profiling and analysis:

- **cuFFT** (2 apps): FFT operations using cuFFT library
- **cuSolver** (2 apps): Linear algebra using cuSolver library
- **Image Processing** (3 apps): Wavelet transform, Gaussian filter, FDTD3d
- **Graph Algorithms** (2 apps): BFS and MST using cuGraph (git submodule)
- **Physics Simulation** (3 apps): Newton physics engine benchmarks (git submodule)
- **Computer Vision** (3 apps): VPI-based vision processing (requires VPI 4.0)

See [src/cuda/HPC/README.md](src/cuda/HPC/README.md) for details.
14 changes: 14 additions & 0 deletions get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,17 @@ if [ ! -d $DATA_ROOT ]; then
tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT
rm all.gpgpu-sim-app-data.tgz
fi

# Generate input data for the HPC benchmarks.
# Each generator script is optional: it is run only if present, and a failure
# is reported as a warning so the remaining generators still run.
# Paths are quoted so a checkout location containing spaces does not break.
echo "Generating HPC benchmark data..."
if [ -f "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" ]; then
    bash "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" || echo "Warning: Graph data generation failed"
fi
if [ -f "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" ]; then
    bash "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" || echo "Warning: Image data generation failed"
fi
if [ -f "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" ]; then
    bash "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" || echo "Warning: DWT data generation failed"
fi
if [ -f "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" ]; then
    # Same best-effort handling as the other generators.
    bash "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" || echo "Warning: VPI data generation failed"
fi
52 changes: 49 additions & 3 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ $(error You must run "source setup_environment before calling make")
endif

ifeq ($(CUDA_GT_7), 1)
all: GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface # mlperf_training
all: GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface hpc # mlperf_training
else
ifeq ($(CUDA_GT_4), 1)
all: pannotia rodinia_2.0-ft proxy-apps dragon-naive microbench rodinia-3.1 ispass-2009 dragon-cdp lonestargpu-2.0 polybench parboil shoc custom_apps
Expand All @@ -19,7 +19,7 @@ accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw
#Disable clean for now, It has a bug!
# clean_dragon-naive clean_pannotia clean_proxy-apps

clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark
clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark clean_hpc
clean_accelwattch: clean_rodinia-3.1 clean_parboil clean_cutlass clean_cuda_samples-11.0 clean_cuda_samples_hw_power clean_rodinia-3.1_hw_power clean_parboil_hw_power clean_accelwattch_ubench

clean_data:
Expand Down Expand Up @@ -543,6 +543,52 @@ clean_heterosync:
clean_cutlass:
rm -rf cuda/cutlass-bench/build

###############################################################################
# Modern HPC Benchmarks - (CUDA 11+, sm_75+)
###############################################################################
# hpc: build the modern HPC benchmark apps.
# Steps, in order: refresh the git submodules, build the "simple" apps
# (cuFFT, cuSolver, image processing) via cuda/HPC, stage the Newton Python
# apps plus launcher scripts in $(BINDIR)/$(BINSUBDIR)/, build the VPI samples
# and copy their binaries, then make a best-effort attempt at the
# cuGraph-based graph apps.
hpc:
	mkdir -p $(BINDIR)/$(BINSUBDIR)/
# Initialize and update submodules (like cutlass pattern).
# A submodule directory that exists but has no .git entry is a broken checkout
# and is removed so `git submodule update` can recreate it. The test uses -e,
# not -d: in a submodule working tree .git is normally a regular "gitfile"
# pointing into the superproject's .git/modules, so a -d test would classify
# every healthy submodule as corrupted and delete it (and its build products)
# on every build.
	@for dir in cuda/cuda-samples cuda/HPC/external/cugraph cuda/HPC/external/newton; do \
		if [ -d "$$dir" ] && [ ! -e "$$dir/.git" ]; then \
			echo "Removing corrupted submodule directory: $$dir"; \
			rm -rf "$$dir"; \
		fi; \
	done
	git submodule sync && git submodule update --init --recursive

# Build simple apps (cuFFT, cuSolver, image processing)
	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC simple

# Stage Newton apps (Newton submodule). The launcher files written below
# source setup_newton.sh and forward their arguments ("$$@" becomes "$@" in
# the generated file) to the Python example.
	cp -r cuda/HPC/newton $(BINDIR)/$(BINSUBDIR)/
	mkdir -p $(BINDIR)/$(BINSUBDIR)/external
	cp -r cuda/HPC/external/newton $(BINDIR)/$(BINSUBDIR)/external/
	bash $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh
	echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/diffsim_ball/example_diffsim_ball.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
	echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/robot_cartpole/example_robot_cartpole.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole
	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole

# Build VPI apps from symlinked sources and copy binaries
	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC vpi
	cp cuda/HPC/vpi/vpi_background_subtractor/build/vpi_sample_14_background_subtractor $(BINDIR)/$(BINSUBDIR)/vpi_background_subtractor
	cp cuda/HPC/vpi/vpi_orb_feature_detector/build/vpi_sample_18_orb_feature_detector $(BINDIR)/$(BINSUBDIR)/vpi_orb_feature_detector
	cp cuda/HPC/vpi/vpi_stereo_disparity/build/vpi_sample_02_stereo_disparity $(BINDIR)/$(BINSUBDIR)/vpi_stereo_disparity

# Build graph apps (cuGraph submodule). The leading '-' makes this step
# best-effort: a failed cuGraph build does not fail the whole hpc target.
	-$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC graph && \
		cp cuda/HPC/graph/mst_standalone/build/mst_standalone $(BINDIR)/$(BINSUBDIR)/

# Remove HPC build products: run the cuda/HPC clean target, then delete the
# submodule build directories and the Newton virtual environment.
# The top-level `clean` target lists `clean_hpc` as a prerequisite, so that
# name must exist; `clean_HPC` is kept as an alias for existing callers.
clean_hpc clean_HPC:
	$(SETENV) $(MAKE) -C cuda/HPC clean
	rm -rf cuda/HPC/external/cugraph/build
	rm -rf cuda/HPC/external/newton/build
	rm -rf cuda/HPC/newton/newton_venv

# clean_deeplearning:
# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean

Expand Down Expand Up @@ -711,7 +757,7 @@ clean_pytorch_examples:
rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae

clean_cuda_samples:
$(MAKE) clean -C ./cuda/cuda-samples/build
-$(MAKE) clean -C ./cuda/cuda-samples/build 2>/dev/null || true

clean_huggingface:
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
Expand Down
25 changes: 25 additions & 0 deletions src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@


// Hardware definition header for the Blackwell RTX 5090 microbenchmark target.
// It only defines compile-time constants consumed by the microbenchmarks;
// hw_def.h selects exactly one such header.
#ifndef BLACK_GB202_DEF_H
#define BLACK_GB202_DEF_H

#include "./common/common.h"
#include "./common/deviceQuery.h"

// NOTE(review): 256 KiB matches the value used for larger data-center parts;
// confirm it is the right maximum L1 size for this GPU.
#define L1_SIZE (256 * 1024) // Max L1 size in bytes

// Clock frequency is left undefined here (commented out).
// #define CLK_FREQUENCY 2010 // frequency in MHz

#define ISSUE_MODEL issue_model::single // single issue core or dual issue
#define CORE_MODEL core_model::subcore // subcore model or shared model
// NOTE(review): confirm GDDR6 is the intended memory model for this board, or
// whether the dram_model enum offers a closer match.
#define DRAM_MODEL dram_model::GDDR6 // memory type
#define WARP_SCHEDS_PER_SM 4 // number of warp schedulers per SM


// Number of SASS HMMA instructions assumed per PTX wmma instruction.
#define SASS_hmma_per_PTX_wmma 2


// L2 cache organisation parameters.
#define L2_BANKS_PER_MEM_CHANNEL 1
#define L2_BANK_WIDTH_in_BYTE 64

#endif
4 changes: 3 additions & 1 deletion src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,9 @@ inline void parseGpuConfigArgs(int argc, char *argv[])
++i;
}
config.MAX_WARPS_PER_SM = config.MAX_THREADS_PER_SM / config.WARP_SIZE;
config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
// Note: MEM_CLK_FREQUENCY is already in MHz from initializeDeviceProp (line 313)
// Do not convert to GHz - the bandwidth calculation expects MHz
// config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
config.BLOCKS_PER_SM = config.MAX_THREADS_PER_SM / config.THREADS_PER_BLOCK;
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
Expand Down
3 changes: 2 additions & 1 deletion src/cuda/GPU_Microbenchmark/hw_def/hw_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
// #include "volta_TITANV_hw_def.h"

// #include "ampere_A100_hw_def.h"
#include "blackwell_B200_hw_def.h"
// #include "blackwell_B200_hw_def.h"
#include "blackwell_RTX5090_hw_def.h"

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,13 @@ int main(int argc, char *argv[])
}
}

config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Recalculate after changing BLOCKS_NUM

unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE;
assert(ARRAY_SIZE * sizeof(float) <
config.L2_SIZE); // Array size must not exceed L2 size

config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM

// config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM // Commented out - causes mismatch on GPUs with MAX_THREADS_PER_SM != 2048
uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));

Expand Down
37 changes: 28 additions & 9 deletions src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,32 @@ int main(int argc, char *argv[])
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);

unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types
float max_bw = (float)config.MEM_BITWIDTH * config.MEM_CLK_FREQUENCY * 2 / 1e3 / 8;
mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0]));
printf("Mem BW= %f (Byte/Clk)\n", mem_bw);
printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6);
printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw);
printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100);

printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
// Find min and max clocks across all threads to get actual kernel execution time
uint32_t minStart = startClk[0], maxStop = stopClk[0];
for (unsigned i = 1; i < config.TOTAL_THREADS; i++) {
if (startClk[i] < minStart) minStart = startClk[i];
if (stopClk[i] > maxStop) maxStop = stopClk[i];
}
uint32_t totalClocks = maxStop - minStart;

unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats (5 reads + 1 write)

// Theoretical max bandwidth in GB/s
// MEM_CLK_FREQUENCY is in MHz, MEM_BITWIDTH is in bits
// BW = (Bus Width / 8 bytes) × (Clock MHz) × 2 (DDR) / 1000 = GB/s
float max_bw = (float)config.MEM_BITWIDTH / 8 * config.MEM_CLK_FREQUENCY * 2 / 1000;

// Achieved bandwidth from CUDA event timing (most accurate)
float achieved_bw_from_time = (float)N / milliseconds / 1e6;

// Achieved bandwidth from cycle count (less accurate, single SM perspective)
float achieved_bw_from_cycles = ((float)N / totalClocks) * config.CLK_FREQUENCY / 1e3;

printf("Mem BW= %f (Byte/Clk)\n", (float)N / totalClocks);
printf("Mem BW (from time)= %f (GB/sec)\n", achieved_bw_from_time);
printf("Mem BW (from cycles)= %f (GB/sec)\n", achieved_bw_from_cycles);
printf("Max Theoretical Mem BW= %f (GB/sec)\n", max_bw);
printf("Mem Efficiency = %f %%\n", (achieved_bw_from_time / max_bw) * 100);

printf("Total Clk number = %u (min start: %u, max stop: %u)\n", totalClocks, minStart, maxStop);
}
100 changes: 100 additions & 0 deletions src/cuda/HPC/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# HPC (H100) Benchmark Suite Makefile (following cutlass-bench pattern)
# Every target in this file is a command rather than a file, hence .PHONY.
.PHONY: all simple graph newton vpi clean cufft cusolver image

# Default: Build all apps (submodules initialized by parent Makefile)
# Builds the four app groups: simple, graph, newton and vpi.
all: simple graph newton vpi


# Simple apps (cuFFT, cuSolver, image processing - CUDA toolkit only)
# `simple` is an aggregate target; each group below runs the Makefile found
# in the listed app directory.
simple: cufft cusolver image

# cuFFT apps (2 directories)
cufft:
$(MAKE) -C cufft/cufft_3d_c2c
$(MAKE) -C cufft/cufft_lto_r2c_c2r

# cuSolver apps (2 directories)
cusolver:
$(MAKE) -C cusolver/cusolver_ormqr
$(MAKE) -C cusolver/cusolver_Xgetrf

# Image processing apps (3 directories)
image:
$(MAKE) -C image/dwtHaar1D
$(MAKE) -C image/recursiveGaussian
$(MAKE) -C image/FDTD3d

# Graph apps (cuGraph submodule - built with CMake)
# Builds libcugraph from external/cugraph via its build.sh, then configures
# and builds graph/mst_standalone with CUGRAPH_ROOT exported for CMake.
# If the submodule directory is missing, a warning is printed and the target
# still succeeds. graph/bfs_standalone is currently not built.
# $(CURDIR) is this Makefile's working directory (set by make itself, also
# under `make -C`), so no extra shell is spawned to find it.
graph:
	@echo "Building cuGraph library from submodule..."
	@if [ -d "external/cugraph" ]; then \
		( cd external/cugraph && ./build.sh libcugraph --skip_cpp_tests ) && \
		echo "Building graph apps..." && \
		export CUGRAPH_ROOT="$(CURDIR)/external/cugraph" && \
		mkdir -p graph/mst_standalone/build && \
		cd graph/mst_standalone/build && \
		cmake .. -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && \
		$(MAKE); \
	else \
		echo "WARNING: cuGraph submodule not found - skipping graph apps"; \
	fi

# Newton apps (Newton submodule - Python-based)
# Nothing is compiled here; the target only prints a status message so that
# `all` can list newton alongside the other app groups.
newton:
@echo "Newton apps ready (Python-based, no build needed)"

# VPI apps (VPI library from system - required)
# Symlink sources from /opt/nvidia/vpi*/samples/ and build locally.
# The first vpi* directory found under /opt/nvidia is used; the target fails
# if none exists or if it has no samples directory.
# For the background subtractor only CMakeLists.txt is linked; main.cpp for
# that app is presumably the copy kept in this repository - TODO confirm.
# The recipe runs under `set -e`, so a failed mkdir/ln/cmake/make stops the
# target instead of being silently skipped.
vpi:
	@echo "Searching for VPI installation..."
	@set -e; \
	VPI_ROOT=$$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | head -1); \
	if [ -z "$$VPI_ROOT" ]; then \
		echo "ERROR: VPI not found in /opt/nvidia/"; \
		echo "Please install VPI from NVIDIA"; \
		exit 1; \
	fi; \
	echo "Found VPI at $$VPI_ROOT"; \
	VPI_SAMPLES="$$VPI_ROOT/samples"; \
	if [ ! -d "$$VPI_SAMPLES" ]; then \
		echo "ERROR: VPI samples directory not found at $$VPI_SAMPLES"; \
		exit 1; \
	fi; \
	echo "Creating symlinks to VPI sample sources..."; \
	mkdir -p vpi/vpi_background_subtractor vpi/vpi_orb_feature_detector vpi/vpi_stereo_disparity; \
	ln -sf "$$VPI_SAMPLES/14-background_subtractor/CMakeLists.txt" vpi/vpi_background_subtractor/CMakeLists.txt; \
	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/main.cpp" vpi/vpi_orb_feature_detector/main.cpp; \
	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/CMakeLists.txt" vpi/vpi_orb_feature_detector/CMakeLists.txt; \
	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/main.cpp" vpi/vpi_stereo_disparity/main.cpp; \
	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/CMakeLists.txt" vpi/vpi_stereo_disparity/CMakeLists.txt; \
	echo "Building VPI apps from symlinked sources..."; \
	for app in vpi_background_subtractor vpi_orb_feature_detector vpi_stereo_disparity; do \
		mkdir -p "vpi/$$app/build"; \
		( cd "vpi/$$app/build" && cmake .. && $(MAKE) ) || exit 1; \
	done

# clean: remove the build products created by this Makefile.
# Sub-make invocations carry a leading '-' so that one app directory without
# a working clean target does not abort the rest of the cleanup.
clean:
# Clean bin directory
	rm -rf bin
# Clean simple apps
	-$(MAKE) -C cufft/cufft_3d_c2c clean
	-$(MAKE) -C cufft/cufft_lto_r2c_c2r clean
	-$(MAKE) -C cusolver/cusolver_ormqr clean
	-$(MAKE) -C cusolver/cusolver_Xgetrf clean
	-$(MAKE) -C image/dwtHaar1D clean
	-$(MAKE) -C image/recursiveGaussian clean
	-$(MAKE) -C image/FDTD3d clean
# Clean graph apps and cuGraph build
	rm -rf graph/bfs_standalone/build
	rm -rf graph/mst_standalone/build
	rm -rf external/cugraph/cpp/build
# Clean Newton
	rm -rf newton/newton_venv
# Clean VPI apps and the symlinks created by the vpi target.
# Only the exact links made by `vpi` are removed, so a main.cpp kept in
# vpi/vpi_background_subtractor is left untouched.
	rm -rf vpi/*/build
	rm -f vpi/vpi_background_subtractor/CMakeLists.txt \
		vpi/vpi_orb_feature_detector/main.cpp \
		vpi/vpi_orb_feature_detector/CMakeLists.txt \
		vpi/vpi_stereo_disparity/main.cpp \
		vpi/vpi_stereo_disparity/CMakeLists.txt
Loading
Loading