accel-sim · Anunalla · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/.gitignore b/.gitignore
@@ -24,4 +24,11 @@ tmp/
 src/cuda/GPU_Microbenchmark/ubench/**/*
 !src/cuda/GPU_Microbenchmark/ubench/**/*/
 !src/cuda/GPU_Microbenchmark/ubench/**/*.*
-!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
+!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
+
+# Ignore compiled CUDA binaries
+*.fatbin
+
+# Ignore VPI symlinks/build output inside each app directory, but keep the tracked source
+src/cuda/HPC/vpi/*/*
+!src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp
diff --git a/.gitmodules b/.gitmodules
@@ -11,3 +11,9 @@
 	path = src/cuda/pytorch_examples
 	url = https://github.com/accel-sim/pytorch_examples.git
 	branch = inference_accelsim_v2
+[submodule "src/cuda/HPC/external/cugraph"]
+	path = src/cuda/HPC/external/cugraph
+	url = https://github.com/rapidsai/cugraph.git
+[submodule "src/cuda/HPC/external/newton"]
+	path = src/cuda/HPC/external/newton
+	url = https://github.com/newton-physics/newton.git
diff --git a/README.md b/README.md
@@ -68,3 +68,16 @@ To compile everything above for AccelWattch:
 ```
 make accelwattch -C ./src
 ```
+
+## H100 Benchmark Suite
+
+The H100 suite contains 15 modern GPU workloads from H100 profiling and analysis:
+
+- **cuFFT** (2 apps): FFT operations using cuFFT library
+- **cuSolver** (2 apps): Linear algebra using cuSolver library
+- **Image Processing** (3 apps): Wavelet transform, Gaussian filter, FDTD3d
+- **Graph Algorithms** (2 apps): BFS and MST using cuGraph (git submodule)
+- **Physics Simulation** (3 apps): Newton physics engine benchmarks (git submodule)
+- **Computer Vision** (3 apps): VPI-based vision processing (requires VPI 4.0)
+
+See [src/cuda/HPC/README.md](src/cuda/HPC/README.md) for details.
diff --git a/get_data.sh b/get_data.sh
@@ -10,3 +10,17 @@ if [ ! -d $DATA_ROOT ]; then
     tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT
     rm all.gpgpu-sim-app-data.tgz
 fi
+
+echo "Generating HPC benchmark data..."  # every generator below is optional and best-effort
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" || echo "Warning: Graph data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" || echo "Warning: Image data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" || echo "Warning: DWT data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" || echo "Warning: VPI data generation failed"  # same fallback as the other generators
+fi
diff --git a/src/Makefile b/src/Makefile
@@ -4,7 +4,7 @@ $(error You must run "source setup_environment before calling make")
 endif
 
 ifeq ($(CUDA_GT_7), 1)
-all:  GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface # mlperf_training
+all:  GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface hpc # mlperf_training
 else
 ifeq ($(CUDA_GT_4), 1)
 all:   pannotia rodinia_2.0-ft proxy-apps dragon-naive microbench rodinia-3.1 ispass-2009 dragon-cdp lonestargpu-2.0 polybench parboil shoc custom_apps
@@ -19,7 +19,7 @@ accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw
 #Disable clean for now, It has a bug!
 # clean_dragon-naive clean_pannotia clean_proxy-apps
 
-clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp  clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark
+clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp  clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark clean_hpc
 clean_accelwattch: clean_rodinia-3.1 clean_parboil clean_cutlass clean_cuda_samples-11.0 clean_cuda_samples_hw_power clean_rodinia-3.1_hw_power clean_parboil_hw_power clean_accelwattch_ubench
 
 clean_data:
@@ -543,6 +543,52 @@ clean_heterosync:
 clean_cutlass:
 	rm -rf cuda/cutlass-bench/build
 
+###############################################################################
+# Modern HPC Benchmarks - (CUDA 11+, sm_75+)
+# Syncs submodules, builds the simple, Newton, VPI and graph app groups and copies the Newton
+# wrappers and VPI/graph binaries into $(BINDIR)/$(BINSUBDIR). Only the graph step may fail ('-').
+###############################################################################
+hpc:
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/
+	# Initialize and update submodules (like cutlass pattern)
+	# Remove submodule directories that have no .git entry at all (a checked-out submodule has a .git file, not a directory)
+	@for dir in cuda/cuda-samples cuda/HPC/external/cugraph cuda/HPC/external/newton; do \
+		if [ -d "$$dir" ] && [ ! -e "$$dir/.git" ]; then \
+			echo "Removing corrupted submodule directory: $$dir"; \
+			rm -rf "$$dir"; \
+		fi; \
+	done
+	git submodule sync && git submodule update --init --recursive
+
+	# Build simple apps (cuFFT, cuSolver, image processing)
+	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC simple
+
+	# Build Newton apps (Newton submodule); the wrappers use source, so they get a bash shebang
+	cp -r cuda/HPC/newton $(BINDIR)/$(BINSUBDIR)/
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/external
+	cp -r cuda/HPC/external/newton $(BINDIR)/$(BINSUBDIR)/external/
+	bash $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh
+	printf '#!/bin/bash\n%s\n' 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/diffsim_ball/example_diffsim_ball.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
+	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
+	printf '#!/bin/bash\n%s\n' 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/robot_cartpole/example_robot_cartpole.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole
+	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole
+
+	# Build VPI apps from symlinked sources and copy binaries
+	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC vpi
+	cp cuda/HPC/vpi/vpi_background_subtractor/build/vpi_sample_14_background_subtractor $(BINDIR)/$(BINSUBDIR)/vpi_background_subtractor
+	cp cuda/HPC/vpi/vpi_orb_feature_detector/build/vpi_sample_18_orb_feature_detector $(BINDIR)/$(BINSUBDIR)/vpi_orb_feature_detector
+	cp cuda/HPC/vpi/vpi_stereo_disparity/build/vpi_sample_02_stereo_disparity $(BINDIR)/$(BINSUBDIR)/vpi_stereo_disparity
+
+	# Build graph apps (cuGraph submodule)
+	-$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC graph && \
+		cp cuda/HPC/graph/mst_standalone/build/mst_standalone $(BINDIR)/$(BINSUBDIR)/
+
+clean_hpc clean_HPC:  # clean_hpc is the name the top-level clean rule depends on; clean_HPC is kept as an alias
+	$(SETENV) $(MAKE) -C cuda/HPC clean
+	rm -rf cuda/HPC/external/cugraph/build
+	rm -rf cuda/HPC/external/newton/build
+	rm -rf cuda/HPC/newton/newton_venv
+
 # clean_deeplearning:
 # 	$(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean
 
@@ -711,7 +757,7 @@ clean_pytorch_examples:
 	rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae
 
 clean_cuda_samples:
-	$(MAKE) clean -C ./cuda/cuda-samples/build
+	$(MAKE) --directory=./cuda/cuda-samples/build clean 2>/dev/null || true  # best-effort: never fails the clean
 
 clean_huggingface:
 	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface

diff --git a/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h
@@ -0,0 +1,25 @@
+// Hardware definition for the Blackwell GeForce RTX 5090 (GB202, per the include guard below).
+// NOTE(review): the constants in this file should be confirmed against the GB202 specifications.
+#ifndef BLACK_GB202_DEF_H
+#define BLACK_GB202_DEF_H
+
+#include "./common/common.h"
+#include "./common/deviceQuery.h"
+
+#define L1_SIZE (256 * 1024) // Max L1 size in bytes; NOTE(review): confirm this value for GB202
+
+// #define CLK_FREQUENCY 2010 // frequency in MHz
+
+#define ISSUE_MODEL issue_model::single // single issue core or dual issue
+#define CORE_MODEL core_model::subcore  // subcore model or shared model
+#define DRAM_MODEL dram_model::GDDR6      // memory type; NOTE(review): RTX 5090 boards are usually listed with GDDR7 - confirm
+#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
+
+
+#define SASS_hmma_per_PTX_wmma 2
+
+
+#define L2_BANKS_PER_MEM_CHANNEL 1
+#define L2_BANK_WIDTH_in_BYTE 64
+
+#endif
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -108,7 +108,9 @@ inline void parseGpuConfigArgs(int argc, char *argv[])
         ++i;
     }
     config.MAX_WARPS_PER_SM = config.MAX_THREADS_PER_SM / config.WARP_SIZE;
-    config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
+    // Note: MEM_CLK_FREQUENCY is already in MHz when it is set by initializeDeviceProp.
+    // Do not convert it to GHz here - the bandwidth calculation expects MHz.
+    // config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
     config.BLOCKS_PER_SM = config.MAX_THREADS_PER_SM / config.THREADS_PER_BLOCK;
     config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
     config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;

diff --git a/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h
@@ -14,6 +14,7 @@
 // #include "volta_TITANV_hw_def.h"
 
 // #include "ampere_A100_hw_def.h"
-#include "blackwell_B200_hw_def.h"
+// #include "blackwell_B200_hw_def.h"
+#include "blackwell_RTX5090_hw_def.h"
 
 #endif
diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
@@ -96,12 +96,13 @@ int main(int argc, char *argv[])
     }
   }
 
+  config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM
+  config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Recalculate after changing BLOCKS_NUM
+
   unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE;
   assert(ARRAY_SIZE * sizeof(float) <
          config.L2_SIZE); // Array size must not exceed L2 size
-
-  config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM
-
+  // BLOCKS_NUM used to be overridden here, after TOTAL_THREADS had already been used; it is now set above so the two stay consistent
   uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
   uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
 

diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
@@ -145,13 +145,32 @@ int main(int argc, char *argv[])
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
 
-  unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types
-  float max_bw = (float)config.MEM_BITWIDTH * config.MEM_CLK_FREQUENCY * 2 / 1e3 / 8;
-  mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0]));
-  printf("Mem BW= %f (Byte/Clk)\n", mem_bw);
-  printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6);
-  printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw);
-  printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100);
-
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
+  // Find min and max clocks across all threads to get actual kernel execution time
+  uint32_t minStart = startClk[0], maxStop = stopClk[0];
+  for (unsigned i = 1; i < config.TOTAL_THREADS; i++) {
+    if (startClk[i] < minStart) minStart = startClk[i];
+    if (stopClk[i] > maxStop) maxStop = stopClk[i];
+  }
+  uint32_t totalClocks = maxStop - minStart;
+
+  unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats (5 reads + 1 write)
+
+  // Theoretical max bandwidth in GB/s
+  // MEM_CLK_FREQUENCY is in MHz, MEM_BITWIDTH is in bits
+  // BW = (bus width in bits / 8) bytes × clock (MHz) × 2 (DDR) / 1000 = GB/s
+  float max_bw = (float)config.MEM_BITWIDTH / 8 * config.MEM_CLK_FREQUENCY * 2 / 1000;
+
+  // Achieved bandwidth from CUDA event timing (most accurate)
+  float achieved_bw_from_time = (float)N / milliseconds / 1e6;
+
+  // Achieved bandwidth from cycle count (earliest start to latest stop over all threads; less accurate)
+  float achieved_bw_from_cycles = ((float)N / totalClocks) * config.CLK_FREQUENCY / 1e3;
+
+  printf("Mem BW= %f (Byte/Clk)\n", (float)N / totalClocks);
+  printf("Mem BW (from time)= %f (GB/sec)\n", achieved_bw_from_time);
+  printf("Mem BW (from cycles)= %f (GB/sec)\n", achieved_bw_from_cycles);
+  printf("Max Theoretical Mem BW= %f (GB/sec)\n", max_bw);
+  printf("Mem Efficiency = %f %%\n", (achieved_bw_from_time / max_bw) * 100);
+
+  printf("Total Clk number = %u (min start: %u, max stop: %u)\n", totalClocks, minStart, maxStop);
 }
diff --git a/src/cuda/HPC/Makefile b/src/cuda/HPC/Makefile
@@ -0,0 +1,100 @@
+# H100 Benchmark Suite Makefile (following cutlass-bench pattern); every target declared here is phony
+.PHONY: all simple graph newton vpi clean cufft cusolver image
+
+# Default goal: build every app group (submodules are initialized by the parent Makefile)
+all: simple graph newton vpi
+
+# Simple apps need only the CUDA toolkit: cuFFT, cuSolver and image-processing samples.
+# Each group target runs the default goal of its sample directories, one after another.
+simple: cufft cusolver image
+
+cufft:
+	$(MAKE) --directory=cufft/cufft_3d_c2c
+	$(MAKE) --directory=cufft/cufft_lto_r2c_c2r
+
+cusolver:
+	$(MAKE) --directory=cusolver/cusolver_ormqr
+	$(MAKE) --directory=cusolver/cusolver_Xgetrf
+
+image:
+	$(MAKE) --directory=image/dwtHaar1D
+	$(MAKE) --directory=image/recursiveGaussian
+	$(MAKE) --directory=image/FDTD3d
+
+# Graph apps (cuGraph submodule - built with CMake).
+# Builds libcugraph from external/cugraph, then configures and builds the standalone MST app
+# with CUGRAPH_ROOT exported into the environment of the cmake/make invocations.
+# The BFS standalone app is currently not built.
+# If external/cugraph is missing, a warning is printed and the target still succeeds;
+# any failing build step makes the target fail.
+graph:
+	@echo "Building cuGraph library from submodule..."
+	@if [ -d "external/cugraph" ]; then \
+		cd external/cugraph && \
+		./build.sh libcugraph --skip_cpp_tests && \
+		echo "Building graph apps..." && \
+		cd ../.. && \
+		export CUGRAPH_ROOT="$(CURDIR)/external/cugraph" && \
+		mkdir -p graph/mst_standalone/build && \
+		cd graph/mst_standalone/build && \
+		cmake .. -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && \
+		$(MAKE); \
+	else \
+		echo "WARNING: cuGraph submodule not found - skipping graph apps"; \
+	fi
+
+# Newton apps (Newton submodule - Python-based); this target builds nothing and only prints a notice
+newton:
+	@echo "Newton apps ready (Python-based, no build needed)"
+
+# VPI apps (VPI library from system - required): picks a vpi* directory under /opt/nvidia, symlinks
+# the sample sources from its samples/ directory into vpi/<app>/ and builds each app with CMake.
+vpi:
+	@echo "Searching for VPI installation..."
+	@set -e; VPI_ROOT=$$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | head -1); \
+	if [ -z "$$VPI_ROOT" ]; then \
+		echo "ERROR: VPI not found in /opt/nvidia/"; \
+		echo "Please install VPI from NVIDIA"; \
+		exit 1; \
+	fi; \
+	echo "Found VPI at $$VPI_ROOT"; \
+	VPI_SAMPLES="$$VPI_ROOT/samples"; \
+	echo "Creating symlinks to VPI sample sources..."; \
+	mkdir -p vpi/vpi_background_subtractor vpi/vpi_orb_feature_detector vpi/vpi_stereo_disparity; \
+	ln -sf "$$VPI_SAMPLES/14-background_subtractor/CMakeLists.txt" vpi/vpi_background_subtractor/CMakeLists.txt; \
+	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/main.cpp" vpi/vpi_orb_feature_detector/main.cpp; \
+	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/CMakeLists.txt" vpi/vpi_orb_feature_detector/CMakeLists.txt; \
+	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/main.cpp" vpi/vpi_stereo_disparity/main.cpp; \
+	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/CMakeLists.txt" vpi/vpi_stereo_disparity/CMakeLists.txt; \
+	echo "Building VPI apps from symlinked sources..."; \
+	mkdir -p vpi/vpi_background_subtractor/build && \
+	cd vpi/vpi_background_subtractor/build && \
+	cmake .. && $(MAKE) && \
+	cd ../../.. && \
+	mkdir -p vpi/vpi_orb_feature_detector/build && \
+	cd vpi/vpi_orb_feature_detector/build && \
+	cmake .. && $(MAKE) && \
+	cd ../../.. && \
+	mkdir -p vpi/vpi_stereo_disparity/build && \
+	cd vpi/vpi_stereo_disparity/build && \
+	cmake .. && $(MAKE)
+
+# Best-effort cleanup of everything this Makefile builds; each step ignores errors ('-' prefix)
+clean:
+	# Clean bin directory
+	-rm -rf bin
+	# Clean simple apps
+	-$(MAKE) -C cufft/cufft_3d_c2c clean
+	-$(MAKE) -C cufft/cufft_lto_r2c_c2r clean
+	-$(MAKE) -C cusolver/cusolver_ormqr clean
+	-$(MAKE) -C cusolver/cusolver_Xgetrf clean
+	-$(MAKE) -C image/dwtHaar1D clean
+	-$(MAKE) -C image/recursiveGaussian clean
+	-$(MAKE) -C image/FDTD3d clean
+	# Clean graph apps and cuGraph build
+	-rm -rf graph/bfs_standalone/build graph/mst_standalone/build external/cugraph/cpp/build
+	# Clean Newton
+	-rm -rf newton/newton_venv
+	# Clean VPI app build directories and the symlinks created by the vpi target
+	-rm -rf vpi/*/build
+	-find vpi -maxdepth 2 -type l -delete