accel-sim · SamMausberg · Mar 17, 2026 · Mar 18, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
@@ -14,12 +14,18 @@ on:
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:  
   test-12-8:
+    env:
+      CUDA_INSTALL_PATH: /usr/local/cuda 
+      BOOST_ROOT: /usr/include/boost
     runs-on: ubuntu-latest
     container:
-     image: ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8-minimal
+     image: nvidia/cuda:13.1.1-cudnn-devel-ubuntu24.04
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
+      - name: install dependencies
+        run: |
+              apt-get update && apt-get install -y git build-essential cmake libboost-all-dev wget
       # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
       - name: Checkout repository with submodules
         uses: actions/checkout@v4
@@ -36,5 +42,5 @@ jobs:
       - name: Print Successful Apps
         if: always()
         run: |
-              echo "Built `ls bin/12.8/release | wc` Apps:"
-              ls bin/12.8/release
+              echo "Built $(ls bin/*/release | wc -l) Apps:"
+              ls bin/*/release
diff --git a/src/Makefile b/src/Makefile
@@ -13,6 +13,10 @@ all:   pannotia rodinia_2.0-ft proxy-apps microbench rodinia-3.1 ispass-2009 pol
 endif
 endif
 ci:     rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark cutlass_mini cuda_samples
+# Keep the repository pinned to the legacy cuda-samples revision for older toolkits,
+# but use the upstream CUDA 13.1 samples when building with CUDA 13.
+CUDA_SAMPLES_CUDA13_REF := 4f735616ba599fe93cc2c6c85dcb4369260f9643
+CUDA_SAMPLES_PINNED_REF := $(shell git -C .. rev-parse HEAD:src/cuda/cuda-samples 2>/dev/null)
 accelwattch: accelwattch_validation accelwattch_hw_power accelwattch_ubench
 accelwattch_validation: rodinia-3.1_accelwattch_validation parboil_accelwattch_validation cutlass cuda_samples-11.0_accelwattch_validation
 accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw_power
@@ -518,8 +522,40 @@ mlperf_training:
 
 cuda_samples:
 	mkdir -p $(BINDIR)/$(BINSUBDIR)/
-	mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. && $(MAKE)
-	find $(GPUAPPS_ROOT)/src/cuda/cuda-samples/build/Samples  -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ;
+	set -eu; \
+	repo_dir=./cuda/cuda-samples; \
+	desired_ref="$(CUDA_SAMPLES_PINNED_REF)"; \
+	if [ -z "$$desired_ref" ]; then \
+		desired_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \
+	fi; \
+	source_dir="$$repo_dir"; \
+	build_dir="$$repo_dir/build-$(CUDA_VERSION)"; \
+	export_dir=; \
+	cleanup_cuda_samples() { \
+		if [ -n "$$export_dir" ] && [ -d "$$export_dir" ]; then \
+			rm -rf "$$export_dir"; \
+		fi; \
+	}; \
+	trap 'cleanup_cuda_samples' EXIT; \
+	if [ ${CUDA_VERSION_MAJOR} -ge 13 ]; then \
+		desired_ref="$(CUDA_SAMPLES_CUDA13_REF)"; \
+	fi; \
+	original_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \
+	if [ "$$original_ref" != "$$desired_ref" ]; then \
+		export_dir=./cuda/cuda-samples-export-$(CUDA_VERSION); \
+		build_dir="$$repo_dir/build-$(CUDA_VERSION)-compat"; \
+		git -C "$$repo_dir" rev-parse --verify "$$desired_ref^{commit}" >/dev/null; \
+		rm -rf "$$export_dir"; \
+		mkdir -p "$$export_dir"; \
+		archive_file="$$export_dir/source.tar"; \
+		git -C "$$repo_dir" archive --format=tar "$$desired_ref" > "$$archive_file"; \
+		tar -xf "$$archive_file" -C "$$export_dir"; \
+		rm -f "$$archive_file"; \
+		source_dir="$$export_dir"; \
+	fi; \
+	cmake -S "$$source_dir" -B "$$build_dir"; \
+	cmake --build "$$build_dir"; \
+	find "$$build_dir"/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ;
 
 pytorch_examples:
 	mkdir -p $(BINDIR)/$(BINSUBDIR)/
@@ -711,10 +747,11 @@ clean_pytorch_examples:
 	rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae
 
 clean_cuda_samples:
-	$(MAKE) clean -C ./cuda/cuda-samples/build
+	find ./cuda/cuda-samples -maxdepth 1 -type d \( -name 'build' -o -name 'build-*' \) -exec rm -rf {} +
+	find ./cuda -maxdepth 1 -type d -name 'cuda-samples-export-*' -exec rm -rf {} +
 
 clean_huggingface:
 	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
 
 clean_GPU_Microbenchmark:
-	$(MAKE) clean -C ./cuda/GPU_Microbenchmark
+	$(MAKE) clean -C ./cuda/GPU_Microbenchmark
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -46,6 +46,15 @@ struct GpuConfig
     unsigned L2_BANKS = 0;            // L2 Cache Banks (LTCs)
 };
 inline GpuConfig config;
+
+inline int getDeviceAttributeOrZero(cudaDeviceAttr attr, int deviceID)
+{
+    int value = 0;
+    if (cudaDeviceGetAttribute(&value, attr, deviceID) != cudaSuccess)
+        return 0;
+    return value;
+}
+
 // Parses short flags like --sm 80 into a GpuConfig object
 inline void parseGpuConfigArgs(int argc, char *argv[])
 {
@@ -280,8 +289,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
         cudaSetDevice(deviceID);
         cudaGetDeviceProperties(&deviceProp, deviceID);
 
-        int clockRateKHz;
-        cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
+        int clockRateKHz =
+            getDeviceAttributeOrZero(cudaDevAttrClockRate, deviceID);
+        int memoryClockRateKHz =
+            getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, deviceID);
+        int memoryBusWidthBits =
+            getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, deviceID);
 
         // core stats
         config.SM_NUMBER = deviceProp.multiProcessorCount;
@@ -310,9 +323,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
 
         // memory
         config.MEM_SIZE = deviceProp.totalGlobalMem;
-        config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
-        config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
-        config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
+        if (memoryClockRateKHz > 0)
+            config.MEM_CLK_FREQUENCY = memoryClockRateKHz * 1e-3f;
+        if (memoryBusWidthBits > 0)
+            config.MEM_BITWIDTH = memoryBusWidthBits;
+        if (clockRateKHz > 0)
+            config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
 
         // Get FBP_COUNT and L2_BANKS from NVIDIA RM API
         config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);

diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu
@@ -12,9 +12,8 @@ int main(int argc, char *argv[])
   snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n",
            static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
   std::cout << msg;
-  std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f
-            << " Mhz\n";
-  std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n";
+  std::cout << "Memory Clock rate = " << config.MEM_CLK_FREQUENCY << " Mhz\n";
+  std::cout << "Memory Bus Width = " << config.MEM_BITWIDTH << " bit\n";
   std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n";
   std::cout << "Memory channels = "
             << config.FBP_COUNT << "\n";
@@ -38,7 +37,7 @@ int main(int argc, char *argv[])
               << dram_model_freq_ratio[DRAM_MODEL] << std::endl;
 
     // timing
-    float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) /
+    float device_freq_MHZ = (config.MEM_CLK_FREQUENCY * 2) /
                             dram_model_freq_ratio[DRAM_MODEL];
     if (DRAM_MODEL == dram_model::HBM)
     {

diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu
@@ -16,6 +16,14 @@ struct alignas(DATA_SIZE) Data {
 };
 static_assert(sizeof(Data) == DATA_SIZE, "Data struct size mismatch");
 
+static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id = 0) {
+    int value = 0;
+    if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) {
+        return 0;
+    }
+    return value;
+}
+
 
 // sm_80+ required
 __global__ void pipeline_kernel_async(const Data* __restrict__ global,
@@ -174,8 +182,11 @@ for (size_t b = 0; b < num_blocks; b++) {
 }
 double avg_cycles = static_cast<double>(total_cycles) / num_blocks;
 
+int sm_clock_khz = getDeviceAttributeOrZero(cudaDevAttrClockRate);
+int memory_clock_khz = getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate);
+
 // GPU frequency (kHz → Hz)
-double gpu_clock_hz = static_cast<double>(prop.clockRate) * 1000.0;
+double gpu_clock_hz = static_cast<double>(sm_clock_khz) * 1000.0;
 
 // Time in seconds
 double time_sec = avg_cycles / gpu_clock_hz;
@@ -192,8 +203,8 @@ double bw_gbs   = bytes_moved / time_sec / 1e9;
 double bytesclk = bytes_moved / avg_cycles;   // Bytes per GPU cycle
 
 std::cout << "---------------------------------\n";
-std::cout << "SM Clock               = " << prop.clockRate / 1000.0 << " MHz\n";
-std::cout << "Memory Clock           = " << prop.memoryClockRate / 1000.0 << " MHz\n";
+std::cout << "SM Clock               = " << sm_clock_khz / 1000.0 << " MHz\n";
+std::cout << "Memory Clock           = " << memory_clock_khz / 1000.0 << " MHz\n";
 std::cout << "Avg cycles (per block) = " << avg_cycles << "\n";
 std::cout << "Time (s)               = " << time_sec << "\n";
 std::cout << "Bytes moved            = " << bytes_moved / (1024.0*1024*1024) << " GB\n";

diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/Makefile
@@ -2,7 +2,7 @@
 SRC = deviceQuery.cpp
 
 EXE = deviceQuery
-INCLUDE :=  -I../../../../cuda-samples/Common/
+INCLUDE :=  -I../../../../NVIDIA_CUDA-11.0_Samples/common/inc/
 NVCC_FLGAS =
 
 include ../../../common/common.mk
diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp b/src/cuda/GPU_Microbenchmark/ubench/system/deviceQuery/deviceQuery.cpp
@@ -10,6 +10,14 @@ from CUDA SDK
 #include <memory>
 #include <string>
 
+static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id) {
+  int value = 0;
+  if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) {
+    return 0;
+  }
+  return value;
+}
+
 int main(int argc, char **argv) {
   int deviceCount = 0;
   cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
@@ -32,6 +40,11 @@ int main(int argc, char **argv) {
     cudaSetDevice(dev);
     cudaDeviceProp deviceProp;
     cudaGetDeviceProperties(&deviceProp, dev);
+    int smClockKHz = getDeviceAttributeOrZero(cudaDevAttrClockRate, dev);
+    int memoryClockKHz =
+        getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, dev);
+    int memoryBusWidthBits =
+        getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, dev);
 
     // device
     printf("  Device : \"%s\"\n\n", deviceProp.name);
@@ -40,7 +53,7 @@ int main(int argc, char **argv) {
 
     // core
     printf("  GPU Max Clock rate                             : %.0f MHz \n",
-           deviceProp.clockRate * 1e-3f);
+           smClockKHz * 1e-3f);
     printf("  Multiprocessors Count                       : %d\n",
            deviceProp.multiProcessorCount);
     printf("  Maximum number of threads per multiprocessor: %d\n",
@@ -81,9 +94,10 @@ int main(int argc, char **argv) {
              static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
     printf("%s", msg);
     printf("  Memory Clock rate                           : %.0f Mhz\n",
-           deviceProp.memoryClockRate * 1e-3f);
+           memoryClockKHz * 1e-3f);
     printf("  Memory Bus Width                            : %d bit\n",
-           deviceProp.memoryBusWidth);
+           memoryBusWidthBits > 0 ? memoryBusWidthBits
+                                  : deviceProp.memoryBusWidth);
 
     printf(" ////////////////////////// \n");
   }

diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile
@@ -2,7 +2,12 @@ SRC = mbarrier.cu
 
 EXE = mbarrier
 
-ARCH?=sm_80 sm_90a sm_100a sm_101 sm_120 
+ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
+ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_110 sm_120
+else
+ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_101 sm_120
+endif
+ARCH ?= $(ARCH_DEFAULT)
 # Unset the CUDA_CPPFLAGS which is set based on CUDA version
 CUDA_CPPFLAGS=
 

diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile
@@ -3,7 +3,11 @@ SRC = tma_bulk.cu
 EXE = tma_bulk
 
 # TMA is supported on SM_90a and above
-ARCH?=sm_90a sm_100a sm_101 sm_120 
+ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
+ARCH?=sm_90a sm_100a sm_110 sm_120
+else
+ARCH?=sm_90a sm_100a sm_101 sm_120
+endif
 # Unset the CUDA_CPPFLAGS which is set based on CUDA version
 # but TMA is only supported on SM_90a and above
 CUDA_CPPFLAGS=

diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu
@@ -1,6 +1,7 @@
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <unistd.h>
 #include <cuda/barrier>
 #include <cuda/ptx>
 

diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile
@@ -3,7 +3,11 @@ SRC = tma_tensor.cu
 EXE = tma_tensor
 
 # TMA is supported on SM_90a and above
-ARCH?=sm_90a sm_100a sm_101 sm_120 
+ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
+ARCH?=sm_90a sm_100a sm_110 sm_120
+else
+ARCH?=sm_90a sm_100a sm_101 sm_120
+endif
 # Unset the CUDA_CPPFLAGS which is set based on CUDA version
 # but TMA is only supported on SM_90a and above
 CUDA_CPPFLAGS=

diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu
@@ -6,6 +6,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <unistd.h>
 #include <unordered_map>
 using barrier = cuda::barrier<cuda::thread_scope_block>;
 namespace ptx = cuda::ptx;

diff --git a/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu b/src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu
@@ -129,7 +129,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
 											  in,
 											  hid);
 
-  cudaThreadSynchronize();
+  cudaDeviceSynchronize();
 
   cudaError_t error = cudaGetLastError();
 	if (error != cudaSuccess) {

diff --git a/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu b/src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu
@@ -211,7 +211,7 @@ int main(int argc, char* argv[])
 
 		//Add a and b, store in c
 		euclid<<<dimGrid,dimBlock>>>(data, x2, y2, z_d, REC_WINDOW, REC_LENGTH, LATITUDE_POS);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 
 		//Copy data from device memory to host memory
 		cudaMemcpy( z, z_d, sizeof(float)*REC_WINDOW, cudaMemcpyDeviceToHost );

diff --git a/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu b/src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu
@@ -232,7 +232,7 @@ runTest( int argc, char** argv)
 #endif   
 }
 
-    cudaThreadSynchronize();
+    cudaDeviceSynchronize();
 
 #ifdef TIMER
 		CUT_SAFE_CALL( cutStopTimer( timer_1 ));

diff --git a/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu b/src/cuda/rodinia/2.0-ft/streamcluster/streamcluster_cuda.cu
@@ -219,7 +219,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo
 																											center_table_d,				// in:	center index table
 																											switch_membership_d		// out:  changes in membership
 																										  );
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 #ifdef PROFILE
 	double t10 = gettime();

diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper.cu
@@ -75,7 +75,7 @@ kernel_gpu_cuda_wrapper(record *records,
 	//	INITIAL DRIVER OVERHEAD
 	//====================================================================================================100
 
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 	//====================================================================================================100
 	//	EXECUTION PARAMETERS
@@ -223,7 +223,7 @@ kernel_gpu_cuda_wrapper(record *records,
 											offsetD,
 											keysD,
 											ansD);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	checkCUDAError("findK");
 
 	time4 = get_time();

diff --git a/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu b/src/cuda/rodinia/3.1/cuda/b+tree/kernel/kernel_gpu_cuda_wrapper_2.cu
@@ -77,7 +77,7 @@ kernel_gpu_cuda_wrapper_2(	knode *knodes,
 	//	INITIAL DRIVER OVERHEAD
 	//====================================================================================================100
 
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 	//====================================================================================================100
 	//	EXECUTION PARAMETERS
@@ -272,7 +272,7 @@ kernel_gpu_cuda_wrapper_2(	knode *knodes,
 												endD,
 												ansDStart,
 												ansDLength);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	checkCUDAError("findRangeK");
 
 	time4 = get_time();

diff --git a/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu b/src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu
@@ -127,7 +127,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
 											  in,
 											  hid);
 
-  cudaThreadSynchronize();
+  cudaDeviceSynchronize();
 
   cudaError_t error = cudaGetLastError();
 	if (error != cudaSuccess) {