Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ on:
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
test-12-8:
env:
CUDA_INSTALL_PATH: /usr/local/cuda
BOOST_ROOT: /usr/include/boost
runs-on: ubuntu-latest
container:
image: ghcr.io/accel-sim/accel-sim-framework:Ubuntu-24.04-cuda-12.8-minimal
image: nvidia/cuda:13.1.1-cudnn-devel-ubuntu24.04

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
- name: install dependencies
run: |
apt-get update && apt-get install -y git build-essential cmake libboost-all-dev wget
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- name: Checkout repository with submodules
uses: actions/checkout@v4
Expand All @@ -36,5 +42,5 @@ jobs:
- name: Print Successful Apps
if: always()
run: |
echo "Built `ls bin/12.8/release | wc` Apps:"
ls bin/12.8/release
echo "Built $(ls bin/*/release | wc -l) Apps:"
ls bin/*/release
45 changes: 41 additions & 4 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ all: pannotia rodinia_2.0-ft proxy-apps microbench rodinia-3.1 ispass-2009 pol
endif
endif
ci: rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark cutlass_mini cuda_samples
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Samples should build it; it is an NVIDIA collection and should work at different CUDA versions.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It fails with nvcc fatal: Unsupported gpu architecture 'compute_50' when in CUDA 13.1.1 environment.

How would you like me to handle this?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm ok - they probably dropped support for compute_50 in 13 - I am a little shocked they didn't update the samples to build properly in 13, unless they are just lagging on updating samples.
Are you 100% sure that the latest cuda_samples fails to build with cuda13? Maybe we are syncing to an older version of samples and we need to update it?

# Keep the repository pinned to the legacy cuda-samples revision for older toolkits,
# but use the upstream CUDA 13.1 samples when building with CUDA 13.
CUDA_SAMPLES_CUDA13_REF := 4f735616ba599fe93cc2c6c85dcb4369260f9643
CUDA_SAMPLES_PINNED_REF := $(shell git -C .. rev-parse HEAD:src/cuda/cuda-samples 2>/dev/null)
accelwattch: accelwattch_validation accelwattch_hw_power accelwattch_ubench
accelwattch_validation: rodinia-3.1_accelwattch_validation parboil_accelwattch_validation cutlass cuda_samples-11.0_accelwattch_validation
accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw_power
Expand Down Expand Up @@ -518,8 +522,40 @@ mlperf_training:

cuda_samples:
mkdir -p $(BINDIR)/$(BINSUBDIR)/
mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. && $(MAKE)
find $(GPUAPPS_ROOT)/src/cuda/cuda-samples/build/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ;
set -eu; \
repo_dir=./cuda/cuda-samples; \
desired_ref="$(CUDA_SAMPLES_PINNED_REF)"; \
if [ -z "$$desired_ref" ]; then \
desired_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \
fi; \
source_dir="$$repo_dir"; \
build_dir="$$repo_dir/build-$(CUDA_VERSION)"; \
export_dir=; \
cleanup_cuda_samples() { \
if [ -n "$$export_dir" ] && [ -d "$$export_dir" ]; then \
rm -rf "$$export_dir"; \
fi; \
}; \
trap 'cleanup_cuda_samples' EXIT; \
if [ ${CUDA_VERSION_MAJOR} -ge 13 ]; then \
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we have the stuff in setup_environment, do we still need this?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that setup_environment is ok for normal Makefile builds.

cuda_samples is a separate case, I believe cause it hardcodes 'CMAKE_CUDA_ARCHITECTURES' not sure.

^ could be completely wrong about this.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok - this is a lot of code which I think is mostly setting up enviroment stuff for samples.
My hesitation here is that the samples used to build really simply, and I am not sure if we really need all this code or the AI was just working around some problem in a weird way.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it works, we can keep it - @JRPan - who originally integrated samples? Maybe we can check with them?

desired_ref="$(CUDA_SAMPLES_CUDA13_REF)"; \
fi; \
original_ref=$$(git -C "$$repo_dir" rev-parse HEAD); \
if [ "$$original_ref" != "$$desired_ref" ]; then \
export_dir=./cuda/cuda-samples-export-$(CUDA_VERSION); \
build_dir="$$repo_dir/build-$(CUDA_VERSION)-compat"; \
git -C "$$repo_dir" rev-parse --verify "$$desired_ref^{commit}" >/dev/null; \
rm -rf "$$export_dir"; \
mkdir -p "$$export_dir"; \
archive_file="$$export_dir/source.tar"; \
git -C "$$repo_dir" archive --format=tar "$$desired_ref" > "$$archive_file"; \
tar -xf "$$archive_file" -C "$$export_dir"; \
rm -f "$$archive_file"; \
source_dir="$$export_dir"; \
fi; \
cmake -S "$$source_dir" -B "$$build_dir"; \
cmake --build "$$build_dir"; \
find "$$build_dir"/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ;

pytorch_examples:
mkdir -p $(BINDIR)/$(BINSUBDIR)/
Expand Down Expand Up @@ -711,10 +747,11 @@ clean_pytorch_examples:
rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae

clean_cuda_samples:
$(MAKE) clean -C ./cuda/cuda-samples/build
find ./cuda/cuda-samples -maxdepth 1 -type d \( -name 'build' -o -name 'build-*' \) -exec rm -rf {} +
find ./cuda -maxdepth 1 -type d -name 'cuda-samples-export-*' -exec rm -rf {} +

clean_huggingface:
rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface

clean_GPU_Microbenchmark:
$(MAKE) clean -C ./cuda/GPU_Microbenchmark
$(MAKE) clean -C ./cuda/GPU_Microbenchmark
26 changes: 21 additions & 5 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,15 @@ struct GpuConfig
unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs)
};
inline GpuConfig config;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's 0, the uBench will be wrong. Why is it failing?

If the syntax has changed, just update it to use the new syntax. There must still be a way to query device attributes.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wait, I don't think I changed the FBP_COUNT = 0 and L2_BANKS = 0

I think in the simulator path, they are overwritten from gpgpusim.config. In the hardware path filled with queryGrInfo().

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These apps also run on real GPUs, not just in gpgpu-sim. The ubench uses device query to determine kernel parameters. So these value must be correct (matching the HW) for these kernels to perform as expected. This is not the config in the simulation.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it seems like a pre-existing issue in commit 4becbe3.

The only changes I made to stuff like this in the file were to the clock rate, memory clock rate, and memory bus width.

Copy link
Copy Markdown
Author

@SamMausberg SamMausberg Mar 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I think I misunderstood your point, yes, that's true. When I go over this, I can make that fix even though it was pre-existing.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So @JRPan - are you saying we don't need this function?
It is weird to do this 0 think for only these functions.

Isn't the real issue that we are returning 0s for values that should have an attribute?
Generally, I would prefer not to add these extra functions to patch a deeper issue.

Said another way, why aren't we seeing success with these values? We can just fill them in from the gpgpusim config like the other values.

But maybe I am misunderstanding this.

inline int getDeviceAttributeOrZero(cudaDeviceAttr attr, int deviceID)
{
int value = 0;
if (cudaDeviceGetAttribute(&value, attr, deviceID) != cudaSuccess)
return 0;
return value;
}

// Parses short flags like --sm 80 into a GpuConfig object
inline void parseGpuConfigArgs(int argc, char *argv[])
{
Expand Down Expand Up @@ -280,8 +289,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
cudaSetDevice(deviceID);
cudaGetDeviceProperties(&deviceProp, deviceID);

int clockRateKHz;
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
int clockRateKHz =
getDeviceAttributeOrZero(cudaDevAttrClockRate, deviceID);
int memoryClockRateKHz =
getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, deviceID);
int memoryBusWidthBits =
getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, deviceID);

// core stats
config.SM_NUMBER = deviceProp.multiProcessorCount;
Expand Down Expand Up @@ -310,9 +323,12 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])

// memory
config.MEM_SIZE = deviceProp.totalGlobalMem;
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
if (memoryClockRateKHz > 0)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is with all the zeros?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to clarify my intent: the > 0 means if CUDA returns positive, we overwrite the default. If it returns 0, it's because the query is unsupported; it leaves it alone. I made this choice because before we used deviceProp.memoryClockRate and deviceProp.memoryBusWidth, this needed to be switched to cudaDeviceGetAttribute()

I don't think its a correctness issue but yea its not the most readable I can change it if you'd like.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JRPan - Should we just be asserting if we don't get good values here?

@SamMausberg - the issue is that some of the uBenchmarks need good values here to work properly. If we give them back something conforrect, the uBench will not test the thing we think it is supposed to test.

config.MEM_CLK_FREQUENCY = memoryClockRateKHz * 1e-3f;
if (memoryBusWidthBits > 0)
config.MEM_BITWIDTH = memoryBusWidthBits;
if (clockRateKHz > 0)
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;

// Get FBP_COUNT and L2_BANKS from NVIDIA RM API
config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@ int main(int argc, char *argv[])
snprintf(msg, sizeof(msg), "Global memory size = %.0f GB\n",
static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
std::cout << msg;
std::cout << "Memory Clock rate = " << deviceProp.memoryClockRate * 1e-3f
<< " Mhz\n";
std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n";
std::cout << "Memory Clock rate = " << config.MEM_CLK_FREQUENCY << " Mhz\n";
std::cout << "Memory Bus Width = " << config.MEM_BITWIDTH << " bit\n";
std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n";
std::cout << "Memory channels = "
<< config.FBP_COUNT << "\n";
Expand All @@ -38,7 +37,7 @@ int main(int argc, char *argv[])
<< dram_model_freq_ratio[DRAM_MODEL] << std::endl;

// timing
float device_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) /
float device_freq_MHZ = (config.MEM_CLK_FREQUENCY * 2) /
dram_model_freq_ratio[DRAM_MODEL];
if (DRAM_MODEL == dram_model::HBM)
{
Expand Down
17 changes: 14 additions & 3 deletions src/cuda/GPU_Microbenchmark/ubench/shd/shared_cp_async/ldgsts.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ struct alignas(DATA_SIZE) Data {
};
static_assert(sizeof(Data) == DATA_SIZE, "Data struct size mismatch");

static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id = 0) {
int value = 0;
if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) {
return 0;
}
return value;
}


// sm_80+ required
__global__ void pipeline_kernel_async(const Data* __restrict__ global,
Expand Down Expand Up @@ -174,8 +182,11 @@ for (size_t b = 0; b < num_blocks; b++) {
}
double avg_cycles = static_cast<double>(total_cycles) / num_blocks;

int sm_clock_khz = getDeviceAttributeOrZero(cudaDevAttrClockRate);
int memory_clock_khz = getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate);

// GPU frequency (kHz → Hz)
double gpu_clock_hz = static_cast<double>(prop.clockRate) * 1000.0;
double gpu_clock_hz = static_cast<double>(sm_clock_khz) * 1000.0;

// Time in seconds
double time_sec = avg_cycles / gpu_clock_hz;
Expand All @@ -192,8 +203,8 @@ double bw_gbs = bytes_moved / time_sec / 1e9;
double bytesclk = bytes_moved / avg_cycles; // Bytes per GPU cycle

std::cout << "---------------------------------\n";
std::cout << "SM Clock = " << prop.clockRate / 1000.0 << " MHz\n";
std::cout << "Memory Clock = " << prop.memoryClockRate / 1000.0 << " MHz\n";
std::cout << "SM Clock = " << sm_clock_khz / 1000.0 << " MHz\n";
std::cout << "Memory Clock = " << memory_clock_khz / 1000.0 << " MHz\n";
std::cout << "Avg cycles (per block) = " << avg_cycles << "\n";
std::cout << "Time (s) = " << time_sec << "\n";
std::cout << "Bytes moved = " << bytes_moved / (1024.0*1024*1024) << " GB\n";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
SRC = deviceQuery.cpp

EXE = deviceQuery
INCLUDE := -I../../../../cuda-samples/Common/
INCLUDE := -I../../../../NVIDIA_CUDA-11.0_Samples/common/inc/
NVCC_FLGAS =

include ../../../common/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@ from CUDA SDK
#include <memory>
#include <string>

static int getDeviceAttributeOrZero(cudaDeviceAttr attr, int device_id) {
int value = 0;
if (cudaDeviceGetAttribute(&value, attr, device_id) != cudaSuccess) {
return 0;
}
return value;
}

int main(int argc, char **argv) {
int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
Expand All @@ -32,6 +40,11 @@ int main(int argc, char **argv) {
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
int smClockKHz = getDeviceAttributeOrZero(cudaDevAttrClockRate, dev);
int memoryClockKHz =
getDeviceAttributeOrZero(cudaDevAttrMemoryClockRate, dev);
int memoryBusWidthBits =
getDeviceAttributeOrZero(cudaDevAttrGlobalMemoryBusWidth, dev);

// device
printf(" Device : \"%s\"\n\n", deviceProp.name);
Expand All @@ -40,7 +53,7 @@ int main(int argc, char **argv) {

// core
printf(" GPU Max Clock rate : %.0f MHz \n",
deviceProp.clockRate * 1e-3f);
smClockKHz * 1e-3f);
printf(" Multiprocessors Count : %d\n",
deviceProp.multiProcessorCount);
printf(" Maximum number of threads per multiprocessor: %d\n",
Expand Down Expand Up @@ -81,9 +94,10 @@ int main(int argc, char **argv) {
static_cast<float>(deviceProp.totalGlobalMem / 1073741824.0f));
printf("%s", msg);
printf(" Memory Clock rate : %.0f Mhz\n",
deviceProp.memoryClockRate * 1e-3f);
memoryClockKHz * 1e-3f);
printf(" Memory Bus Width : %d bit\n",
deviceProp.memoryBusWidth);
memoryBusWidthBits > 0 ? memoryBusWidthBits
: deviceProp.memoryBusWidth);

printf(" ////////////////////////// \n");
}
Expand Down
7 changes: 6 additions & 1 deletion src/cuda/GPU_Microbenchmark/ubench/tma/mbarrier/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ SRC = mbarrier.cu

EXE = mbarrier

ARCH?=sm_80 sm_90a sm_100a sm_101 sm_120
ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know why all this code got added - @SamMausberg, doesn't this just do the exact same thing as the single-line baseline? A bunch of the makefiles have this - please delete unless there is a good reason to have it.

ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_110 sm_120
else
ARCH_DEFAULT := sm_80 sm_90a sm_100a sm_101 sm_120
endif
ARCH ?= $(ARCH_DEFAULT)
# Unset the CUDA_CPPFLAGS which is set based on CUDA version
CUDA_CPPFLAGS=

Expand Down
6 changes: 5 additions & 1 deletion src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ SRC = tma_bulk.cu
EXE = tma_bulk

# TMA is supported on SM_90a and above
ARCH?=sm_90a sm_100a sm_101 sm_120
ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
ARCH?=sm_90a sm_100a sm_110 sm_120
else
ARCH?=sm_90a sm_100a sm_101 sm_120
endif
# Unset the CUDA_CPPFLAGS which is set based on CUDA version
# but TMA is only supported on SM_90a and above
CUDA_CPPFLAGS=
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cuda/barrier>
#include <cuda/ptx>

Expand Down
6 changes: 5 additions & 1 deletion src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@ SRC = tma_tensor.cu
EXE = tma_tensor

# TMA is supported on SM_90a and above
ARCH?=sm_90a sm_100a sm_101 sm_120
ifneq ($(filter 13 14 15 16,$(CUDA_VERSION_MAJOR)),)
ARCH?=sm_90a sm_100a sm_110 sm_120
else
ARCH?=sm_90a sm_100a sm_101 sm_120
endif
# Unset the CUDA_CPPFLAGS which is set based on CUDA version
# but TMA is only supported on SM_90a and above
CUDA_CPPFLAGS=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <unordered_map>
using barrier = cuda::barrier<cuda::thread_scope_block>;
namespace ptx = cuda::ptx;
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/rodinia/2.0-ft/backprop/backprop_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
in,
hid);

cudaThreadSynchronize();
cudaDeviceSynchronize();
Comment thread
tgrogers marked this conversation as resolved.

cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/rodinia/2.0-ft/nn/nn_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ int main(int argc, char* argv[])

//Add a and b, store in c
euclid<<<dimGrid,dimBlock>>>(data, x2, y2, z_d, REC_WINDOW, REC_LENGTH, LATITUDE_POS);
cudaThreadSynchronize();
cudaDeviceSynchronize();

//Copy data from device memory to host memory
cudaMemcpy( z, z_d, sizeof(float)*REC_WINDOW, cudaMemcpyDeviceToHost );
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/rodinia/2.0-ft/srad/srad_v2/srad.cu
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ runTest( int argc, char** argv)
#endif
}

cudaThreadSynchronize();
cudaDeviceSynchronize();

#ifdef TIMER
CUT_SAFE_CALL( cutStopTimer( timer_1 ));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ float pgain( long x, Points *points, float z, long int *numcenters, int kmax, bo
center_table_d, // in: center index table
switch_membership_d // out: changes in membership
);
cudaThreadSynchronize();
cudaDeviceSynchronize();

#ifdef PROFILE
double t10 = gettime();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ kernel_gpu_cuda_wrapper(record *records,
// INITIAL DRIVER OVERHEAD
//====================================================================================================100

cudaThreadSynchronize();
cudaDeviceSynchronize();

//====================================================================================================100
// EXECUTION PARAMETERS
Expand Down Expand Up @@ -223,7 +223,7 @@ kernel_gpu_cuda_wrapper(record *records,
offsetD,
keysD,
ansD);
cudaThreadSynchronize();
cudaDeviceSynchronize();
checkCUDAError("findK");

time4 = get_time();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes,
// INITIAL DRIVER OVERHEAD
//====================================================================================================100

cudaThreadSynchronize();
cudaDeviceSynchronize();

//====================================================================================================100
// EXECUTION PARAMETERS
Expand Down Expand Up @@ -272,7 +272,7 @@ kernel_gpu_cuda_wrapper_2( knode *knodes,
endD,
ansDStart,
ansDLength);
cudaThreadSynchronize();
cudaDeviceSynchronize();
checkCUDAError("findRangeK");

time4 = get_time();
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/rodinia/3.1/cuda/backprop/backprop_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ void bpnn_train_cuda(BPNN *net, float *eo, float *eh)
in,
hid);

cudaThreadSynchronize();
cudaDeviceSynchronize();

cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
Expand Down
Loading
Loading