Skip to content

Commit adf1bda

Browse files
committed
GPU: Implement parallel memset for host code
1 parent a2d7b83 commit adf1bda

File tree

2 files changed

+19
-5
lines changed

2 files changed

+19
-5
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
246246
}
247247
if (mProcessingSettings.deterministicGPUReconstruction) {
248248
#ifndef GPUCA_NO_FAST_MATH
249-
GPUError("Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH, otherwise results will never be deterministic!");
249+
GPUError("Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
250250
#endif
251251
mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
252252
param().rec.tpc.nWaysOuter = true;

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,24 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
111111
template <>
112112
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
113113
{
114-
int32_t ompThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
115-
if (ompThreads > 1) {
116-
memset(ptr, 0, size);
117-
} else {
114+
#ifdef WITH_OPENMP
115+
int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
116+
if (nOMPThreads > 1) {
117+
GPUCA_OPENMP(parallel num_threads(nOMPThreads))
118+
{
119+
size_t threadSize = size / omp_get_num_threads();
120+
if (threadSize % 4096) {
121+
threadSize += 4096 - threadSize % 4096;
122+
}
123+
size_t offset = threadSize * omp_get_thread_num();
124+
size_t mySize = std::min<size_t>(threadSize, size - offset);
125+
if (mySize) {
126+
memset((char*)ptr + offset, 0, mySize);
127+
}
128+
}
129+
} else
130+
#endif
131+
{
118132
memset(ptr, 0, size);
119133
}
120134
return 0;

0 commit comments

Comments
 (0)