Skip to content

Commit 709601a

Browse files
committed
GPU: Replace OpenMP parallelization with TBB
1 parent fb5baae commit 709601a

40 files changed

+789
-770
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,9 @@
2323
#include <condition_variable>
2424
#include <array>
2525

26-
#ifdef WITH_OPENMP
27-
#include <omp.h>
28-
#endif
29-
3026
#include "GPUReconstruction.h"
3127
#include "GPUReconstructionIncludes.h"
28+
#include "GPUReconstructionThreading.h"
3229
#include "GPUROOTDumpCore.h"
3330
#include "GPUConfigDump.h"
3431
#include "GPUChainTracking.h"
@@ -121,17 +118,18 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
121118
}
122119
}
123120

124-
int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
121+
void GPUReconstruction::SetNActiveThreads(int32_t n)
125122
{
126-
#ifdef WITH_OPENMP
127-
omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
123+
mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
124+
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
128125
if (mProcessingSettings.debugLevel >= 3) {
129-
GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
126+
GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
130127
}
131-
return n > mMaxOMPThreads;
132-
#else
133-
return 1;
134-
#endif
128+
}
129+
130+
int32_t GPUReconstruction::getHostThreadIndex()
131+
{
132+
return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
135133
}
136134

137135
int32_t GPUReconstruction::Init()
@@ -197,6 +195,24 @@ int32_t GPUReconstruction::Init()
197195
return 0;
198196
}
199197

198+
namespace o2::gpu::internal
199+
{
200+
static uint32_t getDefaultNThreads()
201+
{
202+
const char* tbbEnv = getenv("TBB_NUM_THREADS");
203+
uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
204+
if (tbbNum) {
205+
return tbbNum;
206+
}
207+
const char* ompEnv = getenv("OMP_NUM_THREADS");
208+
uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
209+
if (ompNum) {
210+
return tbbNum;
211+
}
212+
return tbb::info::default_concurrency();
213+
}
214+
} // namespace o2::gpu::internal
215+
200216
int32_t GPUReconstruction::InitPhaseBeforeDevice()
201217
{
202218
if (mProcessingSettings.printSettings) {
@@ -299,32 +315,37 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
299315
mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
300316
}
301317

302-
#ifdef WITH_OPENMP
303-
if (mProcessingSettings.ompThreads <= 0) {
304-
mProcessingSettings.ompThreads = omp_get_max_threads();
305-
} else {
306-
mProcessingSettings.ompAutoNThreads = false;
307-
omp_set_num_threads(mProcessingSettings.ompThreads);
318+
if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) {
319+
GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
320+
} else if (mProcessingSettings.ompThreads != -1) {
321+
mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads;
322+
GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
308323
}
309-
if (mProcessingSettings.ompKernels) {
310-
if (omp_get_max_active_levels() < 2) {
311-
omp_set_max_active_levels(2);
312-
}
324+
325+
if (mProcessingSettings.nHostThreads <= 0) {
326+
mProcessingSettings.nHostThreads = internal::getDefaultNThreads();
327+
} else {
328+
mProcessingSettings.autoAdjustHostThreads = false;
329+
}
330+
mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
331+
if (mMaster == nullptr) {
332+
mThreading = std::make_shared<GPUReconstructionThreading>();
333+
mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
334+
mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
335+
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
336+
} else {
337+
mThreading = mMaster->mThreading;
313338
}
314-
#else
315-
mProcessingSettings.ompThreads = 1;
316-
#endif
317-
mMaxOMPThreads = mProcessingSettings.ompThreads;
318-
mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
339+
mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads);
319340
if (IsGPU()) {
320341
mNStreams = std::max<int32_t>(mProcessingSettings.nStreams, 3);
321342
}
322343

323344
if (mProcessingSettings.nTPCClustererLanes == -1) {
324-
mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.ompKernels ? (mProcessingSettings.ompThreads >= 4 ? std::min<int32_t>(mProcessingSettings.ompThreads / 2, mProcessingSettings.ompThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mProcessingSettings.ompThreads));
345+
mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mMaxHostThreads));
325346
}
326347
if (mProcessingSettings.overrideClusterizerFragmentLen == -1) {
327-
mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mProcessingSettings.ompThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
348+
mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
328349
}
329350
if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSLICES) {
330351
GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes);

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ namespace gpu
5151
class GPUChain;
5252
struct GPUMemorySizeScalers;
5353
struct GPUReconstructionPipelineContext;
54+
struct GPUReconstructionThreading;
5455
class GPUROOTDumpCore;
5556

5657
namespace gpu_reconstruction_kernels
@@ -206,8 +207,8 @@ class GPUReconstruction
206207
void SetOutputControl(void* ptr, size_t size);
207208
void SetInputControl(void* ptr, size_t size);
208209
GPUOutputControl& OutputControl() { return mOutputControl; }
209-
int32_t GetMaxThreads() const { return mMaxThreads; }
210-
int32_t SetNOMPThreads(int32_t n);
210+
int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
211+
void SetNActiveThreads(int32_t n);
211212
int32_t NStreams() const { return mNStreams; }
212213
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
213214

@@ -234,6 +235,9 @@ class GPUReconstruction
234235
double GetStatKernelTime() { return mStatKernelTime; }
235236
double GetStatWallTime() { return mStatWallTime; }
236237

238+
std::shared_ptr<GPUReconstructionThreading> mThreading;
239+
static int32_t getHostThreadIndex();
240+
237241
protected:
238242
void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
239243
void FreeRegisteredMemory(GPUMemoryResource* res);
@@ -343,11 +347,12 @@ class GPUReconstruction
343347
std::shared_ptr<GPUROOTDumpCore> mROOTDump;
344348
std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;
345349

346-
int32_t mMaxThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
347-
int32_t mThreadId = -1; // Thread ID that is valid for the local CUDA context
348-
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
349-
int32_t mNStreams = 1; // Number of parallel GPU streams
350-
int32_t mMaxOMPThreads = 0; // Maximum number of OMP threads
350+
int32_t mMaxBackendThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
351+
int32_t mThreadId = -1; // Thread ID that is valid for the local CUDA context
352+
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
353+
int32_t mNStreams = 1; // Number of parallel GPU streams
354+
int32_t mMaxHostThreads = 0; // Maximum number of OMP threads
355+
int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
351356

352357
// Management for GPUProcessors
353358
struct ProcessorData {

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "GPUReconstructionCPU.h"
1616
#include "GPUReconstructionIncludes.h"
17+
#include "GPUReconstructionThreading.h"
1718
#include "GPUChain.h"
1819

1920
#include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
4041
#include <unistd.h>
4142
#endif
4243

43-
#if defined(WITH_OPENMP) || defined(_OPENMP)
44-
#include <omp.h>
45-
#else
46-
static inline int32_t omp_get_thread_num() { return 0; }
47-
static inline int32_t omp_get_max_threads() { return 1; }
48-
#endif
49-
5044
using namespace o2::gpu;
5145
using namespace o2::gpu::gpu_reconstruction_kernels;
5246

@@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
6054
Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor
6155
}
6256

63-
int32_t GPUReconstructionCPUBackend::getNOMPThreads()
57+
int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores)
6458
{
65-
int32_t ompThreads = 0;
66-
if (mProcessingSettings.ompKernels == 2) {
67-
ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
68-
if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
69-
ompThreads++;
59+
int32_t nThreads = 0;
60+
if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) {
61+
if (splitCores) {
62+
nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop;
63+
nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop;
64+
} else {
65+
nThreads = mMaxHostThreads;
7066
}
71-
ompThreads = std::max(1, ompThreads);
67+
nThreads = std::max(1, nThreads);
7268
} else {
73-
ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
69+
nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1;
7470
}
75-
return ompThreads;
71+
return nThreads;
7672
}
7773

7874
template <class T, int32_t I, typename... Args>
@@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
8884
}
8985
uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
9086
for (uint32_t k = 0; k < num; k++) {
91-
int32_t ompThreads = getNOMPThreads();
92-
if (ompThreads > 1) {
87+
int32_t nThreads = getNKernelHostThreads(false);
88+
if (nThreads > 1) {
9389
if (mProcessingSettings.debugLevel >= 5) {
94-
printf("Running %d ompThreads\n", ompThreads);
95-
}
96-
GPUCA_OPENMP(parallel for num_threads(ompThreads))
97-
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
98-
typename T::GPUSharedMemory smem;
99-
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
90+
printf("Running %d Threads\n", nThreads);
10091
}
92+
mThreading->activeThreads->execute([&] {
93+
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
94+
typename T::GPUSharedMemory smem;
95+
for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
96+
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
97+
}
98+
});
99+
});
101100
} else {
102101
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
103102
typename T::GPUSharedMemory smem;
@@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
111110
template <>
112111
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
113112
{
114-
#ifdef WITH_OPENMP
115-
int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
116-
if (nOMPThreads > 1) {
117-
GPUCA_OPENMP(parallel num_threads(nOMPThreads))
118-
{
119-
size_t threadSize = size / omp_get_num_threads();
113+
int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
114+
if (nnThreads > 1) {
115+
tbb::parallel_for(0, nnThreads, [&](int iThread) {
116+
size_t threadSize = size / nnThreads;
120117
if (threadSize % 4096) {
121118
threadSize += 4096 - threadSize % 4096;
122119
}
123-
size_t offset = threadSize * omp_get_thread_num();
120+
size_t offset = threadSize * iThread;
124121
size_t mySize = std::min<size_t>(threadSize, size - offset);
125122
if (mySize) {
126123
memset((char*)ptr + offset, 0, mySize);
127-
}
128-
}
129-
} else
130-
#endif
131-
{
124+
} // clang-format off
125+
}, tbb::static_partitioner()); // clang-format on
126+
} else {
132127
memset(ptr, 0, size);
133128
}
134129
return 0;
@@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice()
213208
mHostMemoryPermanent = mHostMemoryBase;
214209
ClearAllocatedMemory();
215210
}
216-
if (mProcessingSettings.ompKernels) {
217-
mBlockCount = getOMPMaxThreads();
211+
if (mProcessingSettings.inKernelParallel) {
212+
mBlockCount = mMaxHostThreads;
218213
}
219214
mThreadId = GetThread();
220215
mProcShadow.mProcessorsProc = processors();
@@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
351346
}
352347
}
353348

354-
int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
355-
{
356-
return omp_get_thread_num();
357-
}
358-
359-
int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
360-
{
361-
return omp_get_max_threads();
362-
}
363-
364349
static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
365350

366351
GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
@@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId()
402387
return id.fetch_add(1);
403388
}
404389

405-
uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max)
390+
uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
406391
{
407-
if (condition && mProcessingSettings.ompKernels != 1) {
408-
mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::min<uint32_t>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
392+
if (condition && mProcessingSettings.inKernelParallel != 1) {
393+
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
409394
} else {
410-
mNestedLoopOmpFactor = 1;
395+
mNActiveThreadsOuterLoop = 1;
411396
}
412397
if (mProcessingSettings.debugLevel >= 5) {
413-
printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor);
398+
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
414399
}
415-
return mNestedLoopOmpFactor;
400+
return mNActiveThreadsOuterLoop;
416401
}
417402

418403
void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,8 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
4343
int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
4444
template <class T, int32_t I>
4545
gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
46-
uint32_t mNestedLoopOmpFactor = 1;
47-
static int32_t getOMPThreadNum();
48-
static int32_t getOMPMaxThreads();
49-
int32_t getNOMPThreads();
46+
uint32_t mNActiveThreadsOuterLoop = 1;
47+
int32_t getNKernelHostThreads(bool splitCores);
5048
};
5149

5250
class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCPUBackend>
@@ -81,8 +79,8 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
8179
HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
8280
HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }
8381

84-
void SetNestedLoopOmpFactor(uint32_t f) { mNestedLoopOmpFactor = f; }
85-
uint32_t SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max);
82+
void SetNActiveThreadsOuterLoop(uint32_t f) { mNActiveThreadsOuterLoop = f; }
83+
uint32_t SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max);
8684

8785
void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream = -1);
8886

@@ -220,8 +218,8 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
220218
return 0;
221219
}
222220
if (mProcessingSettings.debugLevel >= 1) {
223-
t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getOMPThreadNum() : stream);
224-
if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNestedLoopOmpFactor < 2 || getOMPThreadNum() == 0)) {
221+
t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream);
222+
if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) {
225223
t->Start();
226224
}
227225
}
@@ -287,11 +285,11 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num)
287285
static int32_t id = getNextTimerId();
288286
timerMeta* timer = getTimerById(id);
289287
if (timer == nullptr) {
290-
int32_t max = std::max<int32_t>({getOMPMaxThreads(), mProcessingSettings.nStreams});
288+
int32_t max = std::max<int32_t>({mMaxHostThreads, mProcessingSettings.nStreams});
291289
timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep);
292290
}
293291
if (num == -1) {
294-
num = getOMPThreadNum();
292+
num = getHostThreadIndex();
295293
}
296294
if (num < 0 || num >= timer->num) {
297295
throw std::runtime_error("Invalid timer requested");

0 commit comments

Comments
 (0)