@@ -14,6 +14,7 @@
 
 #include "GPUReconstructionCPU.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUChain.h"
 
 #include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(WITH_OPENMP) || defined(_OPENMP)
-#include <omp.h>
-#else
-static inline int32_t omp_get_thread_num() { return 0; }
-static inline int32_t omp_get_max_threads() { return 1; }
-#endif
-
 using namespace o2::gpu;
 using namespace o2::gpu::gpu_reconstruction_kernels;
 
@@ -60,19 +54,19 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
   Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
 }
 
-int32_t GPUReconstructionCPUBackend::getNOMPThreads()
+int32_t GPUReconstructionCPUBackend::getNKernelHostThreads()
 {
-  int32_t ompThreads = 0;
-  if (mProcessingSettings.ompKernels == 2) {
-    ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
-    if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
-      ompThreads++;
+  int32_t nThreads = 0;
+  if (mProcessingSettings.inKernelParallel == 2) {
+    nThreads = mProcessingSettings.nHostThreads / mNActiveThreadsOuterLoop;
+    if ((uint32_t)getHostThreadIndex() < mProcessingSettings.nHostThreads % mNActiveThreadsOuterLoop) {
+      nThreads++;
     }
-    ompThreads = std::max(1, ompThreads);
+    nThreads = std::max(1, nThreads);
   } else {
-    ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
+    nThreads = mProcessingSettings.inKernelParallel ? mProcessingSettings.nHostThreads : 1;
   }
-  return ompThreads;
+  return nThreads;
 }
 
 template <class T, int32_t I, typename... Args>
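
Review note: with `inKernelParallel == 2` the kernels only get the share of `nHostThreads` left over by the outer parallel loop, and the first `nHostThreads % mNActiveThreadsOuterLoop` outer threads absorb the remainder. A minimal standalone sketch of the same split arithmetic (the free function `innerThreads` and its parameters are hypothetical stand-ins for the member variables):

#include <algorithm>
#include <cstdint>

// Hypothetical standalone version of the split: how many in-kernel threads
// outer thread 'outerIndex' gets when 'nHostThreads' host threads are shared
// by 'nOuter' concurrently active outer threads.
static int32_t innerThreads(int32_t nHostThreads, int32_t nOuter, int32_t outerIndex)
{
  int32_t n = nHostThreads / nOuter;        // even share for everyone
  if (outerIndex < nHostThreads % nOuter) { // first threads take the remainder
    n++;
  }
  return std::max(1, n);                    // at least one thread each
}

// e.g. innerThreads(16, 3, i) yields 6, 5, 5 for i = 0, 1, 2 (sums to 16)
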
@@ -88,16 +82,17 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
   }
   uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (uint32_t k = 0; k < num; k++) {
-    int32_t ompThreads = getNOMPThreads();
-    if (ompThreads > 1) {
+    int32_t nThreads = getNKernelHostThreads();
+    if (nThreads > 1) {
       if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d ompThreads\n", ompThreads);
+        printf("Running %d Threads\n", nThreads);
       }
-      GPUCA_OPENMP(parallel for num_threads(ompThreads))
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
+      mThreading->activeThreads->execute([&] {
         typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-      }
+        tbb::parallel_for<uint32_t>(0, x.nBlocks, [&](auto iB) {
+          T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
+        });
+      });
     } else {
       for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
         typename T::GPUSharedMemory smem;
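
For reference, the pattern adopted above can be reproduced standalone: a `tbb::task_arena` caps worker concurrency the way `mThreading->activeThreads` appears to (assumed here to wrap an arena sized via `getNKernelHostThreads()`), and `tbb::parallel_for` takes over the work sharing from the removed OpenMP loop. A minimal sketch with hypothetical names (`runBlocks`; the `printf` body stands in for the kernel call):

#include <cstdint>
#include <cstdio>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

// Arena + parallel_for pattern as used above: the arena caps the number of
// TBB workers, parallel_for spreads the block indices across them.
static void runBlocks(uint32_t nBlocks, int32_t nThreads)
{
  tbb::task_arena arena(nThreads); // stand-in for mThreading->activeThreads
  arena.execute([&] {
    tbb::parallel_for<uint32_t>(0, nBlocks, [&](uint32_t iB) {
      std::printf("block %u of %u\n", iB, nBlocks); // stand-in for T::template Thread<I>(...)
    });
  });
}

One design point worth flagging: `smem` now lives in the `execute()` lambda and is captured by reference, so all `parallel_for` iterations share a single `GPUSharedMemory` instance, whereas the removed OpenMP loop constructed one per block iteration.
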
@@ -111,24 +106,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
 template <>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
-#ifdef WITH_OPENMP
-  int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
-  if (nOMPThreads > 1) {
-    GPUCA_OPENMP(parallel num_threads(nOMPThreads))
-    {
-      size_t threadSize = size / omp_get_num_threads();
+  int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads()));
+  if (nnThreads > 1) {
+    tbb::parallel_for(0, nnThreads, [&](int iThread) {
+      size_t threadSize = size / nnThreads;
       if (threadSize % 4096) {
         threadSize += 4096 - threadSize % 4096;
       }
-      size_t offset = threadSize * omp_get_thread_num();
+      size_t offset = threadSize * iThread;
       size_t mySize = std::min<size_t>(threadSize, size - offset);
       if (mySize) {
         memset((char*)ptr + offset, 0, mySize);
-      }
-    }
-  } else
-#endif
-  {
+      } // clang-format off
+    }, tbb::static_partitioner()); // clang-format on
+  } else {
     memset(ptr, 0, size);
   }
   return 0;
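
The memset specialization splits the buffer into 4096-byte-aligned slices, one per thread, and `tbb::static_partitioner` pins exactly one contiguous index range to each worker. A self-contained sketch of that chunking (hypothetical helper `parallelClear`; the extra underflow guard is only needed because the sketch drops the 16 MiB-per-thread cap that bounds `offset` in the original):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <tbb/parallel_for.h>

// Chunked parallel clear along the lines of the specialization above: each of
// nThreads workers zeroes one 4096-byte-aligned slice of the buffer.
static void parallelClear(void* ptr, size_t size, int nThreads)
{
  tbb::parallel_for(0, nThreads, [&](int iThread) {
    size_t threadSize = size / nThreads;
    if (threadSize % 4096) {
      threadSize += 4096 - threadSize % 4096; // round the slice up to 4 KiB
    }
    size_t offset = threadSize * iThread;
    size_t mySize = offset < size ? std::min<size_t>(threadSize, size - offset) : 0;
    if (mySize) {
      memset((char*)ptr + offset, 0, mySize);
    }
  }, tbb::static_partitioner()); // one contiguous index range per worker
}
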
@@ -213,8 +204,8 @@ int32_t GPUReconstructionCPU::InitDevice()
     mHostMemoryPermanent = mHostMemoryBase;
     ClearAllocatedMemory();
   }
-  if (mProcessingSettings.ompKernels) {
-    mBlockCount = getOMPMaxThreads();
+  if (mProcessingSettings.inKernelParallel) {
+    mBlockCount = mMaxHostThreads;
   }
   mThreadId = GetThread();
   mProcShadow.mProcessorsProc = processors();
@@ -351,16 +342,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
   }
 }
 
-int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
-{
-  return omp_get_thread_num();
-}
-
-int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
-{
-  return omp_get_max_threads();
-}
-
 static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
 
 GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
@@ -402,17 +383,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId()
   return id.fetch_add(1);
 }
 
-uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max)
+uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
 {
-  if (condition && mProcessingSettings.ompKernels != 1) {
-    mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::min<uint32_t>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
+  if (condition && mProcessingSettings.inKernelParallel != 1) {
+    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mProcessingSettings.nHostThreads) : mProcessingSettings.nHostThreads;
   } else {
-    mNestedLoopOmpFactor = 1;
+    mNActiveThreadsOuterLoop = 1;
   }
   if (mProcessingSettings.debugLevel >= 5) {
-    printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor);
+    printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
   }
-  return mNestedLoopOmpFactor;
+  return mNActiveThreadsOuterLoop;
 }
 
 void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
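
Taken together, `SetAndGetNActiveThreadsOuterLoop` and `getNKernelHostThreads` keep the total close to `nHostThreads`: in mode 2 the outer loop takes at most `max` threads and each of them requests its inner share. A hypothetical worked example (values assumed, not from the source):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Worked example (assumed numbers): 16 host threads, an outer loop over 3
// items, inKernelParallel == 2.
int main()
{
  const uint32_t nHostThreads = 16, maxOuter = 3;
  const uint32_t nOuter = std::min(maxOuter, nHostThreads); // SetAndGetNActiveThreadsOuterLoop
  for (uint32_t i = 0; i < nOuter; i++) {
    // per-thread inner share, as in getNKernelHostThreads()
    const uint32_t inner = nHostThreads / nOuter + (i < nHostThreads % nOuter ? 1 : 0);
    std::printf("outer thread %u -> %u in-kernel host threads\n", i, inner);
  }
  return 0; // prints 6, 5, 5 -- the shares sum back to nHostThreads
}
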