@@ -14,6 +14,7 @@
 
 #include "GPUReconstructionCPU.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUChain.h"
 
 #include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(WITH_OPENMP) || defined(_OPENMP)
-#include <omp.h>
-#else
-static inline int32_t omp_get_thread_num() { return 0; }
-static inline int32_t omp_get_max_threads() { return 1; }
-#endif
-
 using namespace o2::gpu;
 using namespace o2::gpu::gpu_reconstruction_kernels;
 
@@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
   Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
 }
 
-int32_t GPUReconstructionCPUBackend::getNOMPThreads()
+int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores)
 {
-  int32_t ompThreads = 0;
-  if (mProcessingSettings.ompKernels == 2) {
-    ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
-    if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
-      ompThreads++;
+  int32_t nThreads = 0;
+  if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) {
+    if (splitCores) {
+      nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop;
+      nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop;
+    } else {
+      nThreads = mMaxHostThreads;
     }
-    ompThreads = std::max(1, ompThreads);
+    nThreads = std::max(1, nThreads);
   } else {
-    ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
+    nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1;
   }
-  return ompThreads;
+  return nThreads;
 }
 
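As an aside, a minimal standalone sketch of the splitCores arithmetic introduced above: the inner-thread budget is divided evenly among the outer-loop threads, the remainder goes to the lowest thread indices, and every outer thread keeps at least one inner thread. `innerBudget` is a hypothetical helper for illustration, not part of this patch.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the splitCores branch above: divide
// maxThreads inner threads among nOuter outer threads, give the
// remainder to the lowest indices, and never return less than one.
int32_t innerBudget(int32_t maxThreads, int32_t nOuter, int32_t outerIndex)
{
  return std::max(1, maxThreads / nOuter + (outerIndex < maxThreads % nOuter ? 1 : 0));
}

int main()
{
  // 10 inner threads over 4 outer threads -> budgets 3, 3, 2, 2 (sum 10)
  for (int32_t i = 0; i < 4; i++) {
    printf("outer %d -> %d inner threads\n", i, innerBudget(10, 4, i));
  }
  return 0;
}
```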
 template <class T, int32_t I, typename... Args>
@@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
   }
   uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (uint32_t k = 0; k < num; k++) {
-    int32_t ompThreads = getNOMPThreads();
-    if (ompThreads > 1) {
+    int32_t nThreads = getNKernelHostThreads(false);
+    if (nThreads > 1) {
       if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d ompThreads\n", ompThreads);
-      }
-      GPUCA_OPENMP(parallel for num_threads(ompThreads))
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
-        typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
+        printf("Running %d Threads\n", nThreads);
       }
+      mThreading->activeThreads->execute([&] {
+        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
+          typename T::GPUSharedMemory smem;
+          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
+            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
+          }
+        });
+      });
     } else {
       for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
         typename T::GPUSharedMemory smem;
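For reference, a minimal standalone sketch of the TBB pattern adopted in the hunk above, assuming activeThreads wraps a tbb::task_arena (which the execute()/parallel_for pairing suggests): the arena caps concurrency, and the blocked_range lambda allocates per-chunk state once before iterating. The arena size 4, block count, and printf body are placeholders, not values from this patch.

```cpp
#include <cstdint>
#include <cstdio>
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

int main()
{
  tbb::task_arena arena(4); // cap concurrency, like activeThreads above
  const uint32_t nBlocks = 64;
  arena.execute([&] {
    // Grain size 1 mirrors the diff; per-range state (smem above) is
    // created once per chunk, then reused for every block in the chunk.
    tbb::parallel_for(tbb::blocked_range<uint32_t>(0, nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
      for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
        printf("processing block %u\n", iB);
      }
    });
  });
  return 0;
}
```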
@@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
 template <>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
-#ifdef WITH_OPENMP
-  int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
-  if (nOMPThreads > 1) {
-    GPUCA_OPENMP(parallel num_threads(nOMPThreads))
-    {
-      size_t threadSize = size / omp_get_num_threads();
+  int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
+  if (nnThreads > 1) {
+    tbb::parallel_for(0, nnThreads, [&](int iThread) {
+      size_t threadSize = size / nnThreads;
       if (threadSize % 4096) {
         threadSize += 4096 - threadSize % 4096;
       }
-      size_t offset = threadSize * omp_get_thread_num();
+      size_t offset = threadSize * iThread;
       size_t mySize = std::min<size_t>(threadSize, size - offset);
       if (mySize) {
         memset((char*)ptr + offset, 0, mySize);
-      }
-    }
-  } else
-#endif
-  {
+      } // clang-format off
+    }, tbb::static_partitioner()); // clang-format on
+  } else {
     memset(ptr, 0, size);
   }
   return 0;
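A self-contained sketch of the chunked-clear logic above: the buffer is split into page-aligned chunks, and tbb::static_partitioner hands each worker exactly one contiguous index, so there is no work stealing and each thread clears one chunk. `parallelClear` is a hypothetical helper, and the explicit offset guard is an addition for general inputs, not part of this patch.

```cpp
#include <algorithm>
#include <cstring>
#include <vector>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>

// Hypothetical helper following the clear logic above.
static void parallelClear(char* ptr, size_t size, int nThreads)
{
  size_t threadSize = size / nThreads;
  if (threadSize % 4096) {
    threadSize += 4096 - threadSize % 4096; // round chunks up to 4 KiB
  }
  tbb::parallel_for(0, nThreads, [&](int iThread) {
    size_t offset = threadSize * iThread;
    // Extra guard (not needed above, where each thread has >= 16 MiB):
    size_t mySize = offset < size ? std::min<size_t>(threadSize, size - offset) : 0;
    if (mySize) {
      memset(ptr + offset, 0, mySize);
    }
  }, tbb::static_partitioner()); // one contiguous index per worker
}

int main()
{
  std::vector<char> buf(1 << 22, 1);
  parallelClear(buf.data(), buf.size(), 4);
  return 0;
}
```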
@@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice()
     mHostMemoryPermanent = mHostMemoryBase;
     ClearAllocatedMemory();
   }
-  if (mProcessingSettings.ompKernels) {
-    mBlockCount = getOMPMaxThreads();
+  if (mProcessingSettings.inKernelParallel) {
+    mBlockCount = mMaxHostThreads;
   }
   mThreadId = GetThread();
   mProcShadow.mProcessorsProc = processors();
@@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
   }
 }
 
-int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
-{
-  return omp_get_thread_num();
-}
-
-int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
-{
-  return omp_get_max_threads();
-}
-
 static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
 
 GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
@@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId()
   return id.fetch_add(1);
 }
 
-uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max)
+uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
 {
-  if (condition && mProcessingSettings.ompKernels != 1) {
-    mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::min<uint32_t>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
+  if (condition && mProcessingSettings.inKernelParallel != 1) {
+    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
   } else {
-    mNestedLoopOmpFactor = 1;
+    mNActiveThreadsOuterLoop = 1;
   }
   if (mProcessingSettings.debugLevel >= 5) {
-    printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor);
+    printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
   }
-  return mNestedLoopOmpFactor;
+  return mNActiveThreadsOuterLoop;
 }
 
 void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
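Putting the two levels together, a hedged sketch of how the outer-loop width and the per-thread inner budget might combine: with inKernelParallel == 2, the outer width is capped by both the work and the thread budget, and each outer thread runs its kernels in an arena sized to its share. The slice count, thread budget, and the direct loop over outer indices are illustrative simplifications, not code from this patch.

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

int main()
{
  const int32_t maxThreads = 8; // stand-in for mMaxHostThreads
  const int32_t nSlices = 4;    // stand-in for the "max" argument
  // Outer width, as with inKernelParallel == 2: no wider than the
  // available work or the total thread budget.
  const int32_t nOuter = std::min(nSlices, maxThreads);
  tbb::parallel_for(0, nOuter, [&](int32_t iOuter) {
    // Inner budget per outer thread: the splitCores arithmetic above.
    const int32_t nInner = std::max(1, maxThreads / nOuter + (iOuter < maxThreads % nOuter ? 1 : 0));
    tbb::task_arena arena(nInner); // this outer thread's kernels run here
    arena.execute([&] {
      printf("outer thread %d runs with %d inner threads\n", iOuter, nInner);
    });
  });
  return 0;
}
```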