AliceO2Group · davidrohr · Feb 22, 2025 · Feb 21, 2025
@@ -23,12 +23,9 @@
 #include <condition_variable>
 #include <array>
 
-#ifdef WITH_OPENMP
-#include <omp.h>
-#endif
-
 #include "GPUReconstruction.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUROOTDumpCore.h"
 #include "GPUConfigDump.h"
 #include "GPUChainTracking.h"
@@ -121,17 +118,18 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
   }
 }
 
-int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
+void GPUReconstruction::SetNActiveThreads(int32_t n) // Sets the number of host threads used for kernels and rebuilds the activeThreads arena accordingly
 {
-#ifdef WITH_OPENMP
-  omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
+  mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads)); // Negative n selects mMaxHostThreads; the result is capped at mMaxHostThreads and never below 1
+  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads); // Replace the arena by a new one constructed with the new thread count
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
+    GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
   }
-  return n > mMaxOMPThreads;
-#else
-  return 1;
-#endif
+}
+
+int32_t GPUReconstruction::getHostThreadIndex() // Index of the calling thread as reported by TBB for its current task arena, never negative
+{
+  return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index()); // Negative results are mapped to 0; NOTE(review): presumably TBB reports a negative value for threads outside any arena - confirm against the oneTBB documentation
+}
 
 int32_t GPUReconstruction::Init()
@@ -197,6 +195,24 @@ int32_t GPUReconstruction::Init()
   return 0;
 }
 
+namespace o2::gpu::internal
+{
+static uint32_t getDefaultNThreads() // Default host thread count: TBB_NUM_THREADS if set to a positive number, else OMP_NUM_THREADS, else TBB's default concurrency
+{
+  const char* tbbEnv = getenv("TBB_NUM_THREADS");
+  const int32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0; // Kept signed so that a negative value cannot wrap around to a huge thread count
+  if (tbbNum > 0) { // Unset, unparsable (atoi yields 0) or non-positive values fall through to the next source
+    return tbbNum;
+  }
+  const char* ompEnv = getenv("OMP_NUM_THREADS");
+  const int32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
+  if (ompNum > 0) {
+    return ompNum; // Must be ompNum: tbbNum is never positive on this path, returning it would yield 0 threads
+  }
+  return tbb::info::default_concurrency(); // No usable environment override
+}
+} // namespace o2::gpu::internal
+
 int32_t GPUReconstruction::InitPhaseBeforeDevice()
 {
   if (mProcessingSettings.printSettings) {
@@ -299,32 +315,37 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
     mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
   }
 
-#ifdef WITH_OPENMP
-  if (mProcessingSettings.ompThreads <= 0) {
-    mProcessingSettings.ompThreads = omp_get_max_threads();
-  } else {
-    mProcessingSettings.ompAutoNThreads = false;
-    omp_set_num_threads(mProcessingSettings.ompThreads);
+  if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) {
+    GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
+  } else if (mProcessingSettings.ompThreads != -1) {
+    mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads;
+    GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
   }
-  if (mProcessingSettings.ompKernels) {
-    if (omp_get_max_active_levels() < 2) {
-      omp_set_max_active_levels(2);
-    }
+
+  if (mProcessingSettings.nHostThreads <= 0) {
+    mProcessingSettings.nHostThreads = internal::getDefaultNThreads();
+  } else {
+    mProcessingSettings.autoAdjustHostThreads = false;
+  }
+  mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
+  if (mMaster == nullptr) {
+    mThreading = std::make_shared<GPUReconstructionThreading>();
+    mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
+    mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
+    mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
+  } else {
+    mThreading = mMaster->mThreading;
   }
-#else
-  mProcessingSettings.ompThreads = 1;
-#endif
-  mMaxOMPThreads = mProcessingSettings.ompThreads;
-  mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
+  mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads);
   if (IsGPU()) {
     mNStreams = std::max<int32_t>(mProcessingSettings.nStreams, 3);
   }
 
   if (mProcessingSettings.nTPCClustererLanes == -1) {
-    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.ompKernels ? (mProcessingSettings.ompThreads >= 4 ? std::min<int32_t>(mProcessingSettings.ompThreads / 2, mProcessingSettings.ompThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mProcessingSettings.ompThreads));
+    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mMaxHostThreads));
   }
   if (mProcessingSettings.overrideClusterizerFragmentLen == -1) {
-    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mProcessingSettings.ompThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
+    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
   }
   if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSLICES) {
     GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes);

@@ -51,6 +51,7 @@ namespace gpu
 class GPUChain;
 struct GPUMemorySizeScalers;
 struct GPUReconstructionPipelineContext;
+struct GPUReconstructionThreading;
 class GPUROOTDumpCore;
 
 namespace gpu_reconstruction_kernels
@@ -206,8 +207,8 @@ class GPUReconstruction
   void SetOutputControl(void* ptr, size_t size);
   void SetInputControl(void* ptr, size_t size);
   GPUOutputControl& OutputControl() { return mOutputControl; }
-  int32_t GetMaxThreads() const { return mMaxThreads; }
-  int32_t SetNOMPThreads(int32_t n);
+  int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
+  void SetNActiveThreads(int32_t n);
   int32_t NStreams() const { return mNStreams; }
   const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
 
@@ -234,6 +235,9 @@ class GPUReconstruction
   double GetStatKernelTime() { return mStatKernelTime; }
   double GetStatWallTime() { return mStatWallTime; }
 
+  std::shared_ptr<GPUReconstructionThreading> mThreading;
+  static int32_t getHostThreadIndex();
+
  protected:
   void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
   void FreeRegisteredMemory(GPUMemoryResource* res);
@@ -343,11 +347,12 @@ class GPUReconstruction
   std::shared_ptr<GPUROOTDumpCore> mROOTDump;
   std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;
 
-  int32_t mMaxThreads = 0;    // Maximum number of threads that may be running, on CPU or GPU
-  int32_t mThreadId = -1;     // Thread ID that is valid for the local CUDA context
-  int32_t mGPUStuck = 0;      // Marks that the GPU is stuck, skip future events
-  int32_t mNStreams = 1;      // Number of parallel GPU streams
-  int32_t mMaxOMPThreads = 0; // Maximum number of OMP threads
+  int32_t mMaxBackendThreads = 0;       // Maximum number of threads that may be running, on CPU or GPU
+  int32_t mThreadId = -1;               // Thread ID that is valid for the local CUDA context
+  int32_t mGPUStuck = 0;                // Marks that the GPU is stuck, skip future events
+  int32_t mNStreams = 1;                // Number of parallel GPU streams
+  int32_t mMaxHostThreads = 0;          // Maximum number of host threads, upper bound for mActiveHostKernelThreads
+  int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
 
   // Management for GPUProcessors
   struct ProcessorData {

@@ -14,6 +14,7 @@
 
 #include "GPUReconstructionCPU.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUChain.h"
 
 #include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
 #include <unistd.h>
 #endif
 
-#if defined(WITH_OPENMP) || defined(_OPENMP)
-#include <omp.h>
-#else
-static inline int32_t omp_get_thread_num() { return 0; }
-static inline int32_t omp_get_max_threads() { return 1; }
-#endif
-
 using namespace o2::gpu;
 using namespace o2::gpu::gpu_reconstruction_kernels;
 
@@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
   Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor
 }
 
-int32_t GPUReconstructionCPUBackend::getNOMPThreads()
+int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores) // Number of host threads a kernel may use; splitCores divides the host threads among the outer-loop threads
 {
-  int32_t ompThreads = 0;
-  if (mProcessingSettings.ompKernels == 2) {
-    ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
-    if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
-      ompThreads++;
+  int32_t nThreads = 0;
+  if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) { // The nonzero check also guards the division and modulo below
+    if (splitCores) {
+      nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop; // Even share per outer-loop thread
+      nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop; // Threads with an index below the remainder get one extra
+    } else {
+      nThreads = mMaxHostThreads;
     }
-    ompThreads = std::max(1, ompThreads);
+    nThreads = std::max(1, nThreads); // Never report fewer than one thread
   } else {
-    ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
+    nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1; // All host threads if in-kernel parallelism is enabled, otherwise serial
   }
-  return ompThreads;
+  return nThreads;
 }
 
 template <class T, int32_t I, typename... Args>
@@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
   }
   uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
   for (uint32_t k = 0; k < num; k++) {
-    int32_t ompThreads = getNOMPThreads();
-    if (ompThreads > 1) {
+    int32_t nThreads = getNKernelHostThreads(false);
+    if (nThreads > 1) {
       if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d ompThreads\n", ompThreads);
-      }
-      GPUCA_OPENMP(parallel for num_threads(ompThreads))
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
-        typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
+        printf("Running %d Threads\n", nThreads);
       }
+      mThreading->activeThreads->execute([&] {
+        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
+          typename T::GPUSharedMemory smem;
+          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
+            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
+          }
+        });
+      });
     } else {
       for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
         typename T::GPUSharedMemory smem;
@@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
 template <>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
-#ifdef WITH_OPENMP
-  int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
-  if (nOMPThreads > 1) {
-    GPUCA_OPENMP(parallel num_threads(nOMPThreads))
-    {
-      size_t threadSize = size / omp_get_num_threads();
+  int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
+  if (nnThreads > 1) {
+    tbb::parallel_for(0, nnThreads, [&](int iThread) {
+      size_t threadSize = size / nnThreads;
       if (threadSize % 4096) {
         threadSize += 4096 - threadSize % 4096;
       }
-      size_t offset = threadSize * omp_get_thread_num();
+      size_t offset = threadSize * iThread;
       size_t mySize = std::min<size_t>(threadSize, size - offset);
       if (mySize) {
         memset((char*)ptr + offset, 0, mySize);
-      }
-    }
-  } else
-#endif
-  {
+      } // clang-format off
+    }, tbb::static_partitioner()); // clang-format on
+  } else {
     memset(ptr, 0, size);
   }
   return 0;
@@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice()
     mHostMemoryPermanent = mHostMemoryBase;
     ClearAllocatedMemory();
   }
-  if (mProcessingSettings.ompKernels) {
-    mBlockCount = getOMPMaxThreads();
+  if (mProcessingSettings.inKernelParallel) {
+    mBlockCount = mMaxHostThreads;
   }
   mThreadId = GetThread();
   mProcShadow.mProcessorsProc = processors();
@@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
   }
 }
 
-int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
-{
-  return omp_get_thread_num();
-}
-
-int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
-{
-  return omp_get_max_threads();
-}
-
 static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
 
 GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
@@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId()
   return id.fetch_add(1);
 }
 
-uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max)
+uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) // Stores the thread count for the outer loop in mNActiveThreadsOuterLoop and returns it
 {
-  if (condition && mProcessingSettings.ompKernels != 1) {
-    mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::min<uint32_t>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
+  if (condition && mProcessingSettings.inKernelParallel != 1) {
+    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads; // Mode 2 caps the count at max, the remaining modes use all mMaxHostThreads
   } else {
-    mNestedLoopOmpFactor = 1;
+    mNActiveThreadsOuterLoop = 1; // condition is false or inKernelParallel == 1: single thread in the outer loop
   }
   if (mProcessingSettings.debugLevel >= 5) {
-    printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor);
+    printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
   }
-  return mNestedLoopOmpFactor;
+  return mNActiveThreadsOuterLoop;
 }
 
 void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)

@@ -43,10 +43,8 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
   int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
   template <class T, int32_t I>
   gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
-  uint32_t mNestedLoopOmpFactor = 1;
-  static int32_t getOMPThreadNum();
-  static int32_t getOMPMaxThreads();
-  int32_t getNOMPThreads();
+  uint32_t mNActiveThreadsOuterLoop = 1;
+  int32_t getNKernelHostThreads(bool splitCores);
 };
 
 class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCPUBackend>
@@ -81,8 +79,8 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
   HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }
 
-  void SetNestedLoopOmpFactor(uint32_t f) { mNestedLoopOmpFactor = f; }
-  uint32_t SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max);
+  void SetNActiveThreadsOuterLoop(uint32_t f) { mNActiveThreadsOuterLoop = f; }
+  uint32_t SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max);
 
   void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream = -1);
 
@@ -220,8 +218,8 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
     return 0;
   }
   if (mProcessingSettings.debugLevel >= 1) {
-    t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getOMPThreadNum() : stream);
-    if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNestedLoopOmpFactor < 2 || getOMPThreadNum() == 0)) {
+    t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream);
+    if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) {
       t->Start();
     }
   }
@@ -287,11 +285,11 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num)
   static int32_t id = getNextTimerId();
   timerMeta* timer = getTimerById(id);
   if (timer == nullptr) {
-    int32_t max = std::max<int32_t>({getOMPMaxThreads(), mProcessingSettings.nStreams});
+    int32_t max = std::max<int32_t>({mMaxHostThreads, mProcessingSettings.nStreams});
     timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep);
   }
   if (num == -1) {
-    num = getOMPThreadNum();
+    num = getHostThreadIndex();
   }
   if (num < 0 || num >= timer->num) {
     throw std::runtime_error("Invalid timer requested");