AliceO2Group
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
Lines changed: 3 additions & 16 deletions
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
Lines changed: 8 additions & 173 deletions
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
Lines changed: 2 additions & 71 deletions
@@ -26,6 +26,7 @@
 #include "GPUReconstruction.h"
 #include "GPUReconstructionIncludes.h"
 #include "GPUReconstructionThreading.h"
+#include "GPUReconstructionIO.h"
 #include "GPUROOTDumpCore.h"
 #include "GPUConfigDump.h"
 #include "GPUChainTracking.h"
@@ -118,15 +119,6 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
   }
 }
 
-void GPUReconstruction::SetNActiveThreads(int32_t n)
-{
-  mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
-  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
-  if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
-  }
-}
-
 int32_t GPUReconstruction::getHostThreadIndex()
 {
   return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
@@ -327,12 +319,12 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
   } else {
     mProcessingSettings.autoAdjustHostThreads = false;
   }
-  mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
+  mMaxHostThreads = mProcessingSettings.nHostThreads;
   if (mMaster == nullptr) {
     mThreading = std::make_shared<GPUReconstructionThreading>();
     mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
     mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
-    mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
+    mThreading->activeThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
   } else {
     mThreading = mMaster->mThreading;
   }
@@ -1181,8 +1173,3 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size)
 {
   mInputControl.set(ptr, size);
 }
-
-GPUReconstruction::GPUThreadContext::GPUThreadContext() = default;
-GPUReconstruction::GPUThreadContext::~GPUThreadContext() = default;
-
-std::unique_ptr<GPUReconstruction::GPUThreadContext> GPUReconstruction::GetThreadContext() { return std::unique_ptr<GPUReconstruction::GPUThreadContext>(new GPUThreadContext); }
@@ -57,12 +57,11 @@ class GPUROOTDumpCore;
 namespace gpu_reconstruction_kernels
 {
 struct deviceEvent;
+class threadContext;
 }
 
 class GPUReconstruction
 {
-  friend class GPUChain;
-
  protected:
   class LibraryLoader; // These must be the first members to ensure correct destructor order!
   std::shared_ptr<LibraryLoader> mMyLib = nullptr;
@@ -207,8 +206,6 @@ class GPUReconstruction
   void SetOutputControl(void* ptr, size_t size);
   void SetInputControl(void* ptr, size_t size);
   GPUOutputControl& OutputControl() { return mOutputControl; }
-  int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
-  void SetNActiveThreads(int32_t n);
   int32_t NStreams() const { return mNStreams; }
   const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
 
@@ -235,8 +232,10 @@ class GPUReconstruction
   double GetStatKernelTime() { return mStatKernelTime; }
   double GetStatWallTime() { return mStatWallTime; }
 
+  // Threading
   std::shared_ptr<GPUReconstructionThreading> mThreading;
   static int32_t getHostThreadIndex();
+  int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
 
  protected:
   void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
@@ -258,13 +257,7 @@ class GPUReconstruction
   virtual int32_t unregisterMemoryForGPU_internal(const void* ptr) = 0;
 
   // Management for GPU thread contexts
-  class GPUThreadContext
-  {
-   public:
-    GPUThreadContext();
-    virtual ~GPUThreadContext();
-  };
-  virtual std::unique_ptr<GPUThreadContext> GetThreadContext();
+  virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() = 0;
 
   // Private helpers for library loading
   static std::shared_ptr<LibraryLoader>* GetLibraryInstance(DeviceType type, bool verbose);
@@ -347,11 +340,10 @@ class GPUReconstruction
   std::shared_ptr<GPUROOTDumpCore> mROOTDump;
   std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;
 
-  int32_t mMaxBackendThreads = 0;       // Maximum number of threads that may be running, on CPU or GPU
-  int32_t mGPUStuck = 0;                // Marks that the GPU is stuck, skip future events
-  int32_t mNStreams = 1;                // Number of parallel GPU streams
-  int32_t mMaxHostThreads = 0;          // Maximum number of OMP threads
-  int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
+  int32_t mMaxBackendThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
+  int32_t mGPUStuck = 0;          // Marks that the GPU is stuck, skip future events
+  int32_t mNStreams = 1;          // Number of parallel GPU streams
+  int32_t mMaxHostThreads = 0;    // Maximum number of OMP threads
 
   // Management for GPUProcessors
   struct ProcessorData {
@@ -491,163 +483,6 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
   }
 }
 
-template <class T, class S>
-inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, const S* num, InOutPointerType type)
-{
-  int32_t count = getNIOTypeMultiplicity(type);
-  uint32_t numTotal = 0;
-  for (int32_t i = 0; i < count; i++) {
-    numTotal += num[i];
-  }
-  if (numTotal == 0) {
-    return 0;
-  }
-  fwrite(&type, sizeof(type), 1, fp);
-  for (int32_t i = 0; i < count; i++) {
-    fwrite(&num[i], sizeof(num[i]), 1, fp);
-    if (num[i]) {
-      fwrite(entries[i], sizeof(*entries[i]), num[i], fp);
-    }
-  }
-  if (mProcessingSettings.debugLevel >= 2) {
-    GPUInfo("Dumped %ld %s", (int64_t)numTotal, IOTYPENAMES[type]);
-  }
-  return numTotal;
-}
-
-template <class T, class S>
-inline size_t GPUReconstruction::ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr<T[]>* mem, InOutPointerType type, T** nonConstPtrs)
-{
-  if (feof(fp)) {
-    return 0;
-  }
-  InOutPointerType inType;
-  size_t r, pos = ftell(fp);
-  r = fread(&inType, sizeof(inType), 1, fp);
-  if (r != 1 || inType != type) {
-    fseek(fp, pos, SEEK_SET);
-    return 0;
-  }
-
-  int32_t count = getNIOTypeMultiplicity(type);
-  size_t numTotal = 0;
-  for (int32_t i = 0; i < count; i++) {
-    r = fread(&num[i], sizeof(num[i]), 1, fp);
-    T* m = AllocateIOMemoryHelper(num[i], entries[i], mem[i]);
-    if (nonConstPtrs) {
-      nonConstPtrs[i] = m;
-    }
-    if (num[i]) {
-      r = fread(m, sizeof(*entries[i]), num[i], fp);
-    }
-    numTotal += num[i];
-  }
-  (void)r;
-  if (mProcessingSettings.debugLevel >= 2) {
-    GPUInfo("Read %ld %s", (int64_t)numTotal, IOTYPENAMES[type]);
-  }
-  return numTotal;
-}
-
-template <class T>
-inline void GPUReconstruction::DumpFlatObjectToFile(const T* obj, const char* file)
-{
-  FILE* fp = fopen(file, "w+b");
-  if (fp == nullptr) {
-    return;
-  }
-  size_t size[2] = {sizeof(*obj), obj->getFlatBufferSize()};
-  fwrite(size, sizeof(size[0]), 2, fp);
-  fwrite(obj, 1, size[0], fp);
-  fwrite(obj->getFlatBufferPtr(), 1, size[1], fp);
-  fclose(fp);
-}
-
-template <class T>
-inline std::unique_ptr<T> GPUReconstruction::ReadFlatObjectFromFile(const char* file)
-{
-  FILE* fp = fopen(file, "rb");
-  if (fp == nullptr) {
-    return nullptr;
-  }
-  size_t size[2] = {0}, r;
-  r = fread(size, sizeof(size[0]), 2, fp);
-  if (r == 0 || size[0] != sizeof(T)) {
-    fclose(fp);
-    GPUError("ERROR reading %s, invalid size: %ld (%ld expected)", file, (int64_t)size[0], (int64_t)sizeof(T));
-    throw std::runtime_error("invalid size");
-  }
-  std::unique_ptr<T> retVal(new T);
-  retVal->destroy();
-  char* buf = new char[size[1]]; // Not deleted as ownership is transferred to FlatObject
-  r = fread((void*)retVal.get(), 1, size[0], fp);
-  r = fread(buf, 1, size[1], fp);
-  fclose(fp);
-  if (mProcessingSettings.debugLevel >= 2) {
-    GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
-  }
-  retVal->clearInternalBufferPtr();
-  retVal->setActualBufferAddress(buf);
-  retVal->adoptInternalBuffer(buf);
-  return retVal;
-}
-
-template <class T>
-inline void GPUReconstruction::DumpStructToFile(const T* obj, const char* file)
-{
-  FILE* fp = fopen(file, "w+b");
-  if (fp == nullptr) {
-    return;
-  }
-  size_t size = sizeof(*obj);
-  fwrite(&size, sizeof(size), 1, fp);
-  fwrite(obj, 1, size, fp);
-  fclose(fp);
-}
-
-template <class T>
-inline std::unique_ptr<T> GPUReconstruction::ReadStructFromFile(const char* file)
-{
-  FILE* fp = fopen(file, "rb");
-  if (fp == nullptr) {
-    return nullptr;
-  }
-  size_t size, r;
-  r = fread(&size, sizeof(size), 1, fp);
-  if (r == 0 || size != sizeof(T)) {
-    fclose(fp);
-    GPUError("ERROR reading %s, invalid size: %ld (%ld expected)", file, (int64_t)size, (int64_t)sizeof(T));
-    throw std::runtime_error("invalid size");
-  }
-  std::unique_ptr<T> newObj(new T);
-  r = fread(newObj.get(), 1, size, fp);
-  fclose(fp);
-  if (mProcessingSettings.debugLevel >= 2) {
-    GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
-  }
-  return newObj;
-}
-
-template <class T>
-inline int32_t GPUReconstruction::ReadStructFromFile(const char* file, T* obj)
-{
-  FILE* fp = fopen(file, "rb");
-  if (fp == nullptr) {
-    return 1;
-  }
-  size_t size, r;
-  r = fread(&size, sizeof(size), 1, fp);
-  if (r == 0) {
-    fclose(fp);
-    return 1;
-  }
-  r = fread(obj, 1, size, fp);
-  fclose(fp);
-  if (mProcessingSettings.debugLevel >= 2) {
-    GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
-  }
-  return 0;
-}
 } // namespace gpu
 } // namespace o2
 
 
@@ -54,23 +54,6 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
   Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor
 }
 
-int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores)
-{
-  int32_t nThreads = 0;
-  if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) {
-    if (splitCores) {
-      nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop;
-      nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop;
-    } else {
-      nThreads = mMaxHostThreads;
-    }
-    nThreads = std::max(1, nThreads);
-  } else {
-    nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1;
-  }
-  return nThreads;
-}
-
 template <class T, int32_t I, typename... Args>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
 {
@@ -198,6 +181,8 @@ int32_t GPUReconstructionCPU::GetThread()
 
 int32_t GPUReconstructionCPU::InitDevice()
 {
+  mActiveHostKernelThreads = mMaxHostThreads;
+  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
   if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
     if (mMaster == nullptr) {
       if (mDeviceMemorySize > mHostMemorySize) {
@@ -339,60 +324,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
   }
 }
 
-static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
-
-GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
-{
-  while (timerFlag.test_and_set()) {
-  }
-  if (mTimers.size() <= id) {
-    mTimers.resize(id + 1);
-  }
-  if (mTimers[id] == nullptr) {
-    if (J >= 0) {
-      name += std::to_string(J);
-    }
-    mTimers[id].reset(new timerMeta{std::unique_ptr<HighResTimer[]>{new HighResTimer[num]}, name, num, type, 1u, step, (size_t)0});
-  } else {
-    mTimers[id]->count++;
-  }
-  timerMeta* retVal = mTimers[id].get();
-  timerFlag.clear();
-  return retVal;
-}
-
-GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::getTimerById(uint32_t id, bool increment)
-{
-  timerMeta* retVal = nullptr;
-  while (timerFlag.test_and_set()) {
-  }
-  if (mTimers.size() > id && mTimers[id]) {
-    retVal = mTimers[id].get();
-    retVal->count += increment;
-  }
-  timerFlag.clear();
-  return retVal;
-}
-
-uint32_t GPUReconstructionCPU::getNextTimerId()
-{
-  static std::atomic<uint32_t> id{0};
-  return id.fetch_add(1);
-}
-
-uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
-{
-  if (condition && mProcessingSettings.inKernelParallel != 1) {
-    mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
-  } else {
-    mNActiveThreadsOuterLoop = 1;
-  }
-  if (mProcessingSettings.debugLevel >= 5) {
-    printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
-  }
-  return mNActiveThreadsOuterLoop;
-}
-
 void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
 {
   param().occupancyMap = mapHost;