2 changes: 2 additions & 0 deletions Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx
@@ -51,6 +51,8 @@
#include "GPUTRDTrackletWord.h"
#include "GPUTRDInterfaces.h"
#include "GPUTRDGeometry.h"
#include "GPUConstantMem.h"
#include "GPUTRDTrackerKernels.h"

#ifdef ENABLE_UPGRADES
#include "ITS3Reconstruction/IOUtils.h"
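Note on this hunk: the two new includes compensate for GPUReconstruction.h no longer pulling in GPUConstantMem.h transitively (see that header's diff below). A minimal sketch of the include-what-you-use pattern, with illustrative names:

```cpp
// widget.h: forward-declare instead of including the heavy header
struct Heavy;                    // incomplete type is enough for pointers/references
struct Widget {
  Heavy* heavy = nullptr;
};

// widget_user.cxx: this TU dereferences Heavy, so it includes the definition itself
// #include "widget.h"
// #include "heavy.h"            // direct include replaces the lost transitive one
```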
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/GPUProcessor.cxx
@@ -14,7 +14,7 @@

#include "GPUProcessor.h"
#include "GPUReconstruction.h"
#include "GPUReconstructionDeviceBase.h"
#include "GPUSettings.h"

using namespace o2::gpu;

257 changes: 146 additions & 111 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx

Large diffs are not rendered by default.

107 changes: 32 additions & 75 deletions GPU/GPUTracking/Base/GPUReconstruction.h
@@ -25,13 +25,13 @@
#include <unordered_map>
#include <unordered_set>

#include "GPUTRDDef.h"
#include "GPUParam.h"
#include "GPUSettings.h"
#include "GPUOutputControl.h"
#include "GPUDataTypes.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPULogging.h"
#include "GPUOutputControl.h"

/*#include "GPUParam.h"
#include "GPUSettings.h"
#include "GPULogging.h"*/

namespace o2::its
{
@@ -49,6 +49,13 @@ struct GPUReconstructionThreading;
class GPUROOTDumpCore;
class ThrustVolatileAllocator;
struct GPUDefParameters;
class GPUMemoryResource;
struct GPUSettingsDeviceBackend;
struct GPUSettingsGRP;
struct GPUSettingsProcessing;
struct GPUSettingsRec;
struct GPUSettingsRecDynamic;
struct GPUMemoryReuse;

namespace gpu_reconstruction_kernels
{
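Review note: the forward declarations added above pair with the settings members turning into std::unique_ptr further down in this diff. A hedged sketch of the idiom; it assumes constructor, destructor, and getters move out of the header, since destroying a unique_ptr&lt;T&gt; needs the complete T:

```cpp
#include <memory>

struct Settings;                        // forward declaration only

class Reco {
 public:
  Reco();                               // defined in the .cxx, where Settings is complete
  ~Reco();                              // out-of-line so ~unique_ptr<Settings> compiles
  const Settings& GetSettings() const;  // dereference also happens out of line
 private:
  std::unique_ptr<Settings> mSettings;
};

// reco.cxx
// #include "settings.h"
// Reco::Reco() : mSettings(std::make_unique<Settings>()) {}
// Reco::~Reco() = default;
// const Settings& Reco::GetSettings() const { return *mSettings; }
```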
@@ -186,18 +193,20 @@ class GPUReconstruction
bool slavesExist() { return mSlaves.size() || mMaster; }

// Getters / setters for parameters
DeviceType GetDeviceType() const { return (DeviceType)mDeviceBackendSettings.deviceType; }
DeviceType GetDeviceType() const;
bool IsGPU() const { return GetDeviceType() != DeviceType::INVALID_DEVICE && GetDeviceType() != DeviceType::CPU; }
const GPUParam& GetParam() const { return mHostConstantMem->param; }
const GPUParam& GetParam() const;
const GPUConstantMem& GetConstantMem() const { return *mHostConstantMem; }
const GPUSettingsGRP& GetGRPSettings() const { return mGRPSettings; }
const GPUSettingsDeviceBackend& GetDeviceBackendSettings() { return mDeviceBackendSettings; }
const GPUSettingsProcessing& GetProcessingSettings() const { return mProcessingSettings; }
const GPUTrackingInOutPointers GetIOPtrs() const;
const GPUSettingsGRP& GetGRPSettings() const { return *mGRPSettings; }
const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return *mDeviceBackendSettings; }
const GPUSettingsProcessing& GetProcessingSettings() const { return *mProcessingSettings; }
const GPUCalibObjectsConst& GetCalib() const;
bool IsInitialized() const { return mInitialized; }
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow = nullptr);
void SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec = nullptr, const GPUSettingsProcessing* proc = nullptr, const GPURecoStepConfiguration* workflow = nullptr);
void SetResetTimers(bool reset) { mProcessingSettings.resetTimers = reset; } // May also be updated after Init()
void SetDebugLevelTmp(int32_t level) { mProcessingSettings.debugLevel = level; } // Temporarily, before calling SetSettings()
void SetResetTimers(bool reset); // May also be updated after Init()
void SetDebugLevelTmp(int32_t level); // Temporarily, before calling SetSettings()
void UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p = nullptr, const GPUSettingsRecDynamic* d = nullptr);
void UpdateDynamicSettings(const GPUSettingsRecDynamic* d);
void SetOutputControl(const GPUOutputControl& v) { mOutputControl = v; }
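A hypothetical call site for the settings API above (the reco pointer and the concrete values are assumptions for illustration, not taken from this diff):

```cpp
// 'reco' is assumed to be a GPUReconstruction* obtained elsewhere
o2::gpu::GPUSettingsGRP grp;              // run-level conditions
o2::gpu::GPUSettingsProcessing proc;      // processing-level options
proc.debugLevel = 3;                      // illustrative value
reco->SetDebugLevelTmp(3);                // effective until SetSettings() runs
reco->SetSettings(&grp, nullptr, &proc);  // rec settings defaulted, workflow omitted
reco->SetResetTimers(true);               // may also be updated after Init()
```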
@@ -272,6 +281,7 @@ class GPUReconstruction
size_t ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr<T[]>* mem, InOutPointerType type, T** nonConstPtrs = nullptr);
template <class T>
T* AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u);
int16_t RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re);

// Private helper functions to dump / load flat objects
template <class T>
@@ -292,17 +302,17 @@
// Pointers to tracker classes
GPUConstantMem* processors() { return mHostConstantMem.get(); }
const GPUConstantMem* processors() const { return mHostConstantMem.get(); }
GPUParam& param() { return mHostConstantMem->param; }
GPUParam& param();
std::unique_ptr<GPUConstantMem> mHostConstantMem;
GPUConstantMem* mDeviceConstantMem = nullptr;

// Settings
GPUSettingsGRP mGRPSettings; // Global Run Parameters
GPUSettingsDeviceBackend mDeviceBackendSettings; // Processing Parameters (at constructor level)
GPUSettingsProcessing mProcessingSettings; // Processing Parameters (at init level)
GPUOutputControl mOutputControl; // Controls the output of the individual components
GPUOutputControl mInputControl; // Predefined input memory location for reading standalone dumps
std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers for how much memory will be needed
std::unique_ptr<GPUSettingsGRP> mGRPSettings; // Global Run Parameters
std::unique_ptr<GPUSettingsDeviceBackend> mDeviceBackendSettings; // Processing Parameters (at constructor level)
std::unique_ptr<GPUSettingsProcessing> mProcessingSettings; // Processing Parameters (at init level)
GPUOutputControl mOutputControl; // Controls the output of the individual components
GPUOutputControl mInputControl; // Predefined input memory location for reading standalone dumps
std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers for how much memory will be needed

GPURecoStepConfiguration mRecoSteps;

@@ -392,35 +402,6 @@ class GPUReconstruction
static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
};

template <class T>
inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u)
{
if (n == 0) {
u.reset(nullptr);
return nullptr;
}
T* retVal;
if (mInputControl.useExternal()) {
u.reset(nullptr);
mInputControl.checkCurrent();
GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n);
if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) {
throw std::bad_alloc();
}
} else {
u.reset(new T[n]);
retVal = u.get();
if (mProcessingSettings.registerStandaloneInputMemory) {
if (registerMemoryForGPU(u.get(), n * sizeof(T))) {
GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T)));
throw std::bad_alloc();
}
}
}
ptr = retVal;
return retVal;
}
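Review note: this helper leaves the header in this PR (the GPUReconstruction.cxx diff above is too large to render, so its new home is not shown). Its external-memory branch is a bump-pointer allocator: align the cursor, carve out the block, and fail hard on overflow. A standalone sketch of that style; computePointerWithAlignment is the real helper, this version is illustrative:

```cpp
#include <cstddef>
#include <cstdint>
#include <new>

template <class T>
T* bumpAlloc(void*& cursor, const void* base, size_t capacity, size_t n)
{
  uintptr_t p = reinterpret_cast<uintptr_t>(cursor);
  p = (p + alignof(T) - 1) & ~uintptr_t(alignof(T) - 1);  // round cursor up to alignof(T)
  T* ret = reinterpret_cast<T*>(p);
  cursor = reinterpret_cast<void*>(p + n * sizeof(T));    // advance past the new block
  if (size_t(static_cast<const char*>(cursor) - static_cast<const char*>(base)) > capacity) {
    throw std::bad_alloc();                               // same overflow policy as above
  }
  return ret;
}
```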

template <class T, typename... Args>
inline T* GPUReconstruction::AddChain(Args... args)
{
@@ -431,31 +412,7 @@ inline T* GPUReconstruction::AddChain(Args... args)
template <class T>
inline int16_t GPUReconstruction::RegisterMemoryAllocation(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re)
{
if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) {
if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !mProcessingSettings.keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU);
} else {
type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU;
}
}
if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) {
type &= ~GPUMemoryResource::MEMORY_GPU;
}
mMemoryResources.emplace_back(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), (GPUMemoryResource::MemoryType)type, name);
if (mMemoryResources.size() >= 32768) {
throw std::bad_alloc();
}
uint16_t retVal = mMemoryResources.size() - 1;
if (re.type != GPUMemoryReuse::NONE && !mProcessingSettings.disableMemoryReuse) {
const auto& it = mMemoryReuse1to1.find(re.id);
if (it == mMemoryReuse1to1.end()) {
mMemoryReuse1to1[re.id] = {proc, retVal};
} else {
mMemoryResources[retVal].mReuse = it->second.res[0];
it->second.res.emplace_back(retVal);
}
}
return retVal;
return RegisterMemoryAllocationHelper(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), type, name, re);
}

template <class T>
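Review note: RegisterMemoryAllocation now keeps only a thin template thunk in the header and forwards to the non-template RegisterMemoryAllocationHelper declared in the private section above, so the heavy body compiles once in the .cxx. A self-contained sketch of the pattern with illustrative names (it assumes T derives from Proc):

```cpp
#include <cstdint>

class Proc { public: virtual ~Proc() = default; };

class Reco {
 public:
  template <class T>
  int16_t Register(T* p, void* (T::*setPtr)(void*))
  {
    // only the member-pointer conversion depends on T; everything else is hidden
    return RegisterHelper(p, static_cast<void* (Proc::*)(void*)>(setPtr));
  }
 private:
  int16_t RegisterHelper(Proc* p, void* (Proc::*setPtr)(void*)); // defined in one .cxx
};
```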
@@ -471,7 +428,7 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
{
static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class");
if (allocate) {
proc->SetMaxData(mHostConstantMem->ioPtrs);
proc->SetMaxData(GetIOPtrs());
}
if (proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && proc->mLinkedProcessor) {
std::memcpy((void*)proc->mLinkedProcessor, (const void*)proc, sizeof(*proc));
49 changes: 24 additions & 25 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -30,19 +30,18 @@
#include "GPUTRDTrackletLabels.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPULogging.h"
#include "GPUMemorySizeScalers.h"
#include "GPUReconstructionProcessingKernels.inc"

#include <atomic>
#include <ctime>

#define GPUCA_LOGGING_PRINTF
#include "GPULogging.h"

#ifndef _WIN32
#include <unistd.h>
#endif

using namespace o2::gpu;
using namespace o2::gpu::gpu_reconstruction_kernels;

constexpr GPUReconstructionCPU::krnlRunRange GPUReconstructionCPU::krnlRunRangeNone;
constexpr GPUReconstructionCPU::krnlEvent GPUReconstructionCPU::krnlEventNone;
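Note the include ordering in the hunk above: GPUCA_LOGGING_PRINTF is now defined immediately before GPULogging.h is included, because the header selects its logging backend at preprocessing time (assumed semantics of the macro). The configure-then-include idiom in miniature:

```cpp
#define GPUCA_LOGGING_PRINTF   // must precede the include; the header branches on it
#include "GPULogging.h"        // now expands its logging macros to the printf backend
```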
@@ -55,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
}

template <class T, int32_t I, typename... Args>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
{
auto& x = _xyz.x;
auto& y = _xyz.y;
@@ -67,7 +66,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
}
int32_t nThreads = getNKernelHostThreads(false);
if (nThreads > 1) {
if (mProcessingSettings.debugLevel >= 5) {
if (GetProcessingSettings().debugLevel >= 5) {
printf("Running %d Threads\n", mThreading->activeThreads->max_concurrency());
}
tbb::this_task_arena::isolate([&] {
@@ -89,7 +88,7 @@
}

template <>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
if (nThreads > 1) {
@@ -110,7 +109,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16,
}
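A standalone sketch of the chunked-clear strategy in the GPUMemClean16 specialization above: clamp the worker count so each thread clears at least roughly 16 MiB, then hand every worker one contiguous slice:

```cpp
#include <tbb/parallel_for.h>
#include <algorithm>
#include <cstdint>
#include <cstring>

void clearParallel(void* ptr, uint64_t size, int32_t maxThreads)
{
  // one thread per ~16 MiB, never fewer than one
  const int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), maxThreads));
  tbb::parallel_for(0, nThreads, [&](int32_t i) {
    const uint64_t chunk = (size + nThreads - 1) / nThreads;      // ceil-divide
    const uint64_t begin = uint64_t(i) * chunk;
    const uint64_t end = std::min<uint64_t>(size, begin + chunk); // last slice may be short
    if (begin < end) {
      memset(static_cast<char*>(ptr) + begin, 0, end - begin);
    }
  });
}
```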

template <class T, int32_t I, typename... Args>
void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
{
#pragma GCC diagnostic push
#if defined(__clang__)
@@ -121,14 +120,14 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
}

template <class S, int32_t I>
gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
{
if (gpu == -1) {
gpu = IsGPU();
}
const auto num = GetKernelNum<S, I>();
const auto* p = gpu ? mParDevice : mParCPU;
gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
if (ret.nThreads == 0) {
ret.nThreads = gpu ? mThreadCount : 1u;
}
@@ -138,9 +137,9 @@ gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProper
return ret;
}

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template void GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template void GPUReconstructionCPU::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
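The block above is an X-macro: GPUReconstructionKernelList.h expands one GPUCA_KRNL(...) entry per kernel, and redefining the macro turns the whole list into explicit template instantiations. A toy, self-contained version of the trick (names are illustrative):

```cpp
#include <cstdio>

struct KernelA {};
struct KernelB {};

template <class K>
void runBackend() { std::printf("kernel of size %zu\n", sizeof(K)); } // body lives in one TU only

// "kernel list header": one entry per kernel
#define MY_KERNEL_LIST \
  MY_KRNL(KernelA)     \
  MY_KRNL(KernelB)

// redefine the per-entry macro to emit explicit instantiations
#define MY_KRNL(k) template void runBackend<k>();
MY_KERNEL_LIST
#undef MY_KRNL

int main() { runBackend<KernelA>(); runBackend<KernelB>(); }
```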

@@ -169,7 +168,7 @@ size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, i
if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) {
continue;
}
if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
continue;
}
if (toGPU) {
@@ -197,7 +196,7 @@ int32_t GPUReconstructionCPU::InitDevice()
{
mActiveHostKernelThreads = mMaxHostThreads;
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (mMaster == nullptr) {
if (mDeviceMemorySize > mHostMemorySize) {
mHostMemorySize = mDeviceMemorySize;
@@ -207,7 +206,7 @@
mHostMemoryPermanent = mHostMemoryBase;
ClearAllocatedMemory();
}
if (mProcessingSettings.inKernelParallel) {
if (GetProcessingSettings().inKernelParallel) {
mBlockCount = mMaxHostThreads;
}
mProcShadow.mProcessorsProc = processors();
@@ -216,7 +215,7 @@

int32_t GPUReconstructionCPU::ExitDevice()
{
if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (mMaster == nullptr) {
operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
}
@@ -232,13 +231,13 @@ int32_t GPUReconstructionCPU::RunChains()
mStatNEvents++;
mNEventsProcessed++;

if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
printf("Allocated memory when starting processing %34s", "");
PrintMemoryOverview();
}
mTimerTotal.Start();
const std::clock_t cpuTimerStart = std::clock();
if (mProcessingSettings.doublePipeline) {
if (GetProcessingSettings().doublePipeline) {
int32_t retVal = EnqueuePipeline();
if (retVal) {
return retVal;
@@ -259,7 +258,7 @@
}
mTimerTotal.Stop();
mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
printf("Allocated memory when ending processing %36s", "");
PrintMemoryOverview();
}
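Review note: RunChains() times wall clock (mTimerTotal) and process CPU time (std::clock()) side by side; with in-kernel host parallelism the CPU time can legitimately exceed the wall time. A standalone sketch of the same dual-clock measurement (workload is illustrative):

```cpp
#include <chrono>
#include <cstdio>
#include <ctime>

int main()
{
  const auto wallStart = std::chrono::steady_clock::now();
  const std::clock_t cpuStart = std::clock();      // CPU time summed over all threads

  volatile double sink = 0;                        // stand-in for the reconstruction work
  for (int i = 0; i < 50000000; i++) {
    sink = sink + i;
  }

  const double cpuSec = double(std::clock() - cpuStart) / CLOCKS_PER_SEC;
  const double wallSec = std::chrono::duration<double>(std::chrono::steady_clock::now() - wallStart).count();
  std::printf("wall %.3f s, cpu %.3f s\n", wallSec, cpuSec);
  return 0;
}
```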
@@ -281,7 +280,7 @@
for (int32_t j = 0; j < mTimers[i]->num; j++) {
HighResTimer& timer = mTimers[i]->timer[j];
time += timer.GetElapsedTime();
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
timer.Reset();
}
}
@@ -297,7 +296,7 @@
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
}
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mTimers[i]->count = 0;
mTimers[i]->memSize = 0;
}
@@ -317,7 +316,7 @@
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
}
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
mTimersRecoSteps[i].timerToGPU.Reset();
mTimersRecoSteps[i].timerToHost.Reset();
@@ -340,7 +339,7 @@
} else if (GetProcessingSettings().debugLevel >= 0) {
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
}
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mStatNEvents = 0;
mStatCPUTime = 0;
mTimerTotal.Reset();
@@ -366,7 +365,7 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons
if (!((size_t)&param().occupancyTotal - (size_t)&param().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) {
throw std::runtime_error("occupancy data not consecutive in GPUParam");
}
const auto threadContext = GetThreadContext();
const auto holdContext = GetThreadContext();
size_t tmp[2] = {(size_t)mapGPU, 0};
memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal));
WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream);
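The runtime check above guards a single WriteToConstantMemory() covering both occupancy fields, which only works if they sit consecutively in GPUParam. A compile-time variant of the same invariant, sketched on a stand-in struct:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

struct ParamLike {                 // stand-in for the relevant corner of GPUParam
  const uint32_t* occupancyMap;
  uint32_t occupancyTotal;
};

// fields must be adjacent so one copy can update both
static_assert(offsetof(ParamLike, occupancyTotal) - offsetof(ParamLike, occupancyMap) == sizeof(const uint32_t*),
              "occupancy data not consecutive");

int main()
{
  size_t tmp[2] = {0, 0};                        // pointer slot + packed total
  const uint32_t total = 42;
  std::memcpy(&tmp[1], &total, sizeof(total));   // mirror the diff's packing
  uint32_t back = 0;
  std::memcpy(&back, &tmp[1], sizeof(back));
  return back == total ? 0 : 1;
}
```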