AliceO2Group · davidrohr · Mar 27, 2025 · Mar 27, 2025 · Mar 26, 2025 · Mar 27, 2025
@@ -58,7 +58,7 @@
 
 #if (defined(__CUDACC__) && defined(GPUCA_CUDA_NO_CONSTANT_MEMORY)) || (defined(__HIPCC__) && defined(GPUCA_HIP_NO_CONSTANT_MEMORY)) || (defined(__OPENCL__) && defined(GPUCA_OPENCL_NO_CONSTANT_MEMORY))
   #define GPUCA_NO_CONSTANT_MEMORY
-#elif defined(__CUDACC__) || defined(__HIPCC__)
+#elif (defined(__CUDACC__) || defined(__HIPCC__)) && !defined(GPUCA_GPUCODE_HOSTONLY) // the macro below is not defined in host-only (GPUCA_GPUCODE_HOSTONLY) translation units, so code testing it needs no separate host-only check
   #define GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM
 #endif
 

@@ -95,7 +95,7 @@ union GPUConstantMemCopyable {
 static constexpr size_t gGPUConstantMemBufferSize = (sizeof(GPUConstantMem) + sizeof(uint4) - 1);
 #endif
 } // namespace o2::gpu
-#if defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) && !defined(GPUCA_GPUCODE_HOSTONLY)
+#if defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM)
 GPUconstant() o2::gpu::GPUConstantMemCopyable gGPUConstantMemBuffer; // TODO: This should go into o2::gpu namespace, but then CUDA or HIP would not find the symbol
 #endif // GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM
 namespace o2::gpu
@@ -104,7 +104,7 @@ namespace o2::gpu
 // Must be placed here, to avoid circular header dependency
 GPUdi() GPUconstantref() const GPUConstantMem* GPUProcessor::GetConstantMem() const
 {
-#if defined(GPUCA_GPUCODE_DEVICE) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM) && !defined(GPUCA_GPUCODE_HOSTONLY)
+#if defined(GPUCA_GPUCODE_DEVICE) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM)
   return &GPUCA_CONSMEM;
 #else
   return mConstantMem;

@@ -16,7 +16,7 @@
 #include "GPUReconstructionIncludes.h"
 #include "GPUReconstructionThreading.h"
 #include "GPUChain.h"
-
+#include "GPUDefParameters.h"
 #include "GPUTPCClusterData.h"
 #include "GPUTPCSectorOutCluster.h"
 #include "GPUTPCGMMergedTrack.h"
@@ -120,15 +120,27 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
 #pragma GCC diagnostic push
 }
 
-template <class T, int32_t I>
-krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend()
+template <class S, int32_t I> // Builds the krnlProperties for kernel <S, I> from the par_LB_* arrays of either the device or the CPU parameter set.
+gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
 {
-  return krnlProperties{1, 1};
+  if (gpu == -1) { // -1 is the default argument of the declaration: let IsGPU() decide
+    gpu = IsGPU();
+  }
+  const auto num = GetKernelNum<S, I>(); // index of this kernel into the par_LB_* arrays
+  const auto* p = gpu ? mParDevice : mParCPU; // nonzero gpu selects mParDevice, zero selects mParCPU
+  gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
+  if (ret.nThreads == 0) { // a stored 0 is replaced by a fallback value
+    ret.nThreads = gpu ? mThreadCount : 1u; // mThreadCount when gpu is nonzero, otherwise 1
+  }
+  if (ret.minBlocks == 0) { // likewise, a stored 0 becomes 1
+    ret.minBlocks = 1;
+  }
+  return ret;
 }
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types)                                                                                                       \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...)                                                                                                  \
   template void GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
-  template krnlProperties GPUReconstructionCPUBackend::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>();
+  template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 

@@ -38,8 +38,6 @@ class GPUReconstructionCPUBackend : public GPUReconstructionProcessing
   void runKernelBackend(const gpu_reconstruction_kernels::krnlSetupArgs<T, I, Args...>& args);
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
-  template <class T, int32_t I>
-  gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
 };
 
 class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCPUBackend>
@@ -55,10 +53,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
   template <class S, int32_t I = 0, typename... Args>
   void runKernel(krnlSetup&& setup, Args&&... args);
   template <class S, int32_t I = 0>
-  const gpu_reconstruction_kernels::krnlProperties getKernelProperties()
-  {
-    return getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument<S, I>());
-  }
+  gpu_reconstruction_kernels::krnlProperties getKernelProperties(int gpu = -1); // Properties of kernel <S, I>; gpu == -1 uses IsGPU(), otherwise nonzero reads the device parameters and zero the CPU parameters
 
   virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false);
   int32_t GPUStuck() { return mGPUStuck; }
@@ -77,13 +72,15 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
 
   GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) {}
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types)                                                                                                                   \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...)                                                                                                              \
   inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
   {                                                                                                                                                                                          \
+    krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)> args(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward));                                                    \
+    const uint32_t num = GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>();                                                                                                                     \
     if (cpuFallback) {                                                                                                                                                                       \
-      GPUReconstructionCPU::runKernelImpl(krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward)));                  \
+      GPUReconstructionCPU::runKernelImpl(num, &args);                                                                                                                                       \
     } else {                                                                                                                                                                                 \
-      runKernelImpl(krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>(setup.x, setup.y, setup.z, timer GPUCA_M_STRIP(x_forward)));                                        \
+      runKernelImpl(num, &args);                                                                                                                                                             \
     }                                                                                                                                                                                        \
   }
 #include "GPUReconstructionKernelList.h"

@@ -38,21 +38,18 @@
 #ifndef GPUCA_KRNL_CUSTOM
 #define GPUCA_KRNL_CUSTOM(...)
 #endif
-#define GPUCA_KRNL_REG_EXTRREG(...) GPUCA_M_STRIP(__VA_ARGS__)
-#define GPUCA_KRNL_CUSTOM_EXTRREG(MODE, ...) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
-#define GPUCA_KRNL_NONE_EXTRREG(MODE, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_REG(MODE, reg, num, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_REG, MODE))(num) GPUCA_ATTRRES_XREG (MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_CUSTOM(MODE, custom, args, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_KRNL_CUSTOM, MODE))(args) GPUCA_ATTRRES_XCUSTOM(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_NONE(MODE, none, ...) GPUCA_ATTRRES_XNONE(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_(MODE, ...)
-#define GPUCA_ATTRRES_XNONE(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_XCUSTOM(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES_XREG(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
-#define GPUCA_ATTRRES(MODE, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(MODE, __VA_ARGS__)
+#define GPUCA_ATTRRES_REG(reg, num, ...) GPUCA_M_EXPAND(GPUCA_KRNL_REG)(num) GPUCA_ATTRRES_XREG (__VA_ARGS__)
+#define GPUCA_ATTRRES_CUSTOM(custom, args, ...) GPUCA_M_EXPAND(GPUCA_KRNL_CUSTOM)(args) GPUCA_ATTRRES_XCUSTOM(__VA_ARGS__)
+#define GPUCA_ATTRRES_NONE(none, ...) GPUCA_ATTRRES_XNONE(__VA_ARGS__)
+#define GPUCA_ATTRRES_(...)
+#define GPUCA_ATTRRES_XNONE(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__)
+#define GPUCA_ATTRRES_XCUSTOM(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__)
+#define GPUCA_ATTRRES_XREG(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__)
+#define GPUCA_ATTRRES(...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(__VA_ARGS__)
 
 // GPU Kernel entry point
 #define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(, GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))
+  GPUg() void GPUCA_ATTRRES(GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))
 
 #ifdef GPUCA_KRNL_DEFONLY
 #define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__);
@@ -79,12 +76,6 @@
     } \
   };
 
-#define GPUCA_KRNL_PROP(x_class, x_attributes) \
-  template <> gpu_reconstruction_kernels::krnlProperties GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>() { \
-    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_EXTRREG, GPUCA_M_STRIP(x_attributes))}; \
-    return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \
-  }
-
 #endif // GPUCA_GPUCODE
 
 #define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)

@@ -95,17 +95,19 @@ class GPUReconstructionKernels : public T
   template <class S, int32_t I = 0, typename... Args>
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs<S, I, Args...>;
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types)                                                                              \
-  virtual void runKernelImpl(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args)                                          \
-  {                                                                                                                                                     \
-    T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args);                                                                                 \
-  }                                                                                                                                                     \
-  virtual gpu_reconstruction_kernels::krnlProperties getKernelPropertiesImpl(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>) \
-  {                                                                                                                                                     \
-    return T::template getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>();                                                                    \
-  }
+  virtual void runKernelImpl(const int num, const void* args) // Dispatches on the kernel number: args is cast back to that kernel's krnlSetupArgs type and passed to T::runKernelBackend. The cast is unchecked, so args must point to the instantiation that belongs to num.
+  { // NOTE(review): no default case is visible here, so a num matching no x_num appears to run nothing — confirm that is intended
+    switch (num) { // clang-format off
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num)                               \
+      case x_num: {                                                                                             \
+        const auto& args2 = *(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>*)args; \
+        T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args2);                                    \
+        break; \
+      }
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
+    } // clang-format on
+  }
 };
 
 } // namespace o2::gpu

@@ -12,11 +12,35 @@
 /// \file GPUReconstructionProcessing.cxx
 /// \author David Rohr
 
+#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS
+#include "GPUDefParametersDefault.h"
+#include "GPUDefParametersLoad.inc"
+
 #include "GPUReconstructionProcessing.h"
 #include "GPUReconstructionThreading.h"
 
 using namespace o2::gpu;
 
+GPUReconstructionProcessing::GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) // Sets up mParCPU and mParDevice: allocated here when there is no master, otherwise shared with the master.
+{
+  if (mMaster == nullptr) { // no master: this instance allocates both objects and deletes them in its destructor
+    mParCPU = new GPUDefParameters(o2::gpu::internal::GPUDefParametersLoad()); // CPU set is copy-constructed from the result of GPUDefParametersLoad()
+    mParDevice = new GPUDefParameters(); // device set is value-initialized here
+  } else { // has a master: reuse the master's pointers, nothing is allocated
+    GPUReconstructionProcessing* master = dynamic_cast<GPUReconstructionProcessing*>(mMaster); // NOTE(review): the cast result is dereferenced without a null check — confirm mMaster is always a GPUReconstructionProcessing
+    mParCPU = master->mParCPU;
+    mParDevice = master->mParDevice;
+  }
+}
+
+GPUReconstructionProcessing::~GPUReconstructionProcessing() // Frees the two parameter objects, but only in the instance that has no master.
+{
+  if (mMaster == nullptr) { // instances with a master only hold copies of the master's pointers (set in the constructor) and must not delete them
+    delete mParCPU;
+    delete mParDevice;
+  }
+}
+
 int32_t GPUReconstructionProcessing::getNKernelHostThreads(bool splitCores)
 {
   int32_t nThreads = 0;
@@ -119,38 +143,22 @@ std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProc
 gpu_reconstruction_kernels::threadContext::threadContext() = default;
 gpu_reconstruction_kernels::threadContext::~threadContext() = default;
 
-template <class T, int32_t I>
-uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k)
-{
-  static int32_t num = k;
-  if (num < 0) {
-    throw std::runtime_error("Internal Error - Kernel Number not Set");
-  }
-  return num;
-}
-
-namespace o2::gpu::internal
-{
-static std::vector<std::string> initKernelNames()
-{
-  std::vector<std::string> retVal;
-#define GPUCA_KRNL(x_class, ...)                                                            \
-  GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(retVal.size()); \
-  retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)));
+const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = { // One entry per GPUCA_KRNL item of the included kernel list: the stringified kernel name, in list order.
+#define GPUCA_KRNL(x_class, ...) GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)),
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-  return retVal;
-}
-} // namespace o2::gpu::internal
-
-const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames();
-
-#define GPUCA_KRNL(x_class, ...)                                                                        \
-  template uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t); \
-  template <>                                                                                           \
-  const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>()              \
-  {                                                                                                     \
-    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                                     \
+};
+
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num)          \
+  template <>                                                                              \
+  uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>()     \
+  {                                                                                        \
+    return x_num;                                                                          \
+  }                                                                                        \
+  template <>                                                                              \
+  const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
+  {                                                                                        \
+    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                        \
   }
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
@@ -25,6 +25,8 @@
 namespace o2::gpu
 {
 
+struct GPUDefParameters;
+
 namespace gpu_reconstruction_kernels
 {
 struct deviceEvent {
@@ -63,7 +65,7 @@ class threadContext
 class GPUReconstructionProcessing : public GPUReconstruction
 {
  public:
-  ~GPUReconstructionProcessing() override = default;
+  ~GPUReconstructionProcessing() override;
 
   // Threading
   int32_t getNKernelHostThreads(bool splitCores);
@@ -78,7 +80,7 @@ class GPUReconstructionProcessing : public GPUReconstruction
   static const char* GetKernelName();
   const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; }
   template <class T, int32_t I = 0>
-  static uint32_t GetKernelNum(int32_t k = -1);
+  static uint32_t GetKernelNum();
 
   // Public queries for timers
   auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
@@ -101,7 +103,7 @@ class GPUReconstructionProcessing : public GPUReconstruction
   };
 
  protected:
-  GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
+  GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg);
   using deviceEvent = gpu_reconstruction_kernels::deviceEvent;
 
   static const std::vector<std::string> mKernelNames;
@@ -132,6 +134,9 @@ class GPUReconstructionProcessing : public GPUReconstruction
   template <class T, int32_t J = -1>
   HighResTimer& getTimer(const char* name, int32_t num = -1);
 
+  GPUDefParameters* mParCPU = nullptr;
+  GPUDefParameters* mParDevice = nullptr;
+
  private:
   uint32_t getNextTimerId();
   timerMeta* getTimerById(uint32_t id, bool increment = true);

@@ -13,8 +13,13 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_HOSTONLY
-#include "GPUReconstructionCUDAIncludesHost.h"
 
+#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS
+#include "GPUReconstructionCUDADef.h"
+#include "GPUDefParametersDefault.h"
+#include "GPUDefParametersLoad.inc"
+
+#include "GPUReconstructionCUDAIncludesHost.h"
 #include <cuda_profiler_api.h>
 
 #include "GPUReconstructionCUDA.h"
@@ -51,11 +56,14 @@ GPUReconstructionCUDABackend::GPUReconstructionCUDABackend(const GPUSettingsDevi
 {
   if (mMaster == nullptr) {
     mInternals = new GPUReconstructionCUDAInternals;
+    *mParDevice = o2::gpu::internal::GPUDefParametersLoad();
   }
+  mDeviceBackendSettings.deviceType = DeviceType::CUDA;
 }
 
 GPUReconstructionCUDABackend::~GPUReconstructionCUDABackend()
 {
+  Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA
   if (mMaster == nullptr) {
     delete mInternals;
   }
@@ -69,7 +77,6 @@ int32_t GPUReconstructionCUDABackend::GPUChkErrInternal(const int64_t error, con
 
 GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg)
 {
-  mDeviceBackendSettings.deviceType = DeviceType::CUDA;
 #ifndef __HIPCC__ // CUDA
   mRtcSrcExtension = ".cu";
   mRtcBinExtension = ".fatbin";
@@ -78,11 +85,7 @@ GPUReconstructionCUDA::GPUReconstructionCUDA(const GPUSettingsDeviceBackend& cfg
   mRtcBinExtension = ".o";
 #endif
 }
-
-GPUReconstructionCUDA::~GPUReconstructionCUDA()
-{
-  Exit(); // Make sure we destroy everything (in particular the ITS tracker) before we exit CUDA
-}
+GPUReconstructionCUDA::~GPUReconstructionCUDA() {} // Empty: Exit() is now called from ~GPUReconstructionCUDABackend() instead. NOTE(review): if that class is a base of this one, Exit() runs after this object's derived part is destroyed, so virtual calls inside it no longer reach GPUReconstructionCUDA overrides — confirm this is fine.
 
 GPUReconstruction* GPUReconstruction_Create_CUDA(const GPUSettingsDeviceBackend& cfg) { return new GPUReconstructionCUDA(cfg); }