GPU: Get rid of backendInternal additional wrapper

davidrohr · davidrohr · commit 52c23287f6ab · 2025-04-21T16:07:10.000+02:00
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -54,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
 }
 
 template <class T, int32_t I, typename... Args>
-inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
+inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
 {
   auto& x = _xyz.x;
   auto& y = _xyz.y;
@@ -88,7 +88,7 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime&
 }
 
 template <>
-inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
+inline void GPUReconstructionCPU::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
   int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
   if (nThreads > 1) {
@@ -108,17 +108,6 @@ inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(con
   }
 }
 
-template <class T, int32_t I, typename... Args>
-void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
-{
-#pragma GCC diagnostic push
-#if defined(__clang__)
-#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below
-#endif
-  std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
-#pragma GCC diagnostic push
-}
-
 template <class S, int32_t I>
 GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
 {
@@ -137,8 +126,7 @@ GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelPrope
   return ret;
 }
 
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...)                                                                                           \
-  template void GPUReconstructionCPU::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
   template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -40,7 +40,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
   template <class S, int32_t I = 0>
   krnlProperties getKernelProperties(int gpu = -1);
   template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
+  void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);
 
   virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false);
   int32_t GPUStuck() { return mGPUStuck; }
@@ -59,9 +59,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
 
   GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface<GPUReconstructionCPU, GPUReconstructionProcessing>(cfg) {}
 
-  template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
-
   int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; }
   int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; }
 
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc b/GPU/GPUTracking/Base/GPUReconstructionProcessingKernels.inc
@@ -21,21 +21,28 @@
 namespace o2::gpu
 {
 
+#pragma GCC diagnostic push
+#if defined(__clang__)
+#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not always captured below
+#endif
+
 template <class T, class S>
 void GPUReconstructionProcessing::KernelInterface<T, S>::runKernelVirtual(const int num, const void* args)
 {
   switch (num) { // clang-format off
 #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num)                             \
     case x_num: {                                                                                             \
       const auto& args2 = *(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>*)args; \
-      ((T*)this)->template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args2);                                    \
+      std::apply([this, &args2](auto&... vals) { ((T*)this)->template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>(args2.s, vals...); }, args2.v); \
       break;                                                                                                  \
     }
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
   } // clang-format on
 }
 
+#pragma GCC diagnostic pop // restore the diagnostic state saved by the push above
+
 } // namespace o2::gpu
 
 #endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -24,7 +24,8 @@
 #include "GPUParamRTC.h"
 #include "GPUReconstructionCUDAHelpers.inc"
 #include "GPUDefParametersLoad.inc"
-#include "GPUReconstructionProcessingKernels.inc"
+#include "GPUReconstructionKernelIncludes.h"
+#include "GPUConstantMem.h"
 
 #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1
 #include "utils/qGetLdBinarySymbols.h"
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -45,17 +45,16 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac
   virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override;
 
   template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
+  void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);
+  template <class T, int32_t I = 0, typename... Args>
+  void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args);
 
   template <class T, class S>
   friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
 
  protected:
   GPUReconstructionCUDAInternals* mInternals;
 
-  template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
-
   int32_t InitDevice_Runtime() override;
   int32_t ExitDevice_Runtime() override;
 
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -23,13 +23,15 @@ using namespace o2::gpu;
 #include "GPUReconstructionIncludesDeviceAll.h"
 
 #include "GPUReconstructionCUDAKernelsSpecialize.inc"
+#include "GPUReconstructionProcessingKernels.inc"
+template void GPUReconstructionProcessing::KernelInterface<GPUReconstructionCUDA, GPUReconstructionDeviceBase>::runKernelVirtual(const int num, const void* args);
 
 #if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM)
 __global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; }
 #endif
 
 template <class T, int32_t I, typename... Args>
-inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
+inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args)
 {
 #if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1
   if (!GetProcessingSettings().rtc.enable) {
@@ -52,18 +54,18 @@ inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime&
 }
 
 template <class T, int32_t I, typename... Args>
-void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
+inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
 {
-  auto& x = args.s.x;
-  auto& z = args.s.z;
+  auto& x = _xyz.x;
+  auto& z = _xyz.z;
   if (z.evList) {
     for (int32_t k = 0; k < z.nEvents; k++) {
       GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));
     }
   }
   {
-    GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this);
-    std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
+    GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, _xyz, this);
+    runKernelBackendTimed<T, I, Args...>(_xyz, args...);
   }
   GPUChkErr(cudaGetLastError());
   if (z.ev) {
@@ -74,31 +76,29 @@ void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs<T, I, Args...>&
 #undef GPUCA_KRNL_REG
 #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
 
-#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 // ---------- COMPILE_MODE = perkernel ----------
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);
-#else // ---------- COMPILE_MODE = onefile | rdc ----------
-#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2
-#define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc
-#endif
-
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...)        \
-  GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \
-  template void GPUReconstructionCUDA::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);
-
-#ifndef __HIPCC__ // CUDA version
-#define GPUCA_KRNL_CALL(x_class, ...) \
-  GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<<x.nBlocks, x.nThreads, 0, me->mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...);
-#else // HIP version
-#undef GPUCA_KRNL_CUSTOM
-#define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args)
-#define GPUCA_KRNL_CALL(x_class, ...) \
-  hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...);
-#endif // __HIPCC__
-
+// clang-format off
+#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = onefile | rdc ----------
+  #if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2
+    #define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc
+  #endif
+
+  #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
+    GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__)
+
+  #ifndef __HIPCC__ // CUDA version
+    #define GPUCA_KRNL_CALL(x_class, ...) \
+      GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<<x.nBlocks, x.nThreads, 0, me->mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...);
+  #else // HIP version
+    #undef GPUCA_KRNL_CUSTOM
+    #define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args)
+    #define GPUCA_KRNL_CALL(x_class, ...) \
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...);
+  #endif // __HIPCC__
+
+  #include "GPUReconstructionKernelList.h"
+  #undef GPUCA_KRNL
 #endif // ---------- COMPILE_MODE = onefile | rdc ----------
-
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
+// clang-format on
 
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc
@@ -97,7 +97,7 @@ struct GPUTPCGMO2OutputSort_comp {
 } // namespace o2::gpu::internal
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
 {
   if (cmpMax) {
     GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax());
@@ -107,32 +107,32 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInter
 }
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
 {
   GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
 }
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
 {
   GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
 }
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
 {
   GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
 }
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
 {
   GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());
 }
 #endif // GPUCA_SPECIALIZE_THRUST_SORTS
 
 template <>
-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
   GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));
 }
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -13,8 +13,8 @@
 /// \author David Rohr
 
 #include "GPUReconstructionOCLIncludesHost.h"
-#include "GPUReconstructionProcessingKernels.inc"
 #include "GPUDefParametersLoad.inc"
+#include "GPUConstantMem.h"
 
 #include <map>
 
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -34,7 +34,7 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
   ~GPUReconstructionOCL() override;
 
   template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
+  void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);
 
  protected:
   int32_t InitDevice_Runtime() override;
@@ -57,8 +57,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
 
   template <class T, int32_t I = 0>
   int32_t AddKernel();
-  template <class T, int32_t I = 0, typename... Args>
-  void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
 
   GPUReconstructionOCLInternals* mInternals;
   float mOclVersion;
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -16,9 +16,11 @@
 #include "GPUReconstructionKernelIncludes.h"
 
 #include "GPUReconstructionOCLKernelsSpecialize.inc"
+#include "GPUReconstructionProcessingKernels.inc"
+template void GPUReconstructionProcessing::KernelInterface<GPUReconstructionOCL, GPUReconstructionDeviceBase>::runKernelVirtual(const int num, const void* args);
 
 template <class T, int32_t I, typename... Args>
-inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
+inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
 {
   cl_kernel k = getKernelObject<cl_kernel, T, I>();
   auto& x = _xyz.x;
@@ -48,12 +50,6 @@ inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime&
   }
 }
 
-template <class T, int32_t I, typename... Args>
-void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
-{
-  std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
-}
-
 template <class T, int32_t I>
 int32_t GPUReconstructionOCL::AddKernel()
 {
@@ -86,7 +82,3 @@ int32_t GPUReconstructionOCL::AddKernels()
 #undef GPUCA_KRNL
   return 0;
 }
-
-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernelsSpecialize.inc
@@ -13,7 +13,7 @@
 /// \author David Rohr
 
 template <>
-inline void GPUReconstructionOCL::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
+inline void GPUReconstructionOCL::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
   cl_int4 val0 = {0, 0, 0, 0};
   GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList<cl_event>(), _xyz.z.ev->getEventList<cl_event>()));

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()`
`54`	`54`	`}`
`55`	`55`
`56`	`56`	`template <class T, int32_t I, typename... Args>`
`57`		`-inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)`
	`57`	`+inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)`
`58`	`58`	`{`
`59`	`59`	`auto& x = _xyz.x;`
`60`	`60`	`auto& y = _xyz.y;`
`@@ -88,7 +88,7 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime&`
`88`	`88`	`}`
`89`	`89`
`90`	`90`	`template <>`
`91`		`-inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)`
	`91`	`+inline void GPUReconstructionCPU::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)`
`92`	`92`	`{`
`93`	`93`	`int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));`
`94`	`94`	`if (nThreads > 1) {`
`@@ -108,17 +108,6 @@ inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(con`
`108`	`108`	`}`
`109`	`109`	`}`
`110`	`110`
`111`		`-template <class T, int32_t I, typename... Args>`
`112`		`-void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)`
`113`		`-{`
`114`		`-#pragma GCC diagnostic push`
`115`		`-#if defined(__clang__)`
`116`		`-#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not alway captured below`
`117`		`-#endif`
`118`		`- std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);`
`119`		`-#pragma GCC diagnostic push`
`120`		`-}`
`121`		`-`
`122`	`111`	`template <class S, int32_t I>`
`123`	`112`	`GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)`
`124`	`113`	`{`
`@@ -137,8 +126,7 @@ GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelPrope`
`137`	`126`	`return ret;`
`138`	`127`	`}`
`139`	`128`
`140`		`-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \`
`141`		`- template void GPUReconstructionCPU::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \`
	`129`	`+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \`
`142`	`130`	`template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);`
`143`	`131`	`#include "GPUReconstructionKernelList.h"`
`144`	`132`	`#undef GPUCA_KRNL`
Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ struct GPUTPCGMO2OutputSort_comp {`
`97`	`97`	`} // namespace o2::gpu::internal`
`98`	`98`
`99`	`99`	`template <>`
`100`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)`
	`100`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)`
`101`	`101`	`{`
`102`	`102`	`if (cmpMax) {`
`103`	`103`	`GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax());`
`@@ -107,32 +107,32 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInter`
`107`	`107`	`}`
`108`	`108`
`109`	`109`	`template <>`
`110`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)`
	`110`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)`
`111`	`111`	`{`
`112`	`112`	`GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));`
`113`	`113`	`}`
`114`	`114`
`115`	`115`	`template <>`
`116`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)`
	`116`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)`
`117`	`117`	`{`
`118`	`118`	`GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));`
`119`	`119`	`}`
`120`	`120`
`121`	`121`	`template <>`
`122`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)`
	`122`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)`
`123`	`123`	`{`
`124`	`124`	`GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());`
`125`	`125`	`}`
`126`	`126`
`127`	`127`	`template <>`
`128`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)`
	`128`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)`
`129`	`129`	`{`
`130`	`130`	`GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());`
`131`	`131`	`}`
`132`	`132`	`#endif // GPUCA_SPECIALIZE_THRUST_SORTS`
`133`	`133`
`134`	`134`	`template <>`
`135`		`-inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)`
	`135`	`+inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)`
`136`	`136`	`{`
`137`	`137`	`GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));`
`138`	`138`	`}`
Original file line number	Diff line number	Diff line change
`@@ -16,9 +16,11 @@`
`16`	`16`	`#include "GPUReconstructionKernelIncludes.h"`
`17`	`17`
`18`	`18`	`#include "GPUReconstructionOCLKernelsSpecialize.inc"`
	`19`	`+#include "GPUReconstructionProcessingKernels.inc"`
	`20`	`+template void GPUReconstructionProcessing::KernelInterface<GPUReconstructionOCL, GPUReconstructionDeviceBase>::runKernelVirtual(const int num, const void* args);`
`19`	`21`
`20`	`22`	`template <class T, int32_t I, typename... Args>`
`21`		`-inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)`
	`23`	`+inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)`
`22`	`24`	`{`
`23`	`25`	`cl_kernel k = getKernelObject<cl_kernel, T, I>();`
`24`	`26`	`auto& x = _xyz.x;`
`@@ -48,12 +50,6 @@ inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime&`
`48`	`50`	`}`
`49`	`51`	`}`
`50`	`52`
`51`		`-template <class T, int32_t I, typename... Args>`
`52`		`-void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)`
`53`		`-{`
`54`		`- std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);`
`55`		`-}`
`56`		`-`
`57`	`53`	`template <class T, int32_t I>`
`58`	`54`	`int32_t GPUReconstructionOCL::AddKernel()`
`59`	`55`	`{`
`@@ -86,7 +82,3 @@ int32_t GPUReconstructionOCL::AddKernels()`
`86`	`82`	`#undef GPUCA_KRNL`
`87`	`83`	`return 0;`
`88`	`84`	`}`
`89`		`-`
`90`		`-#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);`
`91`		`-#include "GPUReconstructionKernelList.h"`
`92`		`-#undef GPUCA_KRNL`