GPU: Unify creation of list of kernel names and kernel numbers

davidrohr · davidrohr · commit 95ae41ee037a · 2025-03-26T19:59:45.000+01:00
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx b/GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -124,3 +124,39 @@ std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProc
 
 gpu_reconstruction_kernels::threadContext::threadContext() = default;
 gpu_reconstruction_kernels::threadContext::~threadContext() = default;
+
+template <class T, int32_t I>
+uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k) // Returns the number stored for the kernel identified by <T, I>
+{
+  static int32_t num = k; // One static per <T, I> instantiation: only the first call's k is stored, k of later calls is ignored
+  if (num < 0) { // Negative means the first call did not supply a number; the value stays negative, so later calls throw as well
+    throw std::runtime_error("Internal Error - Kernel Number not Set");
+  }
+  return num;
+}
+
+namespace o2::gpu::internal
+{
+static std::vector<std::string> initKernelNames() // Builds the list of kernel names and, as a side effect, hands each kernel its number
+{
+  std::vector<std::string> retVal;
+#define GPUCA_KRNL(x_class, ...)                                                            \
+  GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(retVal.size()); \
+  retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)));
+#include "GPUReconstructionKernelList.h" // Presumably invokes GPUCA_KRNL once per kernel: the current size is passed to GetKernelNum, then the name is appended at that index
+#undef GPUCA_KRNL
+  return retVal;
+}
+} // namespace o2::gpu::internal
+
+const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames(); // This initializer is also what passes the kernel numbers to GetKernelNum
+
+#define GPUCA_KRNL(x_class, ...)                                                                        \
+  template uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t); \
+  template <>                                                                                           \
+  const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>()              \
+  {                                                                                                     \
+    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                                     \
+  }
+#include "GPUReconstructionKernelList.h" // With the macro above, each GPUCA_KRNL entry explicitly instantiates GetKernelNum and specializes GetKernelName to return the stringified kernel name
+#undef GPUCA_KRNL
diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h
@@ -74,7 +74,10 @@ class GPUReconstructionProcessing : public GPUReconstruction
 
   // Interface to query name of a kernel
   template <class T, int32_t I>
-  constexpr static const char* GetKernelName();
+  static const char* GetKernelName();
+  const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; } // Name by index into mKernelNames; i is not range-checked
+  template <class T, int32_t I = 0>
+  static uint32_t GetKernelNum(int32_t k = -1); // Number of the kernel identified by <T, I>; k defaults to -1 when omitted
 
   // Public queries for timers
   auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
@@ -100,6 +103,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
   GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
   using deviceEvent = gpu_reconstruction_kernels::deviceEvent;
 
+  static const std::vector<std::string> mKernelNames;
+
   int32_t mActiveHostKernelThreads = 0;  // Number of currently active threads on the host for kernels
   uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop
 
@@ -174,15 +179,6 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu
   return timer->timer[num];
 }
 
-#define GPUCA_KRNL(x_class, ...)                                                                     \
-  template <>                                                                                        \
-  constexpr const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
-  {                                                                                                  \
-    return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class));                                                  \
-  }
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-
 } // namespace o2::gpu
 
 #endif
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -610,25 +610,25 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
     GPUChkErr(cuOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocks, *mInternals->kernelFunctions[i], threads, 0));
     GPUChkErr(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, *mInternals->kernelFunctions[i]));
     GPUChkErr(cuFuncGetAttribute(&sMem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *mInternals->kernelFunctions[i]));
-    GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", mInternals->kernelNames[i].c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
+    GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", GetKernelName(i).c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
   }
 }
 
 void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
 {
   uint32_t j = 0;
 #define GPUCA_KRNL(x_class, ...)                                                                                                                                                        \
-  getRTCkernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size());                                                                                                  \
+  if (GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>() != j) {                                                                                                                            \
+    GPUFatal("kernel numbers out of sync");                                                                                                                                             \
+  }                                                                                                                                                                                     \
   mInternals->kernelFunctions.emplace_back(new CUfunction);                                                                                                                             \
-  mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))));                                                                                    \
   if (mProcessingSettings.debugLevel >= 3) {                                                                                                                                            \
     GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j);                                                                              \
   }                                                                                                                                                                                     \
   GPUChkErr(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \
   j++;
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-
   if (j != mInternals->kernelModules.size()) {
     GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
   }
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -46,11 +46,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
   void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
   template <class T, int32_t I = 0>
   gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
-  template <class T, int32_t I>
-  class backendInternal;
-
-  template <class T, int32_t I = 0>
-  static int32_t getRTCkernelNum(int32_t k = -1);
 
   void getRTCKernelCalls(std::vector<std::string>& kernels);
 
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -30,7 +30,6 @@ namespace o2::gpu
 struct GPUReconstructionCUDAInternals {
   std::vector<std::unique_ptr<CUmodule>> kernelModules;     // module for RTC compilation
   std::vector<std::unique_ptr<CUfunction>> kernelFunctions; // vector of ptrs to RTC kernels
-  std::vector<std::string> kernelNames;                     // names of kernels
   cudaStream_t Streams[GPUCA_MAX_STREAMS];                  // Pointer to array of CUDA Streams
 
   static void getArgPtrs(const void** pArgs) {}
diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -55,7 +55,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet
 #endif
     pArgs[arg_offset] = &y.index;
     GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);
-    GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
+    GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[GetKernelNum<T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
   }
 }
 
@@ -111,20 +111,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 
-template <class T, int32_t I>
-int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k)
-{
-  static int32_t num = k;
-  if (num < 0) {
-    throw std::runtime_error("Invalid kernel");
-  }
-  return num;
-}
-
-#define GPUCA_KRNL(x_class, ...) template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k);
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-
 void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
 {
 #define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime()
       clReleaseMemObject(mInternals->mem_gpu);
       clReleaseMemObject(mInternals->mem_constant);
       for (uint32_t i = 0; i < mInternals->kernels.size(); i++) {
-        clReleaseKernel(mInternals->kernels[i].first);
+        clReleaseKernel(mInternals->kernels[i]);
       }
       mInternals->kernels.clear();
     }
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -56,8 +56,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase
 
   template <class T, int32_t I = 0>
   int32_t AddKernel();
-  template <class T, int32_t I = 0>
-  uint32_t FindKernel();
   template <class T, int32_t I = 0, typename... Args>
   void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
   template <class T, int32_t I = 0>
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLIncludesHost.h
@@ -49,7 +49,7 @@ struct GPUReconstructionOCLInternals {
   cl_mem mem_host;
   cl_program program;
 
-  std::vector<std::pair<cl_kernel, std::string>> kernels;
+  std::vector<cl_kernel> kernels;
 };
 } // namespace o2::gpu
 
diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -58,20 +58,6 @@ void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
   std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
 }
 
-template <class T, int32_t I>
-inline uint32_t GPUReconstructionOCLBackend::FindKernel()
-{
-  std::string name(GetKernelName<T, I>());
-
-  for (uint32_t k = 0; k < mInternals->kernels.size(); k++) {
-    if (mInternals->kernels[k].second == name) {
-      return (k);
-    }
-  }
-  GPUError("Could not find OpenCL kernel %s", name.c_str());
-  throw ::std::runtime_error("Requested unsupported OpenCL kernel");
-}
-
 template <class T, int32_t I>
 int32_t GPUReconstructionOCLBackend::AddKernel()
 {
@@ -84,15 +70,14 @@ int32_t GPUReconstructionOCLBackend::AddKernel()
     GPUError("Error creating OPENCL Kernel: %s", name.c_str());
     return 1;
   }
-  mInternals->kernels.emplace_back(krnl, name);
+  mInternals->kernels.emplace_back(krnl);
   return 0;
 }
 
 template <class S, class T, int32_t I>
 S& GPUReconstructionOCLBackend::getKernelObject()
 {
-  static uint32_t krnl = FindKernel<T, I>();
-  return mInternals->kernels[krnl].first;
+  return mInternals->kernels[GetKernelNum<T, I>()]; // Indexed directly by kernel number; NOTE(review): assumes kernels were added in kernel-number order - confirm in AddKernels
 }
 
 int32_t GPUReconstructionOCLBackend::AddKernels()

Original file line number	Diff line number	Diff line change
`@@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime()`
`399`	`399`	`clReleaseMemObject(mInternals->mem_gpu);`
`400`	`400`	`clReleaseMemObject(mInternals->mem_constant);`
`401`	`401`	`for (uint32_t i = 0; i < mInternals->kernels.size(); i++) {`
`402`		`- clReleaseKernel(mInternals->kernels[i].first);`
	`402`	`+ clReleaseKernel(mInternals->kernels[i]);`
`403`	`403`	`}`
`404`	`404`	`mInternals->kernels.clear();`
`405`	`405`	`}`