1 change: 1 addition & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -273,6 +273,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
if (mProcessingSettings.createO2Output > 1) {
mProcessingSettings.createO2Output = 1;
}
mProcessingSettings.rtc.deterministic = 1;
}
if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
mProcessingSettings.nTPCClustererLanes = 1;
76 changes: 53 additions & 23 deletions GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -57,17 +57,24 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr
}
}

namespace o2::gpu
{
namespace // anonymous
uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
{
static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
} // anonymous namespace
} // namespace o2::gpu
if (condition && mProcessingSettings.inKernelParallel != 1) {
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
} else {
mNActiveThreadsOuterLoop = 1;
}
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
}
return mNActiveThreadsOuterLoop;
}

std::atomic_flag GPUReconstructionProcessing::mTimerFlag = ATOMIC_FLAG_INIT;

GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
{
while (timerFlag.test_and_set()) {
while (mTimerFlag.test_and_set()) {
}
if (mTimers.size() <= id) {
mTimers.resize(id + 1);
@@ -81,20 +88,20 @@ GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer
mTimers[id]->count++;
}
timerMeta* retVal = mTimers[id].get();
timerFlag.clear();
mTimerFlag.clear();
return retVal;
}

GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::getTimerById(uint32_t id, bool increment)
{
timerMeta* retVal = nullptr;
while (timerFlag.test_and_set()) {
while (mTimerFlag.test_and_set()) {
}
if (mTimers.size() > id && mTimers[id]) {
retVal = mTimers[id].get();
retVal->count += increment;
}
timerFlag.clear();
mTimerFlag.clear();
return retVal;
}

@@ -104,23 +111,46 @@ uint32_t GPUReconstructionProcessing::getNextTimerId()
return id.fetch_add(1);
}

uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
{
if (condition && mProcessingSettings.inKernelParallel != 1) {
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
} else {
mNActiveThreadsOuterLoop = 1;
}
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
}
return mNActiveThreadsOuterLoop;
}

std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProcessing::GetThreadContext()
{
return std::make_unique<gpu_reconstruction_kernels::threadContext>();
}

gpu_reconstruction_kernels::threadContext::threadContext() = default;
gpu_reconstruction_kernels::threadContext::~threadContext() = default;

template <class T, int32_t I>
uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k)
{
static int32_t num = k;
if (num < 0) {
throw std::runtime_error("Internal Error - Kernel Number not Set");
}
return num;
}

namespace o2::gpu::internal
{
static std::vector<std::string> initKernelNames()
{
std::vector<std::string> retVal;
#define GPUCA_KRNL(x_class, ...) \
GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(retVal.size()); \
retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)));
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
return retVal;
}
} // namespace o2::gpu::internal

const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames();

#define GPUCA_KRNL(x_class, ...) \
template uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t); \
template <> \
const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
{ \
return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \
}
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
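
A note on the pattern introduced above: GetKernelNum<T, I>(k) latches the first non-negative k it receives into a function-local static and only reads it back on later calls, while initKernelNames() registers every kernel from GPUReconstructionKernelList.h with the index equal to the current size of the name vector, so kernel numbers and kernel names are assigned in the same order. Below is a minimal standalone sketch of that registration idea; KernelA, KernelB and registerKernels are made-up stand-ins, not names from this codebase.

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Stand-ins for two kernel classes from the kernel list.
struct KernelA {
};
struct KernelB {
};

// First call with k >= 0 latches the kernel number into a function-local static;
// later calls (default k = -1) only read it back.
template <class T, int I = 0>
unsigned int GetKernelNum(int k = -1)
{
  static int num = k;
  if (num < 0) {
    throw std::runtime_error("kernel number not set");
  }
  return static_cast<unsigned int>(num);
}

// Mirrors initKernelNames(): each kernel receives the index equal to the size of
// the name vector at registration time, so numbers and names stay in sync.
static std::vector<std::string> registerKernels()
{
  std::vector<std::string> names;
  GetKernelNum<KernelA>(static_cast<int>(names.size()));
  names.emplace_back("KernelA");
  GetKernelNum<KernelB>(static_cast<int>(names.size()));
  names.emplace_back("KernelB");
  return names;
}

int main()
{
  const std::vector<std::string> names = registerKernels();
  // Any later lookup goes through the latched number, e.g. to index the name table.
  return names[GetKernelNum<KernelB>()] == "KernelB" ? 0 : 1;
}
```
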
19 changes: 9 additions & 10 deletions GPU/GPUTracking/Base/GPUReconstructionProcessing.h
@@ -20,6 +20,7 @@

#include "utils/timer.h"
#include <functional>
#include <atomic>

namespace o2::gpu
{
@@ -74,7 +75,10 @@ class GPUReconstructionProcessing : public GPUReconstruction

// Interface to query name of a kernel
template <class T, int32_t I>
constexpr static const char* GetKernelName();
static const char* GetKernelName();
const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; }
template <class T, int32_t I = 0>
static uint32_t GetKernelNum(int32_t k = -1);

// Public queries for timers
auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
@@ -100,6 +104,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
using deviceEvent = gpu_reconstruction_kernels::deviceEvent;

static const std::vector<std::string> mKernelNames;

int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop

@@ -130,6 +136,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
uint32_t getNextTimerId();
timerMeta* getTimerById(uint32_t id, bool increment = true);
timerMeta* insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step);

static std::atomic_flag mTimerFlag;
};

template <class T>
@@ -174,15 +182,6 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu
return timer->timer[num];
}

#define GPUCA_KRNL(x_class, ...) \
template <> \
constexpr const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
{ \
return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \
}
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

} // namespace o2::gpu

#endif
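
The timer bookkeeping above is serialized with mTimerFlag acting as a spin lock: a busy loop on test_and_set() acquires it and clear() releases it, which is what allowed the former file-local timerFlag to move into the class. A small self-contained sketch of that locking pattern; lockFlag, counter and addOne are hypothetical names used only for illustration.

```cpp
#include <atomic>
#include <thread>
#include <vector>

// Spin lock in the style of mTimerFlag: test_and_set() to acquire, clear() to release.
static std::atomic_flag lockFlag = ATOMIC_FLAG_INIT;
static int counter = 0; // shared state protected by lockFlag

static void addOne()
{
  while (lockFlag.test_and_set()) {
    // busy-wait until the previous holder calls clear()
  }
  ++counter;
  lockFlag.clear();
}

int main()
{
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([] {
      for (int j = 0; j < 1000; ++j) {
        addOne();
      }
    });
  }
  for (std::thread& t : workers) {
    t.join();
  }
  return counter == 4000 ? 0 : 1; // all increments survive because access is serialized
}
```
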
8 changes: 4 additions & 4 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -610,25 +610,25 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
GPUChkErr(cuOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocks, *mInternals->kernelFunctions[i], threads, 0));
GPUChkErr(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, *mInternals->kernelFunctions[i]));
GPUChkErr(cuFuncGetAttribute(&sMem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *mInternals->kernelFunctions[i]));
GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", mInternals->kernelNames[i].c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", GetKernelName(i).c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
}
}

void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
{
uint32_t j = 0;
#define GPUCA_KRNL(x_class, ...) \
getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
if (GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>() != j) { \
GPUFatal("kernel numbers out of sync"); \
} \
mInternals->kernelFunctions.emplace_back(new CUfunction); \
mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class)))); \
if (mProcessingSettings.debugLevel >= 3) { \
GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \
} \
GPUChkErr(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \
j++;
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

if (j != mInternals->kernelModules.size()) {
GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
}
5 changes: 0 additions & 5 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -46,11 +46,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I = 0>
gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
template <class T, int32_t I>
class backendInternal;

template <bool multi, class T, int32_t I = 0>
static int32_t getRTCkernelNum(int32_t k = -1);

void getRTCKernelCalls(std::vector<std::string>& kernels);

13 changes: 10 additions & 3 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -31,11 +31,12 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command);
QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch);
QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math);

#include "GPUNoFastMathKernels.h"

int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
{
std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
std::string(mProcessingSettings.rtc.deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n") +
GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr);
if (filename == "") {
filename = "/tmp/o2cagpu_rtc_";
@@ -54,7 +55,6 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
baseCommand += mProcessingSettings.rtc.deterministic ? (std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len)) : std::string("");

char shasource[21], shaparam[21], shacmd[21], shakernels[21];
if (mProcessingSettings.rtc.cacheOutput) {
@@ -169,13 +169,20 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
kernel += "}";

if (fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end();
const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");

if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
throw std::runtime_error("Error writing file");
}
fclose(fp);
std::string command = baseCommand;
if (deterministic) {
command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
}
command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension;
if (mProcessingSettings.debugLevel < 0) {
command += " &> /dev/null";
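
genRTC() now chooses the deterministic build per kernel: a kernel is compiled without fast math if the global rtc.deterministic setting is on or if its name appears in o2::gpu::internal::noFastMathKernels, and the same flag selects which branch of GPUCA_DETERMINISTIC_CODE is defined into the generated source. A reduced sketch of that selection logic follows; the set contents and the literal "-fno-fast-math" are placeholders, since the real flags come from the embedded no_fast_math command fragment.

```cpp
#include <set>
#include <string>

// Kernels that always need the deterministic build, even when the global setting is off.
// The entry here is a placeholder for the generated GPUNoFastMathKernels.h list.
static const std::set<std::string> noFastMathKernels = {"ExampleKernel"};

static std::string buildCompileCommand(const std::string& baseCommand, const std::string& kernelName,
                                        bool deterministicSetting)
{
  // Same decision as in genRTC(): global setting OR per-kernel opt-in.
  const bool deterministic = deterministicSetting || noFastMathKernels.count(kernelName) != 0;
  std::string command = baseCommand;
  if (deterministic) {
    command += " -fno-fast-math"; // placeholder for the embedded no_fast_math command fragment
  }
  return command;
}

int main()
{
  // Only the listed kernel picks up the extra flag when the global setting is off.
  const bool flagged = buildCompileCommand("compiler", "ExampleKernel", false).find("fno-fast-math") != std::string::npos;
  const bool plain = buildCompileCommand("compiler", "OtherKernel", false).find("fno-fast-math") == std::string::npos;
  return (flagged && plain) ? 0 : 1;
}
```
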
GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -30,7 +30,6 @@ namespace o2::gpu
struct GPUReconstructionCUDAInternals {
std::vector<std::unique_ptr<CUmodule>> kernelModules; // module for RTC compilation
std::vector<std::unique_ptr<CUfunction>> kernelFunctions; // vector of ptrs to RTC kernels
std::vector<std::string> kernelNames; // names of kernels
cudaStream_t Streams[GPUCA_MAX_STREAMS]; // Pointer to array of CUDA Streams

static void getArgPtrs(const void** pArgs) {}
18 changes: 1 addition & 17 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -55,7 +55,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet
#endif
pArgs[arg_offset] = &y.index;
GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);
GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[GetKernelNum<T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
}
}

@@ -111,22 +111,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

template <bool multi, class T, int32_t I>
int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k)
{
static int32_t num = k;
if (num < 0) {
throw std::runtime_error("Invalid kernel");
}
return num;
}

#define GPUCA_KRNL(x_class, ...) \
template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k); \
template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<true, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
{
#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime()
clReleaseMemObject(mInternals->mem_gpu);
clReleaseMemObject(mInternals->mem_constant);
for (uint32_t i = 0; i < mInternals->kernels.size(); i++) {
clReleaseKernel(mInternals->kernels[i].first);
clReleaseKernel(mInternals->kernels[i]);
}
mInternals->kernels.clear();
}
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -56,8 +56,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase

template <class T, int32_t I = 0>
int32_t AddKernel();
template <class T, int32_t I = 0>
uint32_t FindKernel();
template <class T, int32_t I = 0, typename... Args>
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I = 0>
GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h
@@ -49,7 +49,7 @@ struct GPUReconstructionOCLInternals {
cl_mem mem_host;
cl_program program;

std::vector<std::pair<cl_kernel, std::string>> kernels;
std::vector<cl_kernel> kernels;
};
} // namespace o2::gpu

19 changes: 2 additions & 17 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -58,20 +58,6 @@ void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
}

template <class T, int32_t I>
inline uint32_t GPUReconstructionOCLBackend::FindKernel()
{
std::string name(GetKernelName<T, I>());

for (uint32_t k = 0; k < mInternals->kernels.size(); k++) {
if (mInternals->kernels[k].second == name) {
return (k);
}
}
GPUError("Could not find OpenCL kernel %s", name.c_str());
throw ::std::runtime_error("Requested unsupported OpenCL kernel");
}

template <class T, int32_t I>
int32_t GPUReconstructionOCLBackend::AddKernel()
{
@@ -84,15 +70,14 @@ int32_t GPUReconstructionOCLBackend::AddKernel()
GPUError("Error creating OPENCL Kernel: %s", name.c_str());
return 1;
}
mInternals->kernels.emplace_back(krnl, name);
mInternals->kernels.emplace_back(krnl);
return 0;
}

template <class S, class T, int32_t I>
S& GPUReconstructionOCLBackend::getKernelObject()
{
static uint32_t krnl = FindKernel<T, I>();
return mInternals->kernels[krnl].first;
return mInternals->kernels[GetKernelNum<T, I>()];
}

int32_t GPUReconstructionOCLBackend::AddKernels()
4 changes: 4 additions & 0 deletions GPU/GPUTracking/CMakeLists.txt
@@ -242,6 +242,10 @@ file(GENERATE
OUTPUT include_gpu_onthefly/GPUReconstructionIncludesDeviceAll.h
INPUT Base/GPUReconstructionIncludesDeviceAll.template.h
)
file(GENERATE
OUTPUT include_gpu_onthefly/GPUNoFastMathKernels.h
INPUT cmake/GPUNoFastMathKernels.template.h
)
if(NOT ALIGPU_BUILD_TYPE STREQUAL "O2")
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly)
endif()