AliceO2Group · davidrohr · Mar 28, 2025 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025
@@ -214,7 +214,7 @@ typedef GPUCommonAlgorithm CAAlgo;
 
 } // namespace o2::gpu
 
-#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)
+#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_HOSTONLY)
 
 #include "GPUCommonAlgorithmThrust.h"
 

@@ -15,6 +15,7 @@
 #ifndef GPUCOMMONALGORITHMTHRUST_H
 #define GPUCOMMONALGORITHMTHRUST_H
 
+#ifndef GPUCA_GPUCODE_COMPILEKERNELS
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wshadow"
 #include <thrust/sort.h>
@@ -25,14 +26,19 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonHelpers.h"
 
+#ifndef __HIPCC__ // CUDA
+#include <cub/cub.cuh>
+#else // HIP
+#include <hipcub/hipcub.hpp>
+#endif
+#endif // GPUCA_GPUCODE_COMPILEKERNELS
+
 #ifndef __HIPCC__ // CUDA
 #define GPUCA_THRUST_NAMESPACE thrust::cuda
 #define GPUCA_CUB_NAMESPACE cub
-#include <cub/cub.cuh>
 #else // HIP
 #define GPUCA_THRUST_NAMESPACE thrust::hip
 #define GPUCA_CUB_NAMESPACE hipcub
-#include <hipcub/hipcub.hpp>
 #endif
 
 namespace o2::gpu
@@ -90,6 +96,7 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
   thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
 }
 
+#ifndef GPUCA_GPUCODE_COMPILEKERNELS
 template <class T, class S>
 GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
 {
@@ -105,6 +112,8 @@ GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begi
   GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
 #endif
 }
+#endif // #ifndef GPUCA_GPUCODE_COMPILEKERNELS
+
 } // namespace o2::gpu
 
 #undef GPUCA_THRUST_NAMESPACE

@@ -193,12 +193,10 @@ void GPUParamRTC::setFrom(const GPUParam& param)
 
 std::string GPUParamRTC::generateRTCCode(const GPUParam& param, bool useConstexpr)
 {
-  return "#ifndef GPUCA_GPUCODE_DEVICE\n"
-         "#include <string>\n"
+  return "#include <string>\n"
          "#include <vector>\n"
          "#include <cstdint>\n"
          "#include <cstddef>\n"
-         "#endif\n"
          "namespace o2::gpu { class GPUDisplayFrontendInterface; }\n" +
          qConfigPrintRtc(std::make_tuple(&param.rec.tpc, &param.rec.trd, &param.rec, &param.par), useConstexpr);
 }

@@ -18,7 +18,7 @@ endif()
 message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")
 
 set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
-set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h)
+set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesSystem.h)
 # -------------------------------- Prepare RTC -------------------------------------------------------
 enable_language(ASM)
 if(ALIGPU_BUILD_TYPE STREQUAL "O2")
@@ -67,8 +67,8 @@ set(GPU_RTC_BIN ${CMAKE_CURRENT_BINARY_DIR}/GPUReconstructionCUDArtc)
 # cmake-format: off
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.src
-    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesHost.h ${GPU_RTC_BIN}.src
-    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
+    COMMAND cp ${GPUDIR}/Base/cuda/GPUReconstructionCUDAIncludesSystem.h ${GPU_RTC_BIN}.src
+    COMMAND ${CMAKE_CXX_COMPILER} ${GPU_RTC_DEFINES} ${GPU_RTC_INCLUDES} -std=c++${CMAKE_CUDA_STANDARD} -D__CUDA_ARCH__=${RTC_CUDA_ARCH} -D__CUDACC__ -x c++ -nostdinc -E -P ${GPU_RTC_SRC} >> ${GPU_RTC_BIN}.src
     MAIN_DEPENDENCY ${GPU_RTC_SRC}
     IMPLICIT_DEPENDS CXX ${GPU_RTC_SRC}
     COMMAND_EXPAND_LISTS
@@ -84,7 +84,7 @@ add_custom_target(${MODULE}_CUDA_SRC_CHK ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}
 
 add_custom_command(
     OUTPUT ${GPU_RTC_BIN}.command
-    COMMAND echo -n "${CMAKE_CUDA_COMPILER} -forward-unknown-to-host-compiler ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x cu -fatbin" > ${GPU_RTC_BIN}.command
+    COMMAND echo -n "${CMAKE_CUDA_COMPILER} -forward-unknown-to-host-compiler ${GPU_RTC_DEFINES} ${GPU_RTC_FLAGS_SEPARATED} -x cu -fatbin -Xcudafe --diag_suppress=177" > ${GPU_RTC_BIN}.command
     COMMAND_EXPAND_LISTS VERBATIM
     COMMENT "Preparing CUDA RTC command file ${GPU_RTC_BIN}.command"
 )

@@ -19,7 +19,8 @@
 #include "GPUDefParametersDefault.h"
 #include "GPUDefParametersLoad.inc"
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 #include <cuda_profiler_api.h>
 
 #include "GPUReconstructionCUDA.h"
@@ -113,7 +114,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
   constexpr int32_t reqVerMaj = 2;
   constexpr int32_t reqVerMin = 0;
 #endif
-  if (mProcessingSettings.rtc.enable && mProcessingSettings.rtc.runTest == 2) {
+  if (mProcessingSettings.rtc.enable && mProcessingSettings.rtctech.runTest == 2) {
     genAndLoadRTC();
     exit(0);
   }
@@ -432,14 +433,14 @@ void GPUReconstructionCUDA::genAndLoadRTC()
     throw std::runtime_error("Runtime compilation failed");
   }
   for (uint32_t i = 0; i < nCompile; i++) {
-    if (mProcessingSettings.rtc.runTest != 2) {
+    if (mProcessingSettings.rtctech.runTest != 2) {
       mInternals->kernelModules.emplace_back(std::make_unique<CUmodule>());
       GPUChkErr(cuModuleLoad(mInternals->kernelModules.back().get(), (filename + "_" + std::to_string(i) + mRtcBinExtension).c_str()));
     }
     remove((filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str());
     remove((filename + "_" + std::to_string(i) + mRtcBinExtension).c_str());
   }
-  if (mProcessingSettings.rtc.runTest == 2) {
+  if (mProcessingSettings.rtctech.runTest == 2) {
     return;
   }
   loadKernelModules(mProcessingSettings.rtc.compilePerKernel);

@@ -12,7 +12,8 @@
 /// \file GPUReconstructionCUDAExternalProvider.cu
 /// \author David Rohr
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 
 #include "GPUReconstructionCUDA.h"
 #include "GPUReconstructionCUDAInternals.h"

@@ -13,6 +13,10 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_HOSTONLY
+#define GPUCA_DEF_PARAMETERS_LOAD_DEFAULTS
+#include "GPUDefParametersDefault.h"
+#include "GPUDefParametersLoad.inc"
+
 #include "GPUReconstructionCUDA.h"
 #include "GPUParamRTC.h"
 #include "GPUDefMacros.h"
@@ -52,28 +56,33 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
     kernelsall += kernels[i] + "\n";
   }
 
-  std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
+  std::string baseCommand = (mProcessingSettings.rtctech.prependCommand != "" ? (mProcessingSettings.rtctech.prependCommand + " ") : "");
   baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
-  baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
+  baseCommand += std::string(" ") + (mProcessingSettings.rtctech.overrideArchitecture != "" ? mProcessingSettings.rtctech.overrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
+  const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true);
+  if (mProcessingSettings.rtctech.printLaunchBounds || mProcessingSettings.debugLevel >= 3) {
+    GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());
+  }
 
-  char shasource[21], shaparam[21], shacmd[21], shakernels[21];
+  char shasource[21], shaparam[21], shacmd[21], shakernels[21], shabounds[21];
   if (mProcessingSettings.rtc.cacheOutput) {
     o2::framework::internal::SHA1(shasource, _binary_GPUReconstructionCUDArtc_src_start, _binary_GPUReconstructionCUDArtc_src_len);
     o2::framework::internal::SHA1(shaparam, rtcparam.c_str(), rtcparam.size());
     o2::framework::internal::SHA1(shacmd, baseCommand.c_str(), baseCommand.size());
     o2::framework::internal::SHA1(shakernels, kernelsall.c_str(), kernelsall.size());
+    o2::framework::internal::SHA1(shabounds, launchBounds.c_str(), launchBounds.size());
   }
 
   nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1;
   bool cacheLoaded = false;
   int32_t fd = 0;
   if (mProcessingSettings.rtc.cacheOutput) {
-    if (mProcessingSettings.RTCcacheFolder != ".") {
-      std::filesystem::create_directories(mProcessingSettings.RTCcacheFolder);
+    if (mProcessingSettings.rtctech.cacheFolder != ".") {
+      std::filesystem::create_directories(mProcessingSettings.rtctech.cacheFolder);
     }
-    if (mProcessingSettings.rtc.cacheMutex) {
+    if (mProcessingSettings.rtctech.cacheMutex) {
       mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
-      fd = open((mProcessingSettings.RTCcacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
+      fd = open((mProcessingSettings.rtctech.cacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
       if (fd == -1) {
         throw std::runtime_error("Error opening rtc cache mutex lock file");
       }
@@ -83,45 +92,42 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       }
     }
 
-    FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "rb");
+    FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "rb");
     char sharead[20];
     if (fp) {
       size_t len;
       while (true) {
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shasource, 20)) {
-          GPUInfo("Cache file content outdated (source)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shaparam, 20)) {
-          GPUInfo("Cache file content outdated (param)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shacmd, 20)) {
-          GPUInfo("Cache file content outdated (commandline)");
-          break;
-        }
-        if (fread(sharead, 1, 20, fp) != 20) {
-          throw std::runtime_error("Cache file corrupt");
-        }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && memcmp(sharead, shakernels, 20)) {
-          GPUInfo("Cache file content outdated (kernel definitions)");
+        auto checkSHA = [&](const char* shacmp, const char* name) {
+          if (fread(sharead, 1, 20, fp) != 20) {
+            throw std::runtime_error("Cache file corrupt");
+          }
+          if (mProcessingSettings.debugLevel >= 3) {
+            char shaprint1[41], shaprint2[41];
+            for (uint32_t i = 0; i < 20; i++) {
+              sprintf(shaprint1 + 2 * i, "%02X ", shacmp[i]);
+              sprintf(shaprint2 + 2 * i, "%02X ", sharead[i]);
+            }
+            GPUInfo("SHA for %s: expected %s, read %s", name, shaprint1, shaprint2);
+          }
+          if (!mProcessingSettings.rtctech.ignoreCacheValid && memcmp(sharead, shacmp, 20)) {
+            GPUInfo("Cache file content outdated (%s)", name);
+            return 1;
+          }
+          return 0;
+        };
+        if (checkSHA(shasource, "source") ||
+            checkSHA(shaparam, "param") ||
+            checkSHA(shacmd, "command line") ||
+            checkSHA(shakernels, "kernel definitions") ||
+            checkSHA(shabounds, "launch bounds")) {
           break;
         }
         GPUSettingsProcessingRTC cachedSettings;
         static_assert(std::is_trivially_copyable_v<GPUSettingsProcessingRTC> == true, "GPUSettingsProcessingRTC must be POD");
         if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
           throw std::runtime_error("Cache file corrupt");
         }
-        if (!mProcessingSettings.rtc.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
+        if (!mProcessingSettings.rtctech.ignoreCacheValid && !(cachedSettings == mProcessingSettings.rtc)) {
           GPUInfo("Cache file content outdated (rtc parameters)");
           break;
         }
@@ -169,11 +175,12 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
       kernel += "}";
 
-      bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end();
+      bool deterministic = mProcessingSettings.rtc.deterministic || (mProcessingSettings.rtc.compilePerKernel && o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end());
       const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");
 
       if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
           fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
+          fwrite(launchBounds.c_str(), 1, launchBounds.size(), fp) != launchBounds.size() ||
           fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
           fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
         throw std::runtime_error("Error writing file");
@@ -203,7 +210,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime());
     }
     if (mProcessingSettings.rtc.cacheOutput) {
-      FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
+      FILE* fp = fopen((mProcessingSettings.rtctech.cacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
       if (fp == nullptr) {
         throw std::runtime_error("Cannot open cache file for writing");
       }
@@ -213,6 +220,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
           fwrite(shaparam, 1, 20, fp) != 20 ||
           fwrite(shacmd, 1, 20, fp) != 20 ||
           fwrite(shakernels, 1, 20, fp) != 20 ||
+          fwrite(shabounds, 1, 20, fp) != 20 ||
           fwrite(&mProcessingSettings.rtc, sizeof(mProcessingSettings.rtc), 1, fp) != 1) {
         throw std::runtime_error("Error writing cache file");
       }
@@ -240,7 +248,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
       fclose(fp);
     }
   }
-  if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtc.cacheMutex) {
+  if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtctech.cacheMutex) {
     if (lockf(fd, F_ULOCK, 0)) {
       throw std::runtime_error("Error unlocking RTC cache mutex file");
     }

@@ -9,7 +9,7 @@
 // granted to it by virtue of its status as an Intergovernmental Organization
 // or submit itself to any jurisdiction.
 
-/// \file GPUReconstructionCUDAIncludes.h
+/// \file GPUReconstructionCUDAIncludesSystem.h
 /// \author David Rohr
 
 #ifndef O2_GPU_GPURECONSTRUCTIONCUDAINCLUDES_H
@@ -32,8 +32,4 @@
 #include <sm_20_atomic_functions.h>
 #include <cuda_fp16.h>
 
-#ifndef GPUCA_RTC_CODE
-#include "GPUReconstructionCUDADef.h"
-#endif
-
 #endif
@@ -12,7 +12,8 @@
 /// \file GPUReconstructionCUDAKernels.cu
 /// \author David Rohr
 
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
+#include "GPUReconstructionCUDADef.h"
 
 #include "GPUReconstructionCUDA.h"
 #include "GPUReconstructionCUDAInternals.h"
@@ -108,13 +109,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 
-void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
-{
-#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
-#include "GPUReconstructionKernelList.h"
-#undef GPUCA_KRNL
-}
-
 #ifndef GPUCA_NO_CONSTANT_MEMORY
 static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
   void* retVal = nullptr;
@@ -124,3 +118,14 @@ static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstS
   return retVal;
 });
 #endif
+
+void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
+{
+#undef GPUCA_KRNL_LB
+#undef __launch_bounds__
+#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
+#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_RTC_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
+#include "GPUReconstructionKernelList.h"
+#undef GPUCA_KRNL
+#undef GPUCA_KRNL_LB
+}
@@ -13,9 +13,10 @@
 /// \author David Rohr
 
 #define GPUCA_GPUCODE_COMPILEKERNELS
-#include "GPUReconstructionCUDAIncludesHost.h"
+#include "GPUReconstructionCUDAIncludesSystem.h"
 #define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
 #define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
+#include "GPUReconstructionCUDADef.h"
 #include "GPUReconstructionKernelMacros.h"
 
 // clang-format off

@@ -16,7 +16,7 @@
 #define GPUCA_GPUCODE_COMPILEKERNELS
 #define GPUCA_RTC_SPECIAL_CODE(...) GPUCA_RTC_SPECIAL_CODE(__VA_ARGS__)
 #define GPUCA_DETERMINISTIC_CODE(...) GPUCA_DETERMINISTIC_CODE(__VA_ARGS__)
-// GPUReconstructionCUDAIncludesHost.h auto-prependended without preprocessor running
+// GPUReconstructionCUDAIncludesSystem.h prependended without preprocessor running
 #include "GPUReconstructionCUDADef.h"
 #include "GPUReconstructionIncludesDeviceAll.h"