
Commit eec0f01

GPU: Remove obsolete preprocessor magic to create 2 different kernels for single-slice and multi-slice
1 parent: 9855ba3

15 files changed: +201 −300 lines
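
The heart of the change is the kernel run range: a launch no longer carries a (start, num) span of sectors that each backend loops over, but a single processor index. A minimal before/after sketch of the type, reduced from the GPUReconstructionKernels.h diff below (the Old suffix and the comments are added here for illustration):

// Before: one launch could address a span of processor instances.
struct krnlRunRangeOld {
  uint32_t start = 0; // first sector / processor instance
  int32_t num = 0;    // span length; 0 and -1 both meant a single instance
};

// After: one launch addresses exactly one processor instance.
struct krnlRunRange {
  constexpr krnlRunRange() = default;
  constexpr krnlRunRange(uint32_t v) : index(v) {}
  uint32_t index = 0;
};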

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 17 additions & 20 deletions
@@ -66,28 +66,25 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
   if (x.nThreads != 1) {
     throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
   }
-  uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
-  for (uint32_t k = 0; k < num; k++) {
-    int32_t nThreads = getNKernelHostThreads(false);
-    if (nThreads > 1) {
-      if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d Threads\n", nThreads);
-      }
-      tbb::this_task_arena::isolate([&] {
-        mThreading->activeThreads->execute([&] {
-          tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
-            typename T::GPUSharedMemory smem;
-            for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
-              T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-            }
-          });
+  int32_t nThreads = getNKernelHostThreads(false);
+  if (nThreads > 1) {
+    if (mProcessingSettings.debugLevel >= 5) {
+      printf("Running %d Threads\n", nThreads);
+    }
+    tbb::this_task_arena::isolate([&] {
+      mThreading->activeThreads->execute([&] {
+        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
+          typename T::GPUSharedMemory smem;
+          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
+            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
+          }
         });
       });
-    } else {
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
-        typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-      }
+    });
+  } else {
+    for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
+      typename T::GPUSharedMemory smem;
+      T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
     }
   }
 }
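
With the per-launch sector loop gone, the host fallback runs each kernel against exactly one processor instance, selected by y.index instead of y.start + k. A stripped-down model of the new control flow (toy types, sequential instead of TBB; Processor, KrnlExec, and runHostKernel are illustrative stand-ins, not the real API):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

struct KrnlExec { uint32_t nBlocks = 1; uint32_t nThreads = 1; };
struct KrnlRunRange { uint32_t index = 0; };
struct Processor { uint32_t id = 0; };

// Models runKernelBackendInternal after the commit: no loop over a sector
// count, just the one processor instance picked by y.index.
void runHostKernel(const KrnlExec& x, const KrnlRunRange& y, std::vector<Processor>& processors)
{
  if (x.nThreads != 1) {
    throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
  }
  Processor& proc = processors[y.index]; // formerly processors[y.start + k] inside a loop
  for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
    printf("block %u on processor %u\n", iB, proc.id);
  }
}

int main()
{
  std::vector<Processor> processors{{0}, {1}, {2}};
  runHostKernel({4, 1}, {2}, processors); // 4 blocks, all on processor 2
}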

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 3 additions & 3 deletions
@@ -49,7 +49,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

  public:
   ~GPUReconstructionCPU() override;
-  static constexpr krnlRunRange krnlRunRangeNone{0, -1};
+  static constexpr krnlRunRange krnlRunRangeNone{0};
   static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0};

   template <class S, int32_t I = 0, typename... Args>
@@ -77,7 +77,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

   GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) {}

-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
   inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
   { \
     if (cpuFallback) { \
@@ -161,7 +161,7 @@ inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
     throw std::runtime_error("GPUCA_MAX_THREADS exceeded");
   }
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
+    GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
   }
   if (nThreads == 0 || nBlocks == 0) {
     return;

GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h

Lines changed: 15 additions & 67 deletions
@@ -51,97 +51,45 @@
 #define GPUCA_ATTRRES3(XX) // 3 attributes not supported
 #define GPUCA_ATTRRES2(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES2_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
 #define GPUCA_ATTRRES(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
-// GPU Kernel entry point for single sector
-#define GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t iSector_internal GPUCA_M_STRIP(x_arguments))
-#ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_SINGLE(...) GPUCA_KRNLGPU_SINGLE_DEF(__VA_ARGS__);
-#else
-#define GPUCA_KRNLGPU_SINGLE(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
-  { \
-    GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[iSector_internal] GPUCA_M_STRIP(x_forward)); \
-  }
-#endif

-// GPU Kernel entry point for multiple sector
-#define GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)(GPUCA_CONSMEM_PTR int32_t firstSector, int32_t nSectorCount GPUCA_M_STRIP(x_arguments))
+// GPU Kernel entry point
+#define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \
+  GPUg() void GPUCA_ATTRRES(,GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))
+
 #ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_MULTI(...) GPUCA_KRNLGPU_MULTI_DEF(__VA_ARGS__);
+#define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__);
 #else
-#define GPUCA_KRNLGPU_MULTI(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
+#define GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, ...) \
+  GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
   { \
-    const int32_t iSector_internal = nSectorCount * (get_group_id(0) + (get_num_groups(0) % nSectorCount != 0 && nSectorCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); \
-    const int32_t nSectorBlockOffset = get_num_groups(0) * iSector_internal / nSectorCount; \
-    const int32_t sectorBlockId = get_group_id(0) - nSectorBlockOffset; \
-    const int32_t sectorGridDim = get_num_groups(0) * (iSector_internal + 1) / nSectorCount - get_num_groups(0) * (iSector_internal) / nSectorCount; \
     GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(sectorGridDim, get_local_size(0), sectorBlockId, get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[firstSector + iSector_internal] GPUCA_M_STRIP(x_forward)); \
+    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[_iSector_internal] GPUCA_M_STRIP(x_forward)); \
   }
 #endif

-// GPU Host wrapper pre- and post-parts
-#define GPUCA_KRNL_PRE(x_class, ...) \
+// GPU Host wrappers for kernel
+#define GPUCA_KRNL_HOST(x_class, ...) \
+  GPUCA_KRNLGPU(x_class, __VA_ARGS__) \
   template <> class GPUCA_KRNL_BACKEND_CLASS::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
   public: \
    template <typename T, typename... Args> \
    static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \
    { \
      auto& x = _xyz.x; \
-     auto& y = _xyz.y;
-
-#define GPUCA_KRNL_POST() \
+     auto& y = _xyz.y; \
+     GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \
    } \
  };

-// GPU Host wrappers for single kernel, multi-sector, or auto-detection
-#define GPUCA_KRNL_single(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num > 1) { \
-    throw std::runtime_error("Kernel called with invalid number of sectors"); \
-  } else { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_multi(...) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_simple(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_both(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num <= 1) { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } else { \
-    GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_LOAD_(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_simple(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_both(...) \
-  GPUCA_KRNL_LOAD_single(__VA_ARGS__) \
-  GPUCA_KRNL_LOAD_multi(__VA_ARGS__)
-
 #define GPUCA_KRNL_PROP(x_class, x_attributes) \
   template <> gpu_reconstruction_kernels::krnlProperties GPUCA_KRNL_BACKEND_CLASS::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>() { \
-    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes)))}; \
+    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_STRIP(x_attributes))}; \
     return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \
   }

-// Generate GPU kernel and host wrapper
-#define GPUCA_KRNL_WRAP(x_func, x_class, x_attributes, ...) GPUCA_M_CAT(x_func, GPUCA_M_STRIP_FIRST(x_attributes))(x_class, x_attributes, __VA_ARGS__)
 #endif // GPUCA_GPUCODE

-#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (GPUCA_M_STRIP(x_attributes), REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)
+#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)

 #endif // O2_GPU_GPURECONSTRUCTIONKERNELMACROS_H
 // clang-format on
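
Most of the deleted complexity lived in GPUCA_KRNLGPU_MULTI: every block of a combined launch had to compute which sector it serves, its block id within that sector, and that sector's grid size. A standalone sketch of the removed partitioning arithmetic, with get_group_id(0)/get_num_groups(0) replaced by plain loop variables so it runs on the host:

#include <cstdint>
#include <cstdio>

// Replays the arithmetic deleted from GPUCA_KRNLGPU_MULTI: mapping a flat
// block id onto (sector, block-within-sector) so that one launch covered
// several sectors. The unified kernel no longer needs any of this, because
// each launch now receives its processor index directly.
int main()
{
  const int32_t nGroups = 10;     // get_num_groups(0): blocks in the launch
  const int32_t nSectorCount = 3; // sectors sharing this launch
  for (int32_t groupId = 0; groupId < nGroups; groupId++) { // get_group_id(0)
    const int32_t iSector = nSectorCount * (groupId + (nGroups % nSectorCount != 0 && nSectorCount * (groupId + 1) % nGroups != 0)) / nGroups;
    const int32_t nSectorBlockOffset = nGroups * iSector / nSectorCount;
    const int32_t sectorBlockId = groupId - nSectorBlockOffset;
    const int32_t sectorGridDim = nGroups * (iSector + 1) / nSectorCount - nGroups * iSector / nSectorCount;
    printf("block %2d -> sector %d, local block %d of %d\n", groupId, iSector, sectorBlockId, sectorGridDim);
  }
}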

GPU/GPUTracking/Base/GPUReconstructionKernels.h

Lines changed: 4 additions & 7 deletions
@@ -41,11 +41,8 @@ struct krnlExec {
 };
 struct krnlRunRange {
   constexpr krnlRunRange() = default;
-  constexpr krnlRunRange(uint32_t a) : start(a), num(0) {}
-  constexpr krnlRunRange(uint32_t s, int32_t n) : start(s), num(n) {}
-
-  uint32_t start = 0;
-  int32_t num = 0;
+  constexpr krnlRunRange(uint32_t v) : index(v) {}
+  uint32_t index = 0;
 };
 struct krnlEvent {
   constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {}
@@ -63,7 +60,7 @@ struct krnlProperties {
 };

 struct krnlSetup {
-  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0, -1}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
+  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
   krnlExec x;
   krnlRunRange y;
   krnlEvent z;
@@ -98,7 +95,7 @@ class GPUReconstructionKernels : public T
   template <class S, int32_t I = 0, typename... Args>
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs<S, I, Args...>;

-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
   virtual void runKernelImpl(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args) \
   { \
     T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args); \
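
Call sites keep compiling because the defaulted run range of krnlSetup merely shrinks from {0, -1} to {0}. A small usage sketch under the diff above (krnlExec and krnlEvent are reduced to empty stand-ins here; only krnlRunRange and krnlSetup mirror the header):

#include <cstdint>

struct krnlExec {};  // stand-in: the real struct carries grid and stream settings
struct krnlEvent {}; // stand-in

struct krnlRunRange {
  constexpr krnlRunRange() = default;
  constexpr krnlRunRange(uint32_t v) : index(v) {}
  uint32_t index = 0;
};

struct krnlSetup {
  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {}) : x(xx), y(yy), z(zz) {}
  krnlExec x;
  krnlRunRange y;
  krnlEvent z;
};

int main()
{
  krnlSetup defaulted{{}};      // y.index == 0, replacing the old {0, -1} default
  krnlSetup onSector5{{}, {5}}; // address processor instance 5 directly
  return defaulted.y.index == 0 && onSector5.y.index == 5 ? 0 : 1;
}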

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 4 additions & 18 deletions
@@ -380,7 +380,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
     GPUFailedMsg(cuModuleLoadData(mInternals->kernelModules.back().get(), GPUCA_M_CAT3(_binary_cuda_kernel_module_fatbin_krnl_, GPUCA_M_KRNL_NAME(x_class), GPUCA_M_CAT(PER_KERNEL_OBJECT_EXT, _start))));
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-    loadKernelModules(true, false);
+    loadKernelModules(true);
   }
 #endif
   void* devPtrConstantMem = nullptr;
@@ -630,34 +630,20 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
   }
 }

-void GPUReconstructionCUDA::loadKernelModules(bool perKernel, bool perSingleMulti)
+void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
 {
   uint32_t j = 0;
-#define GPUCA_KRNL(...) \
-  GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) \
-  j += !perSingleMulti;
-#define GPUCA_KRNL_LOAD_single(x_class, ...) \
+#define GPUCA_KRNL(x_class, ...) \
   getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
   mInternals->kernelFunctions.emplace_back(new CUfunction); \
   mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class)))); \
   if (mProcessingSettings.debugLevel >= 3) { \
     GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \
   } \
   GPUFailedMsg(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \
-  j += perSingleMulti;
-#define GPUCA_KRNL_LOAD_multi(x_class, ...) \
-  getRTCkernelNum<true, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
-  mInternals->kernelFunctions.emplace_back(new CUfunction); \
-  mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi))); \
-  if (mProcessingSettings.debugLevel >= 3) { \
-    GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)), j); \
-  } \
-  GPUFailedMsg(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)))); \
-  j += perSingleMulti;
+  j++;
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-#undef GPUCA_KRNL_LOAD_single
-#undef GPUCA_KRNL_LOAD_multi

   if (j != mInternals->kernelModules.size()) {
     GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
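
With the _multi bookkeeping removed, module loading is a straight one-function-per-kernel loop, and j must come out equal to the number of per-kernel modules. A toy model of that invariant (std::string stands in for the CUmodule/CUfunction handles, and the kernel names are made up):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  // Stand-in for the list expanded from GPUReconstructionKernelList.h.
  const std::vector<std::string> kernelNames = {"krnl_A", "krnl_B", "krnl_C"};
  const size_t nModules = kernelNames.size(); // one module per kernel in per-kernel mode
  std::vector<std::string> kernelFunctions;   // real code: handles from cuModuleGetFunction
  uint32_t j = 0;
  for (const auto& name : kernelNames) {
    kernelFunctions.emplace_back(name); // look up one entry point in modules[j]
    j++;
  }
  if (j != nModules) {
    fprintf(stderr, "Did not load all kernels (%u < %zu)\n", j, nModules);
    return 1;
  }
  printf("Loaded %u kernels\n", j);
  return 0;
}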

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
  private:
   int32_t genRTC(std::string& filename, uint32_t& nCompile);
   void genAndLoadRTC();
-  void loadKernelModules(bool perKernel, bool perSingleMulti = true);
+  void loadKernelModules(bool perKernel);
   const char *mRtcSrcExtension = ".src", *mRtcBinExtension = ".o";
 };
