AliceO2Group
diff --git a/CODEOWNERS b/CODEOWNERS
Lines changed: 2 additions & 2 deletions
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Lines changed: 2 additions & 7 deletions
diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
Lines changed: 14 additions & 22 deletions
diff --git a/GPU/Common/GPUCommonAlgorithmThrust.h b/GPU/Common/GPUCommonAlgorithmThrust.h
Lines changed: 28 additions & 8 deletions
diff --git a/GPU/Common/GPUCommonDef.h b/GPU/Common/GPUCommonDef.h
Lines changed: 9 additions & 1 deletion
diff --git a/GPU/Common/GPUCommonHelpers.h b/GPU/Common/GPUCommonHelpers.h
Lines changed: 19 additions & 0 deletions
diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h
Lines changed: 32 additions & 29 deletions
diff --git a/GPU/Common/GPUCommonTransform3D.h b/GPU/Common/GPUCommonTransform3D.h
Lines changed: 2 additions & 5 deletions
diff --git a/GPU/Common/GPUROOTCartesianFwd.h b/GPU/Common/GPUROOTCartesianFwd.h
Lines changed: 2 additions & 5 deletions
@@ -34,7 +34,7 @@
 /DataFormats/Detectors/GlobalTracking          @shahor02
 /DataFormats/Detectors/GlobalTrackingWorkflow  @shahor02
 /DataFormats/Detectors/HMPID                   @gvolpe79
-/DataFormats/Detectors/ITSMFT                  @mcoquet642 @mconcas @shahor02
+/DataFormats/Detectors/ITSMFT                  @fprino @mcoquet642 @mconcas @shahor02
 /DataFormats/Detectors/MUON                    @AliceO2Group/muon-experts @shahor02
 /DataFormats/Detectors/PHOS                    @peressounko @kharlov
 /DataFormats/Detectors/Passive                 @sawenzel
@@ -65,7 +65,7 @@
 /Detectors/GlobalTracking          @shahor02
 /Detectors/GlobalTrackingWorkflow  @shahor02
 /Detectors/HMPID                   @gvolpe79
-/Detectors/ITSMFT                  @mcoquet642 @mconcas @shahor02
+/Detectors/ITSMFT                  @fprino @mcoquet642 @mconcas @shahor02
 /Detectors/MUON                    @AliceO2Group/muon-experts @shahor02
 /Detectors/PHOS                    @peressounko @kharlov
 /Detectors/Passive                 @sawenzel
 
@@ -43,13 +43,8 @@
 #define THRUST_NAMESPACE thrust::hip
 #endif
 
-#ifdef GPUCA_DETERMINISTIC_MODE
-#define GPU_BLOCKS 1
-#define GPU_THREADS 1
-#else
-#define GPU_BLOCKS 99999
-#define GPU_THREADS 99999
-#endif
+#define GPU_BLOCKS GPUCA_DETERMINISTIC_CODE(1, 99999) // 1 in deterministic mode, 99999 otherwise
+#define GPU_THREADS GPUCA_DETERMINISTIC_CODE(1, 99999) // 1 in deterministic mode, 99999 otherwise
 
 // O2 track model
 #include "ReconstructionDataFormats/Track.h"
 
@@ -24,9 +24,7 @@
 
 // ----------------------------- SORTING -----------------------------
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 class GPUCommonAlgorithm
 {
@@ -43,6 +41,10 @@ class GPUCommonAlgorithm
   GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
   template <class T, class S>
   GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
+#ifndef __OPENCL__
+  template <class T, class S>
+  GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
+#endif
   template <class T>
   GPUd() static void swap(T& a, T& b);
 
@@ -71,13 +73,6 @@ class GPUCommonAlgorithm
   template <typename I>
   GPUd() static void IterSwap(I a, I b) noexcept;
 };
-} // namespace gpu
-} // namespace o2
-
-namespace o2
-{
-namespace gpu
-{
 
 #ifndef GPUCA_ALGORITHM_STD
 template <typename I>
@@ -217,18 +212,15 @@ GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept
 
 typedef GPUCommonAlgorithm CAAlgo;
 
-} // namespace gpu
-} // namespace o2
+} // namespace o2::gpu
 
 #if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)
 
 #include "GPUCommonAlgorithmThrust.h"
 
 #else
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 
 template <class T>
@@ -247,15 +239,12 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
   GPUCommonAlgorithm::sort(begin, end, comp);
 }
 
-} // namespace gpu
-} // namespace o2
+} // namespace o2::gpu
 
 #endif // THRUST
 // sort and sortInBlock below are not taken from Thrust, since our implementations are faster
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 
 template <class T>
@@ -328,8 +317,7 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
 }
 #endif
 
-} // namespace gpu
-} // namespace o2
+} // namespace o2::gpu
 
 // ----------------------------- WORK GROUP FUNCTIONS -----------------------------
 
@@ -458,4 +446,8 @@ GPUdi() T warp_broadcast(T v, int32_t i)
 
 #endif
 
+#ifdef GPUCA_ALGORITHM_STD
+#undef GPUCA_ALGORITHM_STD
+#endif
+
 #endif
@@ -23,16 +23,19 @@
 #pragma GCC diagnostic pop
 
 #include "GPUCommonDef.h"
+#include "GPUCommonHelpers.h"
 
-#ifdef __CUDACC__
+#ifndef __HIPCC__ // CUDA
 #define GPUCA_THRUST_NAMESPACE thrust::cuda
-#else
+#define GPUCA_CUB_NAMESPACE cub
+#include <cub/cub.cuh>
+#else // HIP
 #define GPUCA_THRUST_NAMESPACE thrust::hip
+#define GPUCA_CUB_NAMESPACE hipcub
+#include <hipcub/hipcub.hpp>
 #endif
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 
 // - Our quicksort and bubble sort implementations are faster
@@ -54,7 +57,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
 }
 
 template <class T>
-GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
+GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) // TODO: Try cub::BlockMergeSort
 {
   if (get_local_id(0) == 0) {
     sortDeviceDynamic(begin, end);
@@ -87,7 +90,24 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
   thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
 }
 
-} // namespace gpu
-} // namespace o2
+template <class T, class S> // Sorts the N elements starting at device-accessible `begin` with comparator `comp`, using stream index `stream` of `rec`
+GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
+{
+#if 0 // Use Thrust
+  thrust::device_ptr<T> p(begin); // wrapper is only needed by the Thrust path, so it lives inside this branch
+  auto alloc = rec->getThrustVolatileDeviceAllocator();
+  thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp);
+#else // Use CUB
+  size_t tempSize = 0;
+  void* tempMem = nullptr;
+  GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); // size query: with a null temp buffer CUB does no sorting and only writes the required byte count to tempSize
+  tempMem = rec->AllocateVolatileDeviceMemory(tempSize); // NOTE(review): presumably released together with the other volatile device memory of rec - confirm lifetime
+  GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream])); // actual in-place sort, issued on the same stream
+#endif
+}
+} // namespace o2::gpu
+
+#undef GPUCA_THRUST_NAMESPACE
+#undef GPUCA_CUB_NAMESPACE
 
 #endif
@@ -68,10 +68,18 @@
   #define GPUCA_DEBUG_STREAMER_CHECK(...)
 #endif
 
-#ifndef GPUCA_RTC_SPECIAL_CODE
+#ifndef GPUCA_RTC_SPECIAL_CODE // By default, we ignore special RTC code
   #define GPUCA_RTC_SPECIAL_CODE(...)
 #endif
 
+#ifndef GPUCA_DETERMINISTIC_CODE // A definition provided before this point takes precedence over the defaults below
+  #ifdef GPUCA_DETERMINISTIC_MODE // Selector takes exactly two arguments: neither code path may contain a top-level (unparenthesized) comma
+    #define GPUCA_DETERMINISTIC_CODE(det, indet) det // In deterministic mode, take deterministic code path
+  #else
+    #define GPUCA_DETERMINISTIC_CODE(det, indet) indet // otherwise the fast default code path
+  #endif
+#endif
+
 // API Definitions for GPU Compilation
 #include "GPUCommonDefAPI.h"
 
 
@@ -35,6 +35,7 @@
 #include "GPUCommonDef.h"
 #include "GPUCommonLogger.h"
 #include <cstdint>
+#include <functional>
 
 namespace o2::gpu::internal
 {
@@ -60,4 +61,22 @@ static inline int32_t GPUReconstructionChkErr(const int64_t error, const char* f
 #undef GPUCOMMON_INTERNAL_CAT
 } // namespace o2::gpu::internal
 
+namespace o2::gpu
+{
+class GPUReconstruction;
+class ThrustVolatileAllocator // Allocator-style helper returning char buffers; constructible only by its friend GPUReconstruction
+{
+ public:
+  typedef char value_type; // element type of the buffers handed out
+
+  char* allocate(std::ptrdiff_t n); // defined elsewhere; NOTE(review): presumably forwards to mAlloc - confirm
+  void deallocate(char* ptr, size_t); // defined elsewhere; the size argument is unnamed in this declaration
+
+ private:
+  ThrustVolatileAllocator(GPUReconstruction* r); // private: construction is restricted to the friend class below
+  std::function<char*(size_t)> mAlloc; // allocation callback: takes a size, returns a char buffer
+  friend class GPUReconstruction;
+};
+} // namespace o2::gpu
+
 #endif
@@ -42,9 +42,7 @@
     #define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host
 #endif // clang-format on
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 
 class GPUCommonMath
@@ -250,7 +248,7 @@ GPUdi() uint32_t GPUCommonMath::Float2UIntReint(const float& x)
 #endif
 }
 
-#ifdef GPUCA_DETERMINISTIC_MODE
+GPUCA_DETERMINISTIC_CODE( // clang-format off
 GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), roundf(x), round(x)); }
 GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return (int32_t)Round(x); }
 GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), (float)sqrt((double)x), sqrt(x)); }
@@ -266,7 +264,7 @@ GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE((float
 GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE((float)exp((double)x), (float)exp((double)x), exp(x)); }
 GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return GPUCA_CHOICE(std::isfinite(x), isfinite(x), isfinite(x)); }
 GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return GPUCA_CHOICE(std::isnan(x), isnan(x), isnan(x)); }
-#else
+, // !GPUCA_DETERMINISTIC_CODE
 GPUdi() constexpr float GPUCommonMath::Round(float x) { return GPUCA_CHOICE(roundf(x), rintf(x), rint(x)); }
 GPUdi() constexpr int32_t GPUCommonMath::Float2IntRn(float x) { return GPUCA_CHOICE((int32_t)Round(x), __float2int_rn(x), (int32_t)Round(x)); }
 GPUhdi() constexpr float GPUCommonMath::Sqrt(float x) { return GPUCA_CHOICE(sqrtf(x), sqrtf(x), sqrt(x)); }
@@ -282,20 +280,22 @@ GPUdi() constexpr float GPUCommonMath::Log(float x) { return GPUCA_CHOICE(logf(x
 GPUdi() constexpr float GPUCommonMath::Exp(float x) { return GPUCA_CHOICE(expf(x), expf(x), exp(x)); }
 GPUdi() constexpr bool GPUCommonMath::Finite(float x) { return true; }
 GPUdi() constexpr bool GPUCommonMath::IsNaN(float x) { return false; }
-#endif
+) // clang-format on
 
 GPUhdi() void GPUCommonMath::SinCos(float x, float& s, float& c)
 {
-#if defined(GPUCA_DETERMINISTIC_MODE) && !defined(__OPENCL__)
-  s = sin((double)x);
-  c = cos((double)x);
-#elif !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__)
-  __sincosf(x, &s, &c);
+  GPUCA_DETERMINISTIC_CODE( // clang-format off
+    s = sin((double)x); // NOTE(review): the previous code skipped this double-precision path under __OPENCL__; it now applies there too in deterministic mode - confirm intended
+    c = cos((double)x);
+  , // !GPUCA_DETERMINISTIC_CODE
+#if !defined(GPUCA_GPUCODE_DEVICE) && defined(__APPLE__) // NOTE(review): preprocessor directives inside macro arguments are undefined behavior per the C++ standard; this relies on compiler-specific support
+    __sincosf(x, &s, &c);
 #elif !defined(GPUCA_GPUCODE_DEVICE) && (defined(__GNU_SOURCE__) || defined(_GNU_SOURCE) || defined(GPUCA_GPUCODE))
-  sincosf(x, &s, &c);
+    sincosf(x, &s, &c);
 #else
-  GPUCA_CHOICE((void)((s = sinf(x)) + (c = cosf(x))), sincosf(x, &s, &c), s = sincos(x, &c));
+    GPUCA_CHOICE((void)((s = sinf(x)) + (c = cosf(x))), sincosf(x, &s, &c), s = sincos(x, &c));
 #endif
+  ) // clang-format on
 }
 
 GPUhdi() void GPUCommonMath::SinCosd(double x, double& s, double& c)
@@ -392,22 +392,26 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ,
 
 GPUdi() float GPUCommonMath::InvSqrt(float _x)
 {
-#if defined(GPUCA_DETERMINISTIC_MODE) || defined(__OPENCL__)
-  return 1.f / Sqrt(_x);
-#elif defined(__CUDACC__) || defined(__HIPCC__)
-  return __frsqrt_rn(_x);
-#elif defined(__FAST_MATH__)
-  return 1.f / sqrtf(_x);
+  GPUCA_DETERMINISTIC_CODE( // clang-format off
+    return 1.f / Sqrt(_x);
+  , // !GPUCA_DETERMINISTIC_CODE
+#if defined(__CUDACC__) || defined(__HIPCC__) // NOTE(review): preprocessor directives inside macro arguments are undefined behavior per the C++ standard; this relies on compiler-specific support
+    return __frsqrt_rn(_x);
+#elif defined(__OPENCL__) && defined(__clang__) // NOTE(review): previously every __OPENCL__ build returned 1.f / Sqrt(_x); non-clang OpenCL now falls through to the bit-trick branch below - confirm intended
+    return 1.f / sqrt(_x);
+#elif !defined(__OPENCL__) && (defined(__FAST_MATH__) || defined(__clang__))
+    return 1.f / sqrtf(_x);
 #else
-  union {
-    float f;
-    int32_t i;
-  } x = {_x};
-  const float xhalf = 0.5f * x.f;
-  x.i = 0x5f3759df - (x.i >> 1);
-  x.f = x.f * (1.5f - xhalf * x.f * x.f);
-  return x.f;
+    union { // fast inverse square root: reinterpret the float bits as an integer for the initial guess
+      float f;
+      int32_t i;
+    } x = {_x};
+    const float xhalf = 0.5f * x.f;
+    x.i = 0x5f3759df - (x.i >> 1); // magic-constant initial approximation of 1/sqrt(_x)
+    x.f = x.f * (1.5f - xhalf * x.f * x.f); // one Newton-Raphson refinement step
+    return x.f;
 #endif
+  ) // clang-format on
 }
 
 template <>
@@ -540,7 +544,6 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt
 
 #undef GPUCA_CHOICE
 
-} // namespace gpu
-} // namespace o2
+} // namespace o2::gpu
 
 #endif // GPUCOMMONMATH_H
@@ -17,9 +17,7 @@
 
 #include "GPUCommonDef.h"
 
-namespace o2
-{
-namespace gpu
+namespace o2::gpu
 {
 class Transform3D
 {
@@ -79,7 +77,6 @@ class Transform3D
                                 kZZ = 10,
                                 kDZ = 11 };
 };
-} // namespace gpu
-} // namespace o2
+} // namespace o2::gpu
 
 #endif
@@ -46,9 +46,7 @@ class DefaultCoordinateSystemTag;
 } // namespace Math
 } // namespace ROOT
 
-namespace o2
-{
-namespace math_utils
+namespace o2::math_utils
 {
 
 namespace detail
@@ -79,7 +77,6 @@ template <typename T>
 using Vector3D = detail::GPUPoint3D<T, 1>;
 #endif
 
-} // namespace math_utils
-} // namespace o2
+} // namespace o2::math_utils
 
 #endif