Skip to content
Closed
36 changes: 14 additions & 22 deletions GPU/Common/GPUCommonAlgorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@

// ----------------------------- SORTING -----------------------------

namespace o2
{
namespace gpu
namespace o2::gpu
{
class GPUCommonAlgorithm
{
Expand All @@ -43,6 +41,10 @@ class GPUCommonAlgorithm
GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
template <class T, class S>
GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
#ifndef __OPENCL__
template <class T, class S>
GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
#endif
template <class T>
GPUd() static void swap(T& a, T& b);

Expand Down Expand Up @@ -71,13 +73,6 @@ class GPUCommonAlgorithm
template <typename I>
GPUd() static void IterSwap(I a, I b) noexcept;
};
} // namespace gpu
} // namespace o2

namespace o2
{
namespace gpu
{

#ifndef GPUCA_ALGORITHM_STD
template <typename I>
Expand Down Expand Up @@ -217,18 +212,15 @@ GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept

typedef GPUCommonAlgorithm CAAlgo;

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)

#include "GPUCommonAlgorithmThrust.h"

#else

namespace o2
{
namespace gpu
namespace o2::gpu
{

template <class T>
Expand All @@ -247,15 +239,12 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
GPUCommonAlgorithm::sort(begin, end, comp);
}

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif // THRUST
// sort and sortInBlock below are not taken from Thrust, since our implementations are faster

namespace o2
{
namespace gpu
namespace o2::gpu
{

template <class T>
Expand Down Expand Up @@ -328,8 +317,7 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
}
#endif

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

// ----------------------------- WORK GROUP FUNCTIONS -----------------------------

Expand Down Expand Up @@ -458,4 +446,8 @@ GPUdi() T warp_broadcast(T v, int32_t i)

#endif

#ifdef GPUCA_ALGORITHM_STD
#undef GPUCA_ALGORITHM_STD
#endif

#endif
36 changes: 28 additions & 8 deletions GPU/Common/GPUCommonAlgorithmThrust.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,19 @@
#pragma GCC diagnostic pop

#include "GPUCommonDef.h"
#include "GPUCommonHelpers.h"

#ifdef __CUDACC__
#ifndef __HIPCC__ // CUDA
#define GPUCA_THRUST_NAMESPACE thrust::cuda
#else
#define GPUCA_CUB_NAMESPACE cub
#include <cub/cub.cuh>
#else // HIP
#define GPUCA_THRUST_NAMESPACE thrust::hip
#define GPUCA_CUB_NAMESPACE hipcub
#include <hipcub/hipcub.hpp>
#endif

namespace o2
{
namespace gpu
namespace o2::gpu
{

// - Our quicksort and bubble sort implementations are faster
Expand All @@ -54,7 +57,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
}

template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) // TODO: Try cub::BlockMergeSort
{
if (get_local_id(0) == 0) {
sortDeviceDynamic(begin, end);
Expand Down Expand Up @@ -87,7 +90,24 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
}

} // namespace gpu
} // namespace o2
template <class T, class S>
GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
{
thrust::device_ptr<T> p(begin);
#if 0 // Use Thrust
auto alloc = rec->getThrustVolatileDeviceAllocator();
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp);
#else // Use CUB
size_t tempSize = 0;
void* tempMem = nullptr;
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
tempMem = rec->AllocateVolatileDeviceMemory(tempSize);
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
#endif
}
} // namespace o2::gpu

#undef GPUCA_THRUST_NAMESPACE
#undef GPUCA_CUB_NAMESPACE

#endif
19 changes: 19 additions & 0 deletions GPU/Common/GPUCommonHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "GPUCommonDef.h"
#include "GPUCommonLogger.h"
#include <cstdint>
#include <functional>

namespace o2::gpu::internal
{
Expand All @@ -60,4 +61,22 @@ static inline int32_t GPUReconstructionChkErr(const int64_t error, const char* f
#undef GPUCOMMON_INTERNAL_CAT
} // namespace o2::gpu::internal

namespace o2::gpu
{
class GPUReconstruction;
class ThrustVolatileAllocator
{
public:
typedef char value_type;

char* allocate(std::ptrdiff_t n);
void deallocate(char* ptr, size_t);

private:
ThrustVolatileAllocator(GPUReconstruction* r);
std::function<char*(size_t)> mAlloc;
friend class GPUReconstruction;
};
} // namespace o2::gpu

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUCommonMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@
#define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host
#endif // clang-format on

namespace o2
{
namespace gpu
namespace o2::gpu
{

class GPUCommonMath
Expand Down Expand Up @@ -540,7 +538,6 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt

#undef GPUCA_CHOICE

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif // GPUCOMMONMATH_H
7 changes: 2 additions & 5 deletions GPU/Common/GPUCommonTransform3D.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@

#include "GPUCommonDef.h"

namespace o2
{
namespace gpu
namespace o2::gpu
{
class Transform3D
{
Expand Down Expand Up @@ -79,7 +77,6 @@ class Transform3D
kZZ = 10,
kDZ = 11 };
};
} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUROOTCartesianFwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ class DefaultCoordinateSystemTag;
} // namespace Math
} // namespace ROOT

namespace o2
{
namespace math_utils
namespace o2::math_utils
{

namespace detail
Expand Down Expand Up @@ -79,7 +77,6 @@ template <typename T>
using Vector3D = detail::GPUPoint3D<T, 1>;
#endif

} // namespace math_utils
} // namespace o2
} // namespace o2::math_utils

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUROOTSMatrixFwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ class MatRepStd;
} // namespace Math
} // namespace ROOT

namespace o2
{
namespace math_utils
namespace o2::math_utils
{

namespace detail
Expand Down Expand Up @@ -72,7 +70,6 @@ template <class T, uint32_t D1, uint32_t D2 = D1>
using MatRepStd = detail::MatRepStdGPU<T, D1, D2>;
#endif

} // namespace math_utils
} // namespace o2
} // namespace o2::math_utils

#endif
14 changes: 7 additions & 7 deletions GPU/GPUTracking/Base/GPUGeneralKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
#endif

#if defined(__HIPCC__)
#define GPUCA_CUB hipcub
#define GPUCA_CUB_NAMESPACE hipcub
#else
#define GPUCA_CUB cub
#define GPUCA_CUB_NAMESPACE cub
#endif

namespace o2::gpu
Expand All @@ -54,7 +54,7 @@ class GPUKernelTemplate
struct GPUSharedMemoryWarpScan64 {
// Provides the shared memory resources for warp wide CUB collectives
#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY)
typedef GPUCA_CUB::WarpScan<T> WarpScan;
typedef GPUCA_CUB_NAMESPACE::WarpScan<T> WarpScan;
union {
typename WarpScan::TempStorage cubWarpTmpMem;
};
Expand All @@ -65,9 +65,9 @@ class GPUKernelTemplate
struct GPUSharedMemoryScan64 {
// Provides the shared memory resources for CUB collectives
#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY)
typedef GPUCA_CUB::BlockScan<T, I> BlockScan;
typedef GPUCA_CUB::BlockReduce<T, I> BlockReduce;
typedef GPUCA_CUB::WarpScan<T> WarpScan;
typedef GPUCA_CUB_NAMESPACE::BlockScan<T, I> BlockScan;
typedef GPUCA_CUB_NAMESPACE::BlockReduce<T, I> BlockReduce;
typedef GPUCA_CUB_NAMESPACE::WarpScan<T> WarpScan;
union {
typename BlockScan::TempStorage cubTmpMem;
typename BlockReduce::TempStorage cubReduceTmpMem;
Expand Down Expand Up @@ -110,6 +110,6 @@ class GPUitoa : public GPUKernelTemplate

} // namespace o2::gpu

#undef GPUCA_CUB
#undef GPUCA_CUB_NAMESPACE

#endif
10 changes: 10 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "GPUROOTDumpCore.h"
#include "GPUConfigDump.h"
#include "GPUChainTracking.h"
#include "GPUCommonHelpers.h"

#include "GPUMemoryResource.h"
#include "GPUChain.h"
Expand Down Expand Up @@ -1193,3 +1194,12 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size)
{
mInputControl.set(ptr, size);
}

ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r)
{
mAlloc = [&r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); };
}
ThrustVolatileAllocator GPUReconstruction::getThrustVolatileDeviceAllocator()
{
return ThrustVolatileAllocator(this);
}
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct GPUMemorySizeScalers;
struct GPUReconstructionPipelineContext;
struct GPUReconstructionThreading;
class GPUROOTDumpCore;
class ThrustVolatileAllocator;

namespace gpu_reconstruction_kernels
{
Expand Down Expand Up @@ -165,6 +166,7 @@ class GPUReconstruction
void ClearAllocatedMemory(bool clearOutputs = true);
void ReturnVolatileDeviceMemory();
void ReturnVolatileMemory();
ThrustVolatileAllocator getThrustVolatileDeviceAllocator();
void PushNonPersistentMemory(uint64_t tag);
void PopNonPersistentMemory(RecoStep step, uint64_t tag);
void BlockStackedMemory(GPUReconstruction* rec);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ endif()
message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")

set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h)
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h)
# -------------------------------- Prepare RTC -------------------------------------------------------
enable_language(ASM)
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
Expand Down
Loading