Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 14 additions & 22 deletions GPU/Common/GPUCommonAlgorithm.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,7 @@

// ----------------------------- SORTING -----------------------------

namespace o2
{
namespace gpu
namespace o2::gpu
{
class GPUCommonAlgorithm
{
Expand All @@ -43,6 +41,10 @@ class GPUCommonAlgorithm
GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
template <class T, class S>
GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
#ifndef __OPENCL__
template <class T, class S>
GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
#endif
template <class T>
GPUd() static void swap(T& a, T& b);

Expand Down Expand Up @@ -71,13 +73,6 @@ class GPUCommonAlgorithm
template <typename I>
GPUd() static void IterSwap(I a, I b) noexcept;
};
} // namespace gpu
} // namespace o2

namespace o2
{
namespace gpu
{

#ifndef GPUCA_ALGORITHM_STD
template <typename I>
Expand Down Expand Up @@ -217,18 +212,15 @@ GPUdi() void GPUCommonAlgorithm::QuickSort(I f, I l) noexcept

typedef GPUCommonAlgorithm CAAlgo;

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#if (((defined(__CUDACC__) && !defined(__clang__)) || defined(__HIPCC__))) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_HOSTONLY)

#include "GPUCommonAlgorithmThrust.h"

#else

namespace o2
{
namespace gpu
namespace o2::gpu
{

template <class T>
Expand All @@ -247,15 +239,12 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
GPUCommonAlgorithm::sort(begin, end, comp);
}

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif // THRUST
// sort and sortInBlock below are not taken from Thrust, since our implementations are faster

namespace o2
{
namespace gpu
namespace o2::gpu
{

template <class T>
Expand Down Expand Up @@ -328,8 +317,7 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
}
#endif

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

// ----------------------------- WORK GROUP FUNCTIONS -----------------------------

Expand Down Expand Up @@ -458,4 +446,8 @@ GPUdi() T warp_broadcast(T v, int32_t i)

#endif

#ifdef GPUCA_ALGORITHM_STD
#undef GPUCA_ALGORITHM_STD
#endif

#endif
36 changes: 28 additions & 8 deletions GPU/Common/GPUCommonAlgorithmThrust.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,19 @@
#pragma GCC diagnostic pop

#include "GPUCommonDef.h"
#include "GPUCommonHelpers.h"

#ifdef __CUDACC__
#ifndef __HIPCC__ // CUDA
#define GPUCA_THRUST_NAMESPACE thrust::cuda
#else
#define GPUCA_CUB_NAMESPACE cub
#include <cub/cub.cuh>
#else // HIP
#define GPUCA_THRUST_NAMESPACE thrust::hip
#define GPUCA_CUB_NAMESPACE hipcub
#include <hipcub/hipcub.hpp>
#endif

namespace o2
{
namespace gpu
namespace o2::gpu
{

// - Our quicksort and bubble sort implementations are faster
Expand All @@ -54,7 +57,7 @@ GPUdi() void GPUCommonAlgorithm::sort(T* begin, T* end, const S& comp)
}

template <class T>
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end)
GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end) // TODO: Try cub::BlockMergeSort
{
if (get_local_id(0) == 0) {
sortDeviceDynamic(begin, end);
Expand Down Expand Up @@ -87,7 +90,24 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
}

} // namespace gpu
} // namespace o2
template <class T, class S>
GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
{
thrust::device_ptr<T> p(begin);
#if 0 // Use Thrust
auto alloc = rec->getThrustVolatileDeviceAllocator();
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp);
#else // Use CUB
size_t tempSize = 0;
void* tempMem = nullptr;
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
tempMem = rec->AllocateVolatileDeviceMemory(tempSize);
GPUChkErrS(GPUCA_CUB_NAMESPACE::DeviceMergeSort::SortKeys(tempMem, tempSize, begin, N, comp, rec->mInternals->Streams[stream]));
#endif
}
} // namespace o2::gpu

#undef GPUCA_THRUST_NAMESPACE
#undef GPUCA_CUB_NAMESPACE

#endif
19 changes: 19 additions & 0 deletions GPU/Common/GPUCommonHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "GPUCommonDef.h"
#include "GPUCommonLogger.h"
#include <cstdint>
#include <functional>

namespace o2::gpu::internal
{
Expand All @@ -60,4 +61,22 @@ static inline int32_t GPUReconstructionChkErr(const int64_t error, const char* f
#undef GPUCOMMON_INTERNAL_CAT
} // namespace o2::gpu::internal

namespace o2::gpu
{
class GPUReconstruction;
class ThrustVolatileAllocator
{
public:
typedef char value_type;

char* allocate(std::ptrdiff_t n);
void deallocate(char* ptr, size_t);

private:
ThrustVolatileAllocator(GPUReconstruction* r);
std::function<char*(size_t)> mAlloc;
friend class GPUReconstruction;
};
} // namespace o2::gpu

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUCommonMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,7 @@
#define GPUCA_CHOICE(c1, c2, c3) (c1) // Select first option for Host
#endif // clang-format on

namespace o2
{
namespace gpu
namespace o2::gpu
{

class GPUCommonMath
Expand Down Expand Up @@ -540,7 +538,6 @@ GPUdii() void GPUCommonMath::AtomicMinInternal(GPUglobalref() GPUgeneric() GPUAt

#undef GPUCA_CHOICE

} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif // GPUCOMMONMATH_H
7 changes: 2 additions & 5 deletions GPU/Common/GPUCommonTransform3D.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@

#include "GPUCommonDef.h"

namespace o2
{
namespace gpu
namespace o2::gpu
{
class Transform3D
{
Expand Down Expand Up @@ -79,7 +77,6 @@ class Transform3D
kZZ = 10,
kDZ = 11 };
};
} // namespace gpu
} // namespace o2
} // namespace o2::gpu

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUROOTCartesianFwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ class DefaultCoordinateSystemTag;
} // namespace Math
} // namespace ROOT

namespace o2
{
namespace math_utils
namespace o2::math_utils
{

namespace detail
Expand Down Expand Up @@ -79,7 +77,6 @@ template <typename T>
using Vector3D = detail::GPUPoint3D<T, 1>;
#endif

} // namespace math_utils
} // namespace o2
} // namespace o2::math_utils

#endif
7 changes: 2 additions & 5 deletions GPU/Common/GPUROOTSMatrixFwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,7 @@ class MatRepStd;
} // namespace Math
} // namespace ROOT

namespace o2
{
namespace math_utils
namespace o2::math_utils
{

namespace detail
Expand Down Expand Up @@ -72,7 +70,6 @@ template <class T, uint32_t D1, uint32_t D2 = D1>
using MatRepStd = detail::MatRepStdGPU<T, D1, D2>;
#endif

} // namespace math_utils
} // namespace o2
} // namespace o2::math_utils

#endif
14 changes: 7 additions & 7 deletions GPU/GPUTracking/Base/GPUGeneralKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
#endif

#if defined(__HIPCC__)
#define GPUCA_CUB hipcub
#define GPUCA_CUB_NAMESPACE hipcub
#else
#define GPUCA_CUB cub
#define GPUCA_CUB_NAMESPACE cub
#endif

namespace o2::gpu
Expand All @@ -54,7 +54,7 @@ class GPUKernelTemplate
struct GPUSharedMemoryWarpScan64 {
// Provides the shared memory resources for warp wide CUB collectives
#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY)
typedef GPUCA_CUB::WarpScan<T> WarpScan;
typedef GPUCA_CUB_NAMESPACE::WarpScan<T> WarpScan;
union {
typename WarpScan::TempStorage cubWarpTmpMem;
};
Expand All @@ -65,9 +65,9 @@ class GPUKernelTemplate
struct GPUSharedMemoryScan64 {
// Provides the shared memory resources for CUB collectives
#if (defined(__CUDACC__) || defined(__HIPCC__)) && defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_HOSTONLY)
typedef GPUCA_CUB::BlockScan<T, I> BlockScan;
typedef GPUCA_CUB::BlockReduce<T, I> BlockReduce;
typedef GPUCA_CUB::WarpScan<T> WarpScan;
typedef GPUCA_CUB_NAMESPACE::BlockScan<T, I> BlockScan;
typedef GPUCA_CUB_NAMESPACE::BlockReduce<T, I> BlockReduce;
typedef GPUCA_CUB_NAMESPACE::WarpScan<T> WarpScan;
union {
typename BlockScan::TempStorage cubTmpMem;
typename BlockReduce::TempStorage cubReduceTmpMem;
Expand Down Expand Up @@ -110,6 +110,6 @@ class GPUitoa : public GPUKernelTemplate

} // namespace o2::gpu

#undef GPUCA_CUB
#undef GPUCA_CUB_NAMESPACE

#endif
10 changes: 10 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include "GPUROOTDumpCore.h"
#include "GPUConfigDump.h"
#include "GPUChainTracking.h"
#include "GPUCommonHelpers.h"

#include "GPUMemoryResource.h"
#include "GPUChain.h"
Expand Down Expand Up @@ -1193,3 +1194,12 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size)
{
mInputControl.set(ptr, size);
}

ThrustVolatileAllocator::ThrustVolatileAllocator(GPUReconstruction* r)
{
mAlloc = [&r](size_t n) { return (char*)r->AllocateVolatileDeviceMemory(n); };
}
ThrustVolatileAllocator GPUReconstruction::getThrustVolatileDeviceAllocator()
{
return ThrustVolatileAllocator(this);
}
2 changes: 2 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct GPUMemorySizeScalers;
struct GPUReconstructionPipelineContext;
struct GPUReconstructionThreading;
class GPUROOTDumpCore;
class ThrustVolatileAllocator;

namespace gpu_reconstruction_kernels
{
Expand Down Expand Up @@ -165,6 +166,7 @@ class GPUReconstruction
void ClearAllocatedMemory(bool clearOutputs = true);
void ReturnVolatileDeviceMemory();
void ReturnVolatileMemory();
ThrustVolatileAllocator getThrustVolatileDeviceAllocator();
void PushNonPersistentMemory(uint64_t tag);
void PopNonPersistentMemory(RecoStep step, uint64_t tag);
void BlockStackedMemory(GPUReconstruction* rec);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ endif()
message(STATUS "Building GPUTracking with CUDA support ${TMP_TARGET}")

set(SRCS GPUReconstructionCUDA.cu GPUReconstructionCUDAGenRTC.cxx GPUReconstructionCUDAKernels.cu)
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h CUDAThrustHelpers.h)
set(HDRS GPUReconstructionCUDA.h GPUReconstructionCUDAInternals.h GPUReconstructionCUDAHelpers.inc GPUReconstructionCUDADef.h GPUReconstructionCUDAIncludesHost.h)
# -------------------------------- Prepare RTC -------------------------------------------------------
enable_language(ASM)
if(ALIGPU_BUILD_TYPE STREQUAL "O2")
Expand Down
Loading