Skip to content

Commit ac0408b

Browse files
committed
GPU: Add GPUCommonAlgorithm::sortOnDevice function for starting sort on device from host
1 parent d9b654e commit ac0408b

File tree

6 files changed

+21
-18
lines changed

6 files changed

+21
-18
lines changed

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ class GPUCommonAlgorithm
4343
GPUd() static void sortInBlock(T* begin, T* end, const S& comp);
4444
template <class T, class S>
4545
GPUd() static void sortDeviceDynamic(T* begin, T* end, const S& comp);
46+
#ifndef __OPENCL__
47+
template <class T, class S>
48+
GPUh() static void sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
49+
#endif
4650
template <class T>
4751
GPUd() static void swap(T& a, T& b);
4852

GPU/Common/GPUCommonAlgorithmThrust.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@ GPUdi() void GPUCommonAlgorithm::sortDeviceDynamic(T* begin, T* end, const S& co
8787
thrust::sort(GPUCA_THRUST_NAMESPACE::par, thrustBegin, thrustEnd, comp);
8888
}
8989

90+
template <class T, class S>
91+
GPUhi() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp)
92+
{
93+
thrust::device_ptr<T> p(begin);
94+
auto alloc = rec->getThrustVolatileDeviceAllocator();
95+
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(rec->mInternals->Streams[stream]), p, p + N, comp);
96+
}
97+
9098
} // namespace gpu
9199
} // namespace o2
92100

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
5454

5555
void getRTCKernelCalls(std::vector<std::string>& kernels);
5656

57+
template <class T, class S>
58+
friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);
5759
GPUReconstructionCUDAInternals* mInternals;
5860
};
5961

GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
121121
for (uint32_t i = 0; i < NSECTORS; i++) {
122122
runKernel<GPUTPCGMMergerUnpackSaveNumber>({{1, -WarpSize(), 0, deviceType}}, i);
123123
runKernel<GPUTPCGMMergerUnpackResetIds>(GetGridAuto(0, deviceType), i);
124-
runKernel<GPUTPCGMMergerSectorRefit>(GetGridAuto(0, deviceType), i);
124+
runKernel<GPUTPCGMMergerSectorRefit>(GetGridAuto(0, deviceType), i); // TODO: Why all in stream 0?
125125
}
126126
if (GetProcessingSettings().deterministicGPUReconstruction) {
127127
runKernel<GPUTPCGMMergerUnpackSaveNumber>({{1, -WarpSize(), 0, deviceType}}, NSECTORS);

GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -772,12 +772,10 @@ struct MergeBorderTracks_compMin {
772772
template <>
773773
inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
774774
{
775-
thrust::device_ptr<GPUTPCGMBorderRange> p(range);
776-
ThrustVolatileAllocator alloc = getThrustVolatileDeviceAllocator();
777775
if (cmpMax) {
778-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMax());
776+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax());
779777
} else {
780-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMin());
778+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMin());
781779
}
782780
}
783781
#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize MergeBorderTracks<3>
@@ -1877,17 +1875,13 @@ struct GPUTPCGMMergerSortTracksQPt_comp {
18771875
template <>
18781876
inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
18791877
{
1880-
thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackOrderProcess());
1881-
ThrustVolatileAllocator alloc = getThrustVolatileDeviceAllocator();
1882-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
1878+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
18831879
}
18841880

18851881
template <>
18861882
inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
18871883
{
1888-
thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackSort());
1889-
ThrustVolatileAllocator alloc = getThrustVolatileDeviceAllocator();
1890-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
1884+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
18911885
}
18921886
#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt
18931887

@@ -2110,9 +2104,7 @@ struct GPUTPCGMMergerMergeLoopers_comp {
21102104
template <>
21112105
inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
21122106
{
2113-
thrust::device_ptr<MergeLooperParam> params(mProcessorsShadow->tpcMerger.LooperCandidates());
2114-
ThrustVolatileAllocator alloc = getThrustVolatileDeviceAllocator();
2115-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), params, params + processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
2107+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
21162108
}
21172109
#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt
21182110

GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,7 @@ struct GPUTPCGMO2OutputSort_comp {
104104
template <>
105105
inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
106106
{
107-
thrust::device_ptr<GPUTPCGMMerger::tmpSort> trackSort(mProcessorsShadow->tpcMerger.TrackSortO2());
108-
ThrustVolatileAllocator alloc = getThrustVolatileDeviceAllocator();
109-
;
110-
thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());
107+
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());
111108
}
112109
#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::sort>
113110

0 commit comments

Comments
 (0)