GPU: Remove the hack that disabled synchronization in thrust::sort, since the hack no longer worked

davidrohr · davidrohr · commit c2aa2f542ded · 2025-03-25T11:25:28.000+01:00
diff --git a/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h b/GPU/GPUTracking/Base/cuda/CUDAThrustHelpers.h
@@ -22,12 +22,12 @@
 namespace o2::gpu
 {
 
-class ThrustVolatileAsyncAllocator
+class ThrustVolatileAllocator
 {
  public:
   typedef char value_type;
 
-  ThrustVolatileAsyncAllocator(GPUReconstruction* r) : mRec(r) {}
+  ThrustVolatileAllocator(GPUReconstruction* r) : mRec(r) {}
   char* allocate(std::ptrdiff_t n) { return (char*)mRec->AllocateVolatileDeviceMemory(n); }
 
   void deallocate(char* ptr, size_t) {}
@@ -38,24 +38,4 @@ class ThrustVolatileAsyncAllocator
 
 } // namespace o2::gpu
 
-#ifndef __HIPCC__
-// Override synchronize call at end of thrust algorithm running on stream, just don't run cudaStreamSynchronize
-namespace thrust::cuda_cub
-{
-
-typedef thrust::cuda_cub::execution_policy<typeof(thrust::cuda::par(*(o2::gpu::ThrustVolatileAsyncAllocator*)nullptr).on(*(cudaStream_t*)nullptr))> thrustStreamPolicy;
-template <>
-__host__ __device__ inline cudaError_t synchronize<thrustStreamPolicy>(thrustStreamPolicy& policy)
-{
-#ifndef GPUCA_GPUCODE_DEVICE
-  // Do not synchronize!
-  return cudaSuccess;
-#else
-  return synchronize_stream(derived_cast(policy));
-#endif
-}
-
-} // namespace thrust::cuda_cub
-#endif // __HIPCC__
-
 #endif // GPU_CUDATHRUSTHELPERS_H
diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx
@@ -773,7 +773,7 @@ template <>
 inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
 {
   thrust::device_ptr<GPUTPCGMBorderRange> p(range);
-  ThrustVolatileAsyncAllocator alloc(this);
+  ThrustVolatileAllocator alloc(this);
   if (cmpMax) {
     thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMax());
   } else {
@@ -1878,15 +1878,15 @@ template <>
 inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
 {
   thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackOrderProcess());
-  ThrustVolatileAsyncAllocator alloc(this);
+  ThrustVolatileAllocator alloc(this);
   thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
 }
 
 template <>
 inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
 {
   thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackSort());
-  ThrustVolatileAsyncAllocator alloc(this);
+  ThrustVolatileAllocator alloc(this);
   thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
 }
 #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt
@@ -2111,7 +2111,7 @@ template <>
 inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
 {
   thrust::device_ptr<MergeLooperParam> params(mProcessorsShadow->tpcMerger.LooperCandidates());
-  ThrustVolatileAsyncAllocator alloc(this);
+  ThrustVolatileAllocator alloc(this);
   thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), params, params + processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
 }
 #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt
diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx
@@ -105,7 +105,7 @@ template <>
 inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
 {
   thrust::device_ptr<GPUTPCGMMerger::tmpSort> trackSort(mProcessorsShadow->tpcMerger.TrackSortO2());
-  ThrustVolatileAsyncAllocator alloc(this);
+  ThrustVolatileAllocator alloc(this);
   thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());
 }
 #endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::sort>

Original file line number	Diff line number	Diff line change
`@@ -773,7 +773,7 @@ template <>`
`773`	`773`	`inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)`
`774`	`774`	`{`
`775`	`775`	`thrust::device_ptr<GPUTPCGMBorderRange> p(range);`
`776`		`- ThrustVolatileAsyncAllocator alloc(this);`
	`776`	`+ ThrustVolatileAllocator alloc(this);`
`777`	`777`	`if (cmpMax) {`
`778`	`778`	`thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), p, p + N, MergeBorderTracks_compMax());`
`779`	`779`	`} else {`
`@@ -1878,15 +1878,15 @@ template <>`
`1878`	`1878`	`inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)`
`1879`	`1879`	`{`
`1880`	`1880`	`thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackOrderProcess());`
`1881`		`- ThrustVolatileAsyncAllocator alloc(this);`
	`1881`	`+ ThrustVolatileAllocator alloc(this);`
`1882`	`1882`	`thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));`
`1883`	`1883`	`}`
`1884`	`1884`
`1885`	`1885`	`template <>`
`1886`	`1886`	`inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)`
`1887`	`1887`	`{`
`1888`	`1888`	`thrust::device_ptr<uint32_t> trackSort((uint32_t*)mProcessorsShadow->tpcMerger.TrackSort());`
`1889`		`- ThrustVolatileAsyncAllocator alloc(this);`
	`1889`	`+ ThrustVolatileAllocator alloc(this);`
`1890`	`1890`	`thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));`
`1891`	`1891`	`}`
`1892`	`1892`	`#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt`
`@@ -2111,7 +2111,7 @@ template <>`
`2111`	`2111`	`inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)`
`2112`	`2112`	`{`
`2113`	`2113`	`thrust::device_ptr<MergeLooperParam> params(mProcessorsShadow->tpcMerger.LooperCandidates());`
`2114`		`- ThrustVolatileAsyncAllocator alloc(this);`
	`2114`	`+ ThrustVolatileAllocator alloc(this);`
`2115`	`2115`	`thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), params, params + processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());`
`2116`	`2116`	`}`
`2117`	`2117`	`#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMMergerSortTracks and GPUTPCGMMergerSortTracksQPt`
Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ template <>`
`105`	`105`	`inline void GPUCA_M_CAT3(GPUReconstruction, GPUCA_GPUTYPE, Backend)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)`
`106`	`106`	`{`
`107`	`107`	`thrust::device_ptr<GPUTPCGMMerger::tmpSort> trackSort(mProcessorsShadow->tpcMerger.TrackSortO2());`
`108`		`- ThrustVolatileAsyncAllocator alloc(this);`
	`108`	`+ ThrustVolatileAllocator alloc(this);`
`109`	`109`	`thrust::sort(GPUCA_THRUST_NAMESPACE::par(alloc).on(mInternals->Streams[_xyz.x.stream]), trackSort, trackSort + processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());`
`110`	`110`	`}`
`111`	`111`	`#endif // GPUCA_SPECIALIZE_THRUST_SORTS - Specialize GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::sort>`