@@ -50,19 +50,13 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSecto
       gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = MergerShadow.BorderRange(jSector) + *processors()->tpcTrackers[jSector].NTracks();
       runKernel<GPUTPCGMMergerMergeBorders, 3>({{1, -WarpSize(), stream, deviceType}}, range1, n1, 0);
       runKernel<GPUTPCGMMergerMergeBorders, 3>({{1, -WarpSize(), stream, deviceType}}, range2, n2, 1);
-      deviceEvent* e = nullptr;
-      int32_t ne = 0;
-      if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel
-        ne = std::min<int32_t>(n, mRec->NStreams());
-        for (int32_t j = 1; j < ne; j++) {
-          RecordMarker(&mEvents->sector[j], j);
-        }
-        e = &mEvents->sector[1];
-        ne--;
-        stream = 0;
-      }
-      runKernel<GPUTPCGMMergerMergeBorders, 2>({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, e, ne}}, i, withinSector, mergeMode);
+      runKernel<GPUTPCGMMergerMergeBorders, 2>({GetGridAuto(stream, deviceType)}, i, withinSector, mergeMode);
+    }
+    int32_t ne = std::min<int32_t>(n, mRec->NStreams()) - 1; // Stream 0 must wait for all streams, Note n > 1
+    for (int32_t j = 0; j < ne; j++) {
+      RecordMarker(&mEvents->sector[j], j + 1);
     }
+    StreamWaitForEvents(0, &mEvents->sector[0], ne);
   } else {
     for (uint32_t i = 0; i < n; i++) {
       runKernel<GPUTPCGMMergerMergeBorders, 0>(GetGridAuto(0, deviceType), i, withinSector, mergeMode);
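The change above moves the synchronization out of the kernel-launch loop: instead of recording markers only on the last iteration and attaching the event wait to the final kernel launch, the new code records one marker on each worker stream after the loop and then makes stream 0 wait for all of them. A minimal sketch of that pattern using plain CUDA runtime calls is shown below; it assumes raw cudaStream_t/cudaEvent_t handles rather than the GPUReconstruction helpers (RecordMarker, StreamWaitForEvents) used in the diff, and the helper name syncStreamZeroWithWorkers is hypothetical.

#include <cuda_runtime.h>

// Record an event on every worker stream, then make stream 0 wait on all of them.
// events must hold at least nStreams - 1 already-created cudaEvent_t handles.
void syncStreamZeroWithWorkers(cudaStream_t* streams, int nStreams, cudaEvent_t* events)
{
  int ne = nStreams - 1; // number of worker streams stream 0 must wait for
  for (int j = 0; j < ne; j++) {
    cudaEventRecord(events[j], streams[j + 1]);    // marker on worker stream j + 1
  }
  for (int j = 0; j < ne; j++) {
    cudaStreamWaitEvent(streams[0], events[j], 0); // stream 0 waits for each marker
  }
}

The waits are asynchronous on the host: work already enqueued on the worker streams completes before anything submitted to stream 0 afterwards, without blocking the CPU.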