Skip to content

Commit 3e56e55

Browse files
committed
GPU: Improve synchronization during track-merging, no need to serialize the last kernel
1 parent 7a706ae commit 3e56e55

File tree

1 file changed

+6
-12
lines changed

1 file changed

+6
-12
lines changed

GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -50,19 +50,13 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSecto
5050
gputpcgmmergertypes::GPUTPCGMBorderRange* range2 = MergerShadow.BorderRange(jSector) + *processors()->tpcTrackers[jSector].NTracks();
5151
runKernel<GPUTPCGMMergerMergeBorders, 3>({{1, -WarpSize(), stream, deviceType}}, range1, n1, 0);
5252
runKernel<GPUTPCGMMergerMergeBorders, 3>({{1, -WarpSize(), stream, deviceType}}, range2, n2, 1);
53-
deviceEvent* e = nullptr;
54-
int32_t ne = 0;
55-
if (i == n - 1) { // Synchronize all execution on stream 0 with the last kernel
56-
ne = std::min<int32_t>(n, mRec->NStreams());
57-
for (int32_t j = 1; j < ne; j++) {
58-
RecordMarker(&mEvents->sector[j], j);
59-
}
60-
e = &mEvents->sector[1];
61-
ne--;
62-
stream = 0;
63-
}
64-
runKernel<GPUTPCGMMergerMergeBorders, 2>({GetGridAuto(stream, deviceType), krnlRunRangeNone, {nullptr, e, ne}}, i, withinSector, mergeMode);
53+
runKernel<GPUTPCGMMergerMergeBorders, 2>({GetGridAuto(stream, deviceType)}, i, withinSector, mergeMode);
54+
}
55+
int32_t ne = std::min<int32_t>(n, mRec->NStreams()) - 1; // Stream 0 must wait for all streams, Note n > 1
56+
for (int32_t j = 0; j < ne; j++) {
57+
RecordMarker(&mEvents->sector[j], j + 1);
6558
}
59+
StreamWaitForEvents(0, &mEvents->sector[0], ne);
6660
} else {
6761
for (uint32_t i = 0; i < n; i++) {
6862
runKernel<GPUTPCGMMergerMergeBorders, 0>(GetGridAuto(0, deviceType), i, withinSector, mergeMode);

0 commit comments

Comments (0)