Skip to content

Commit 2498a68

Browse files
committed
ITS: GPU: put cell finding on different streams
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent f2b0957 commit 2498a68

File tree

5 files changed

+44
-21
lines changed

5 files changed

+44
-21
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,8 @@ class TimeFrameGPU : public TimeFrame<nLayers>
7373
void downloadCellsDevice();
7474
void downloadCellsLUTDevice();
7575
void unregisterRest();
76-
template <Task task>
77-
auto& getStream(const size_t stream)
78-
{
79-
return mGpuStreams[stream];
80-
}
8176
auto& getStreams() { return mGpuStreams; }
77+
void syncStream(const size_t stream);
8278
void syncStreams();
8379
virtual void wipe() final;
8480

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ namespace gpu
2525

2626
#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
2727

28-
GPUdi() int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; }
28+
GPUdi() int4 getEmptyBinsRect()
29+
{
30+
return int4{0, 0, 0, 0};
31+
}
2932

3033
GPUd() bool fitTrack(TrackITSExt& track,
3134
int start,
@@ -137,7 +140,8 @@ void countCellsHandler(const Cluster** sortedClusters,
137140
const float cellDeltaTanLambdaSigma,
138141
const float nSigmaCut,
139142
const int nBlocks,
140-
const int nThreads);
143+
const int nThreads,
144+
gpu::Streams& streams);
141145

142146
void computeCellsHandler(const Cluster** sortedClusters,
143147
const Cluster** unsortedClusters,
@@ -155,7 +159,8 @@ void computeCellsHandler(const Cluster** sortedClusters,
155159
const float cellDeltaTanLambdaSigma,
156160
const float nSigmaCut,
157161
const int nBlocks,
158-
const int nThreads);
162+
const int nThreads,
163+
gpu::Streams& streams);
159164

160165
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
161166
int* neighboursLUTs,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
402402
template <int nLayers>
403403
void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
404404
{
405-
GPUTimer timer(mGpuStreams[0], "creating cells buffers");
405+
GPUTimer timer(mGpuStreams[layer], "creating cells buffers");
406406
mNCells[layer] = 0;
407407
GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
408408
GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB);
@@ -567,6 +567,12 @@ void TimeFrameGPU<nLayers>::initialise(const int iteration,
567567
o2::its::TimeFrame<nLayers>::initialise(iteration, trkParam, maxLayers);
568568
}
569569

570+
template <int nLayers>
571+
void TimeFrameGPU<nLayers>::syncStream(const size_t stream)
572+
{
573+
mGpuStreams[stream].sync();
574+
}
575+
570576
template <int nLayers>
571577
void TimeFrameGPU<nLayers>::syncStreams()
572578
{

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
116116
conf.nBlocksLayerTracklets[iteration],
117117
conf.nThreadsLayerTracklets[iteration],
118118
mTimeFrameGPU->getStreams());
119-
mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
120119
}
121120

122121
template <int nLayers>
@@ -125,18 +124,30 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
125124
mTimeFrameGPU->createCellsLUTDevice();
126125
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
127126

127+
std::vector<bool> isTrackletStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad());
128+
auto syncOnce = [&](const int iLayer) {
129+
if (!isTrackletStreamSynched[iLayer]) {
130+
mTimeFrameGPU->syncStream(iLayer);
131+
isTrackletStreamSynched[iLayer] = true;
132+
}
133+
};
134+
128135
for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
129-
if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
136+
// need to ensure that trackleting on layers iLayer and iLayer + 1 are done (only once)
137+
syncOnce(iLayer);
138+
syncOnce(iLayer + 1);
139+
// if there are no tracklets skip entirely
140+
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
141+
if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) {
130142
mTimeFrameGPU->getNCells()[iLayer] = 0;
131143
continue;
132144
}
133-
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
134145
countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
135146
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
136147
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
137148
mTimeFrameGPU->getDeviceArrayTracklets(),
138149
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
139-
mTimeFrameGPU->getNTracklets()[iLayer],
150+
currentLayerTrackletsNum,
140151
iLayer,
141152
nullptr,
142153
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -147,14 +158,15 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
147158
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
148159
this->mTrkParams[iteration].NSigmaCut,
149160
conf.nBlocksLayerCells[iteration],
150-
conf.nThreadsLayerCells[iteration]);
161+
conf.nThreadsLayerCells[iteration],
162+
mTimeFrameGPU->getStreams());
151163
mTimeFrameGPU->createCellsBuffers(iLayer);
152164
computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
153165
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
154166
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
155167
mTimeFrameGPU->getDeviceArrayTracklets(),
156168
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
157-
mTimeFrameGPU->getNTracklets()[iLayer],
169+
currentLayerTrackletsNum,
158170
iLayer,
159171
mTimeFrameGPU->getDeviceCells()[iLayer],
160172
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -165,8 +177,10 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
165177
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
166178
this->mTrkParams[iteration].NSigmaCut,
167179
conf.nBlocksLayerCells[iteration],
168-
conf.nThreadsLayerCells[iteration]);
180+
conf.nThreadsLayerCells[iteration],
181+
mTimeFrameGPU->getStreams());
169182
}
183+
mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
170184
}
171185

172186
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1065,9 +1065,10 @@ void countCellsHandler(
10651065
const float cellDeltaTanLambdaSigma,
10661066
const float nSigmaCut,
10671067
const int nBlocks,
1068-
const int nThreads)
1068+
const int nThreads,
1069+
gpu::Streams& streams)
10691070
{
1070-
gpu::computeLayerCellsKernel<true><<<nBlocks, nThreads>>>(
1071+
gpu::computeLayerCellsKernel<true><<<nBlocks, nThreads, 0, streams[layer].get()>>>(
10711072
sortedClusters, // const Cluster**
10721073
unsortedClusters, // const Cluster**
10731074
tfInfo, // const TrackingFrameInfo**
@@ -1082,7 +1083,7 @@ void countCellsHandler(
10821083
maxChi2ClusterAttachment, // const float
10831084
cellDeltaTanLambdaSigma, // const float
10841085
nSigmaCut); // const float
1085-
gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1);
1086+
gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get());
10861087
}
10871088

10881089
void computeCellsHandler(
@@ -1102,9 +1103,10 @@ void computeCellsHandler(
11021103
const float cellDeltaTanLambdaSigma,
11031104
const float nSigmaCut,
11041105
const int nBlocks,
1105-
const int nThreads)
1106+
const int nThreads,
1107+
gpu::Streams& streams)
11061108
{
1107-
gpu::computeLayerCellsKernel<false><<<nBlocks, nThreads>>>(
1109+
gpu::computeLayerCellsKernel<false><<<nBlocks, nThreads, 0, streams[layer].get()>>>(
11081110
sortedClusters, // const Cluster**
11091111
unsortedClusters, // const Cluster**
11101112
tfInfo, // const TrackingFrameInfo**

0 commit comments

Comments
 (0)