Skip to content

Commit d4e16e1

Browse files
authored
ITS: GPU: overlap memcpy with compute kernels (#14596)
1 parent 86424f9 commit d4e16e1

File tree

1 file changed

+24
-12
lines changed

1 file changed

+24
-12
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,18 +60,23 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
6060
int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0};
6161
int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};
6262

63-
for (int iLayer = 0; iLayer < nLayers; ++iLayer) {
64-
// TODO lazy loading of essential data on separate streams
63+
// start by queuing the loading of the data needed by the last two layers
64+
for (int iLayer{nLayers}; iLayer-- > nLayers - 2;) {
6565
mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer);
6666
mTimeFrameGPU->loadClustersDevice(iteration, iLayer);
67-
mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer);
6867
mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer);
6968
mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer);
7069
mTimeFrameGPU->recordEvent(iLayer);
7170
}
7271

73-
// processing starts here
74-
for (int iLayer{0}; iLayer < this->mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) {
72+
for (int iLayer{this->mTrkParams[iteration].TrackletsPerRoad()}; iLayer--;) {
73+
if (iLayer) { // queue loading the data of the next layer in parallel, so that the copies overlap with the computation kernels
74+
mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer - 1);
75+
mTimeFrameGPU->loadClustersDevice(iteration, iLayer - 1);
76+
mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer - 1);
77+
mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer - 1);
78+
mTimeFrameGPU->recordEvent(iLayer - 1);
79+
}
7580
mTimeFrameGPU->createTrackletsLUTDevice(iteration, iLayer);
7681
mTimeFrameGPU->waitEvent(iLayer, iLayer + 1); // wait stream until all data is available
7782
countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
@@ -91,7 +96,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
9196
mTimeFrameGPU->getDeviceArrayUsedClusters(),
9297
mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
9398
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
94-
mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums
99+
mTimeFrameGPU->getDeviceTrackletsLUTs(),
95100
iteration,
96101
this->mTrkParams[iteration].NSigmaCut,
97102
mTimeFrameGPU->getPhiCuts(),
@@ -107,7 +112,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
107112
mTimeFrameGPU->getStreams());
108113
mTimeFrameGPU->createTrackletsBuffers(iLayer);
109114
if (mTimeFrameGPU->getNTracklets()[iLayer] == 0) {
110-
return;
115+
continue;
111116
}
112117
computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
113118
mTimeFrameGPU->getDeviceMultCutMask(),
@@ -151,18 +156,25 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
151156
{
152157
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
153158

154-
for (int iLayer = 0; iLayer < nLayers; ++iLayer) {
155-
// TODO lazy loading of essential data on separate streams
159+
// start by queuing the loading of the data needed by the last three layers
160+
for (int iLayer{nLayers}; iLayer-- > nLayers - 3;) {
161+
mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer);
156162
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration, iLayer);
157163
mTimeFrameGPU->recordEvent(iLayer);
158164
}
159165

160-
for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
166+
for (int iLayer{this->mTrkParams[iteration].CellsPerRoad()}; iLayer--;) {
167+
if (iLayer) {
168+
mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer - 1);
169+
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration, iLayer - 1);
170+
mTimeFrameGPU->recordEvent(iLayer - 1);
171+
}
172+
161173
// if there are no tracklets skip entirely
162174
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
163175
if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) {
164176
mTimeFrameGPU->getNCells()[iLayer] = 0;
165-
return;
177+
continue;
166178
}
167179

168180
mTimeFrameGPU->createCellsLUTDevice(iLayer);
@@ -189,7 +201,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
189201
mTimeFrameGPU->getStreams());
190202
mTimeFrameGPU->createCellsBuffers(iLayer);
191203
if (mTimeFrameGPU->getNCells()[iLayer] == 0) {
192-
return;
204+
continue;
193205
}
194206
computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
195207
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),

0 commit comments

Comments
 (0)