Skip to content

Commit 853e48d

Browse files
committed
ITS: GPU: put cell neighbour finding on different streams
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 2498a68 commit 853e48d

File tree

5 files changed

+101
-129
lines changed

5 files changed

+101
-129
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,17 +62,15 @@ class TimeFrameGPU : public TimeFrame<nLayers>
6262
void createCellsDevice();
6363
void createCellsLUTDevice();
6464
void createNeighboursIndexTablesDevice();
65-
void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours);
66-
void createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours);
65+
void createNeighboursDevice(const unsigned int layer);
6766
void createNeighboursLUTDevice(const int, const unsigned int);
68-
void createNeighboursDeviceArray();
6967
void createTrackITSExtDevice(bounded_vector<CellSeed>&);
7068
void downloadTrackITSExtDevice(bounded_vector<CellSeed>&);
7169
void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
7270
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
7371
void downloadCellsDevice();
7472
void downloadCellsLUTDevice();
75-
void unregisterRest();
73+
auto& getStream(const size_t stream) { return mGpuStreams[stream]; }
7674
auto& getStreams() { return mGpuStreams; }
7775
void syncStream(const size_t stream);
7876
void syncStreams();
@@ -96,7 +94,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
9694
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
9795
std::array<int*, nLayers - 2>& getDeviceNeighboursAll() { return mNeighboursDevice; }
9896
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
99-
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
97+
int** getDeviceNeighboursArray() { return mNeighboursDevice.data(); }
10098
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
10199
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
102100
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -109,7 +107,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
109107
int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
110108
int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
111109
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
112-
CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
110+
CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); }
113111
CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
114112
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
115113
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
@@ -176,7 +174,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
176174
std::array<CellSeed*, nLayers - 2> mCellsDevice;
177175
std::array<int*, nLayers - 2> mNeighboursIndexTablesDevice;
178176
CellSeed* mTrackSeedsDevice;
179-
CellSeed** mCellsDeviceArray;
180177
std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
181178
o2::track::TrackParCovF** mCellSeedsDeviceArray;
182179
std::array<float*, nLayers - 2> mCellSeedsChi2Device;
@@ -186,7 +183,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
186183
TrackITSExt* mTrackITSExtDevice;
187184
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
188185
std::array<int*, nLayers - 2> mNeighboursDevice;
189-
int** mNeighboursDeviceArray;
190186
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
191187
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
192188

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -162,21 +162,22 @@ void computeCellsHandler(const Cluster** sortedClusters,
162162
const int nThreads,
163163
gpu::Streams& streams);
164164

165-
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
166-
int* neighboursLUTs,
167-
int** cellsLUTs,
168-
gpuPair<int, int>* cellNeighbours,
169-
int* neighboursIndexTable,
170-
const Tracklet** tracklets,
171-
const int deltaROF,
172-
const float maxChi2ClusterAttachment,
173-
const float bz,
174-
const int layerIndex,
175-
const unsigned int nCells,
176-
const unsigned int nCellsNext,
177-
const int maxCellNeighbours,
178-
const int nBlocks,
179-
const int nThreads);
165+
void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
166+
int* neighboursLUTs,
167+
int** cellsLUTs,
168+
gpuPair<int, int>* cellNeighbours,
169+
int* neighboursIndexTable,
170+
const Tracklet** tracklets,
171+
const int deltaROF,
172+
const float maxChi2ClusterAttachment,
173+
const float bz,
174+
const int layerIndex,
175+
const unsigned int nCells,
176+
const unsigned int nCellsNext,
177+
const int maxCellNeighbours,
178+
const int nBlocks,
179+
const int nThreads,
180+
gpu::Stream& stream);
180181

181182
void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
182183
int* neighboursLUTs,
@@ -192,11 +193,13 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
192193
const unsigned int nCellsNext,
193194
const int maxCellNeighbours,
194195
const int nBlocks,
195-
const int nThreads);
196+
const int nThreads,
197+
gpu::Stream& stream);
196198

197199
int filterCellNeighboursHandler(gpuPair<int, int>*,
198200
int*,
199201
unsigned int,
202+
gpu::Stream&,
200203
o2::its::ExternalAllocator* = nullptr);
201204

202205
template <int nLayers = 7>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 11 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -349,26 +349,20 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
349349
{
350350
GPUTimer timer(mGpuStreams[0], "creating cells neighbours");
351351
// Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
352-
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
353-
GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
354-
GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
355352
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
356353
GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB);
357-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
358-
GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
359-
if (iLayer < nLayers - 3) {
360-
mNNeighbours[iLayer] = 0;
361-
}
354+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
355+
GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get()));
362356
}
363357
}
364358

365359
template <int nLayers>
366360
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
367361
{
368-
GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT");
362+
GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT");
369363
GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB);
370-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc
371-
GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
364+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); // We need one element more to move exc -> inc
365+
GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get()));
372366
}
373367

374368
template <int nLayers>
@@ -382,8 +376,6 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
382376
GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
383377
GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
384378
}
385-
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
386-
GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
387379
}
388380

389381
template <int nLayers>
@@ -441,35 +433,15 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds
441433
}
442434

443435
template <int nLayers>
444-
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
436+
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
445437
{
446-
GPUTimer timer(mGpuStreams[0], "reserving neighbours");
438+
GPUTimer timer(mGpuStreams[layer], "reserving neighbours");
439+
GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
447440
GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
448-
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
449-
GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
441+
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer], this->getExtAllocator());
442+
GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer].get()));
450443
GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
451-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator());
452-
}
453-
454-
template <int nLayers>
455-
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
456-
{
457-
GPUTimer timer(mGpuStreams[0], "reserving neighbours");
458-
this->mCellsNeighbours[layer].clear();
459-
this->mCellsNeighbours[layer].resize(neighbours.size());
460-
GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
461-
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
462-
GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
463-
GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
464-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
465-
}
466-
467-
template <int nLayers>
468-
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
469-
{
470-
GPUTimer timer(mGpuStreams[0], "reserving neighbours");
471-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
472-
GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
444+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator());
473445
}
474446

475447
template <int nLayers>
@@ -532,15 +504,6 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>&
532504
GPUChkErrS(cudaHostUnregister(seeds.data()));
533505
}
534506

535-
template <int nLayers>
536-
void TimeFrameGPU<nLayers>::unregisterRest()
537-
{
538-
GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory");
539-
GPULog("unregistering rest of the host memory...");
540-
GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
541-
// GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
542-
}
543-
544507
template <int nLayers>
545508
void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
546509
{

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 38 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -180,15 +180,27 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
180180
conf.nThreadsLayerCells[iteration],
181181
mTimeFrameGPU->getStreams());
182182
}
183-
mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
184183
}
185184

186185
template <int nLayers>
187186
void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
188187
{
189188
mTimeFrameGPU->createNeighboursIndexTablesDevice();
190189
const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
190+
191+
std::vector<bool> isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1);
192+
auto syncOnce = [&](const int iLayer) {
193+
if (!isCellStreamSynched[iLayer]) {
194+
mTimeFrameGPU->syncStream(iLayer);
195+
isCellStreamSynched[iLayer] = true;
196+
}
197+
};
198+
191199
for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
200+
// ensure that celling is done for iLayer and iLayer+1 is done
201+
syncOnce(iLayer);
202+
syncOnce(iLayer + 1);
203+
192204
const int currentLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer])};
193205
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
194206
if (!nextLayerCellsNum || !currentLayerCellsNum) {
@@ -197,24 +209,23 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
197209
}
198210

199211
mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
200-
unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
201-
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
202-
mTimeFrameGPU->getDeviceArrayCellsLUT(),
203-
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
204-
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
205-
(const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
206-
this->mTrkParams[0].DeltaROF,
207-
this->mTrkParams[0].MaxChi2ClusterAttachment,
208-
this->mBz,
209-
iLayer,
210-
currentLayerCellsNum,
211-
nextLayerCellsNum,
212-
1e2,
213-
conf.nBlocksFindNeighbours[iteration],
214-
conf.nThreadsFindNeighbours[iteration]);
215-
216-
mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh);
217-
212+
countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
213+
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
214+
mTimeFrameGPU->getDeviceArrayCellsLUT(),
215+
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
216+
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
217+
(const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
218+
this->mTrkParams[0].DeltaROF,
219+
this->mTrkParams[0].MaxChi2ClusterAttachment,
220+
this->mBz,
221+
iLayer,
222+
currentLayerCellsNum,
223+
nextLayerCellsNum,
224+
1e2,
225+
conf.nBlocksFindNeighbours[iteration],
226+
conf.nThreadsFindNeighbours[iteration],
227+
mTimeFrameGPU->getStream(iLayer));
228+
mTimeFrameGPU->createNeighboursDevice(iLayer);
218229
computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
219230
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
220231
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -229,16 +240,15 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
229240
nextLayerCellsNum,
230241
1e2,
231242
conf.nBlocksFindNeighbours[iteration],
232-
conf.nThreadsFindNeighbours[iteration]);
233-
234-
nNeigh = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
235-
mTimeFrameGPU->getDeviceNeighbours(iLayer),
236-
nNeigh,
237-
mTimeFrameGPU->getExternalAllocator());
238-
mTimeFrameGPU->getArrayNNeighbours()[iLayer] = nNeigh;
243+
conf.nThreadsFindNeighbours[iteration],
244+
mTimeFrameGPU->getStream(iLayer));
245+
mTimeFrameGPU->getArrayNNeighbours()[iLayer] = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
246+
mTimeFrameGPU->getDeviceNeighbours(iLayer),
247+
mTimeFrameGPU->getArrayNNeighbours()[iLayer],
248+
mTimeFrameGPU->getStream(iLayer),
249+
mTimeFrameGPU->getExternalAllocator());
239250
}
240-
mTimeFrameGPU->createNeighboursDeviceArray();
241-
mTimeFrameGPU->unregisterRest();
251+
mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
242252
};
243253

244254
template <int nLayers>

0 commit comments

Comments
 (0)