Skip to content

Commit 320de2d

Browse files
committed
Add processNeighbours GPU kernel and handler
1 parent 6fa29aa commit 320de2d

File tree

8 files changed

+422
-101
lines changed

8 files changed

+422
-101
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
7676
void createCellsBuffers(const int);
7777
void createCellsDevice();
7878
void createCellsLUTDevice();
79-
void createNeighboursDevice();
79+
void createNeighboursIndexTablesDevice();
8080
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
8181
void createNeighboursLUTDevice(const int, const unsigned int);
82+
void createNeighboursDeviceArray();
8283
void createTrackITSExtDevice(std::vector<CellSeed>&);
8384
void downloadTrackITSExtDevice(std::vector<CellSeed>&);
8485
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
@@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame
113114
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
114115
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
115116
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
116-
gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
117+
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
118+
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
119+
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
120+
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
117121
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
118122
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
119123
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame
195199

196200
Road<nLayers - 2>* mRoadsDevice;
197201
TrackITSExt* mTrackITSExtDevice;
198-
std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
202+
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
203+
std::array<int*, nLayers - 2> mNeighboursDevice;
204+
int** mNeighboursDeviceArray;
199205
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
200206
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
201207

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ GPUg() void fitTrackSeedsKernel(
4040
CellSeed* trackSeeds,
4141
const TrackingFrameInfo** foundTrackingFrameInfo,
4242
o2::its::TrackITSExt* tracks,
43+
const float* minPts,
4344
const unsigned int nSeeds,
4445
const float Bz,
4546
const int startLevel,
@@ -175,13 +176,34 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
175176
const int nBlocks,
176177
const int nThreads);
177178

178-
void filterCellNeighboursHandler(std::vector<int>&,
179-
gpuPair<int, int>*,
180-
unsigned int);
179+
int filterCellNeighboursHandler(std::vector<int>&,
180+
gpuPair<int, int>*,
181+
int*,
182+
unsigned int);
183+
184+
template <int nLayers = 7>
185+
void processNeighboursHandler(const int startLayer,
186+
const int startLevel,
187+
CellSeed** allCellSeeds,
188+
CellSeed* currentCellSeeds,
189+
const unsigned int nCurrentCells,
190+
const unsigned char** usedClusters,
191+
int* neighbours,
192+
gsl::span<int*> neighboursDeviceLUTs,
193+
const TrackingFrameInfo** foundTrackingFrameInfo,
194+
const float bz,
195+
const float MaxChi2ClusterAttachment,
196+
const o2::base::Propagator* propagator,
197+
const o2::base::PropagatorF::MatCorrType matCorrType,
198+
std::vector<int>& updatedCellIdHost, // temporary host vectors
199+
std::vector<CellSeed>& updatedCellSeedHost, // temporary host vectors
200+
const int nBlocks,
201+
const int nThreads);
181202

182203
void trackSeedHandler(CellSeed* trackSeeds,
183204
const TrackingFrameInfo** foundTrackingFrameInfo,
184205
o2::its::TrackITSExt* tracks,
206+
std::vector<float>& minPtsHost,
185207
const unsigned int nSeeds,
186208
const float Bz,
187209
const int startLevel,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
if(CUDA_ENABLED)
1414
find_package(CUDAToolkit)
1515
message(STATUS "Building ITS CUDA tracker")
16-
# add_compile_options(-O0 -g -lineinfo -fPIC)
16+
add_compile_options(-O0 -g -lineinfo -fPIC)
1717
# add_compile_definitions(ITS_MEASURE_GPU_TIME)
1818
o2_add_library(ITStrackingCUDA
1919
SOURCES ClusterLinesGPU.cu

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
306306
}
307307

308308
template <int nLayers>
309-
void TimeFrameGPU<nLayers>::createNeighboursDevice()
309+
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
310310
{
311-
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
311+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
312+
// Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
313+
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
314+
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
315+
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
312316
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
313317
LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
314318
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
315319
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
316320
}
317-
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
318-
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
319-
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
321+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
322+
}
323+
324+
template <int nLayers>
325+
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
326+
{
327+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
328+
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
329+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
330+
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
320331
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
321332
}
322333

@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
400411
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
401412
mCellsNeighbours[layer].clear();
402413
mCellsNeighbours[layer].resize(neighbours.size());
414+
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
415+
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
416+
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
403417
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
404-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
405-
checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
418+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
406419
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
407420
}
408421

409422
template <int nLayers>
410-
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
423+
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
411424
{
412-
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
413-
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
414-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
415-
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
425+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
426+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
427+
checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
416428
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
417429
}
418430

@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
459471
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
460472
LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
461473
// TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
462-
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
474+
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
463475
}
464476

465477
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,12 @@ template <int nLayers>
9191
void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
9292
{
9393
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
94-
// TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
9594
mTimeFrameGPU->createTrackletsLUTDevice(iteration);
9695

9796
const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
9897
gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
9998
int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
100-
int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
99+
int endROF{o2::gpu::CAMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())};
101100

102101
countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
103102
mTimeFrameGPU->getDeviceMultCutMask(),
@@ -214,7 +213,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
214213
template <int nLayers>
215214
void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
216215
{
217-
mTimeFrameGPU->createNeighboursDevice();
216+
mTimeFrameGPU->createNeighboursIndexTablesDevice();
218217
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
219218
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
220219
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
@@ -228,17 +227,16 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
228227
continue;
229228
}
230229

231-
int layerCellsNum{static_cast<int>(mTimeFrameGPU->getCells()[iLayer].size())};
232230
mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
233231
countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
234232
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
235233
mTimeFrameGPU->getDeviceArrayCellsLUT(),
236-
mTimeFrameGPU->getDeviceNeighbours(iLayer),
234+
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
237235
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
238236
mTrkParams[0].MaxChi2ClusterAttachment,
239237
mBz,
240238
iLayer,
241-
layerCellsNum,
239+
mTimeFrameGPU->getNCells()[iLayer],
242240
nextLayerCellsNum,
243241
1e2,
244242
conf.nBlocks,
@@ -250,12 +248,12 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
250248
computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
251249
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
252250
mTimeFrameGPU->getDeviceArrayCellsLUT(),
253-
mTimeFrameGPU->getDeviceNeighbours(iLayer),
251+
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
254252
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
255253
mTrkParams[0].MaxChi2ClusterAttachment,
256254
mBz,
257255
iLayer,
258-
layerCellsNum,
256+
mTimeFrameGPU->getNCells()[iLayer],
259257
nextLayerCellsNum,
260258
1e2,
261259
conf.nBlocks,
@@ -264,24 +262,46 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
264262
mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
265263

266264
filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
265+
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
267266
mTimeFrameGPU->getDeviceNeighbours(iLayer),
268267
cellsNeighboursLayer[iLayer].size());
269268
}
269+
mTimeFrameGPU->createNeighboursDeviceArray();
270270
mTimeFrameGPU->downloadCellsDevice();
271271
mTimeFrameGPU->unregisterRest();
272272
};
273273

274274
template <int nLayers>
275275
void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
276276
{
277+
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
277278
for (int startLevel{mTrkParams[iteration].CellsPerRoad()}; startLevel >= mTrkParams[iteration].CellMinimumLevel(); --startLevel) {
278279
const int minimumLayer{startLevel - 1};
279280
std::vector<CellSeed> trackSeeds;
280281
for (int startLayer{mTrkParams[iteration].CellsPerRoad() - 1}; startLayer >= minimumLayer; --startLayer) {
282+
if ((mTrkParams[iteration].StartLayerMask & (1 << (startLayer + 2))) == 0) {
283+
continue;
284+
}
281285
std::vector<int> lastCellId, updatedCellId;
282286
std::vector<CellSeed> lastCellSeed, updatedCellSeed;
283287

284-
processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
288+
processNeighboursHandler<nLayers>(startLayer,
289+
startLevel,
290+
mTimeFrameGPU->getDeviceArrayCells(),
291+
mTimeFrameGPU->getDeviceCells()[startLayer],
292+
mTimeFrameGPU->getNCells()[startLayer],
293+
mTimeFrameGPU->getDeviceArrayUsedClusters(),
294+
mTimeFrameGPU->getDeviceNeighbours(startLayer - 1),
295+
mTimeFrameGPU->getDeviceNeighboursLUTs(),
296+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
297+
mBz,
298+
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
299+
mTimeFrameGPU->getDevicePropagator(),
300+
mCorrType,
301+
updatedCellId, // temporary host vectors
302+
updatedCellSeed, // temporary host vectors
303+
conf.nBlocks,
304+
conf.nThreads);
285305

286306
int level = startLevel;
287307
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
@@ -304,15 +324,16 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
304324
}
305325
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
306326
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
307-
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
308-
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds,
309-
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo,
310-
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks,
311-
trackSeeds.size(), // const size_t nSeeds,
312-
mBz, // const float Bz,
327+
328+
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds
329+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo
330+
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt* tracks
331+
mTrkParams[iteration].MinPt, // std::vector<float>& minPtsHost,
332+
trackSeeds.size(), // const size_t nSeeds
333+
mBz, // const float Bz
313334
startLevel, // const int startLevel,
314-
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment,
315-
mTrkParams[0].MaxChi2NDF, // float maxChi2NDF,
335+
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
336+
mTrkParams[0].MaxChi2NDF, // float maxChi2NDF
316337
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
317338
mCorrType, // o2::base::PropagatorImpl<float>::MatCorrType
318339
conf.nBlocks,
@@ -367,8 +388,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
367388
}
368389
mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
369390
}
391+
mTimeFrameGPU->loadUsedClustersDevice();
370392
}
371-
mTimeFrameGPU->loadUsedClustersDevice();
372393
if (iteration == mTrkParams.size() - 1) {
373394
mTimeFrameGPU->unregisterHostMemory(0);
374395
}

0 commit comments

Comments
 (0)