Skip to content

Commit 07e4515

Browse files
authored
ITS::gpu: Add processNeighbours GPU kernel and handler (#13822)
1 parent e9e0633 commit 07e4515

File tree

6 files changed

+421
-88
lines changed

6 files changed

+421
-88
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
7676
void createCellsBuffers(const int);
7777
void createCellsDevice();
7878
void createCellsLUTDevice();
79-
void createNeighboursDevice();
79+
void createNeighboursIndexTablesDevice();
8080
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
8181
void createNeighboursLUTDevice(const int, const unsigned int);
82+
void createNeighboursDeviceArray();
8283
void createTrackITSExtDevice(std::vector<CellSeed>&);
8384
void downloadTrackITSExtDevice(std::vector<CellSeed>&);
8485
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
@@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame
113114
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
114115
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
115116
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
116-
gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
117+
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
118+
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
119+
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
120+
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
117121
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
118122
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
119123
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame
195199

196200
Road<nLayers - 2>* mRoadsDevice;
197201
TrackITSExt* mTrackITSExtDevice;
198-
std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
202+
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
203+
std::array<int*, nLayers - 2> mNeighboursDevice;
204+
int** mNeighboursDeviceArray;
199205
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
200206
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
201207

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,9 +176,31 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
176176
const int nBlocks,
177177
const int nThreads);
178178

179-
void filterCellNeighboursHandler(std::vector<int>&,
180-
gpuPair<int, int>*,
181-
unsigned int);
179+
int filterCellNeighboursHandler(std::vector<int>&,
180+
gpuPair<int, int>*,
181+
int*,
182+
unsigned int);
183+
184+
template <int nLayers = 7>
185+
void processNeighboursHandler(const int startLayer,
186+
const int startLevel,
187+
CellSeed** allCellSeeds,
188+
CellSeed* currentCellSeeds,
189+
const unsigned int nCurrentCells,
190+
const unsigned char** usedClusters,
191+
int* neighbours,
192+
gsl::span<int*> neighboursDeviceLUTs,
193+
const TrackingFrameInfo** foundTrackingFrameInfo,
194+
const float bz,
195+
const float MaxChi2ClusterAttachment,
196+
const o2::base::Propagator* propagator,
197+
const o2::base::PropagatorF::MatCorrType matCorrType,
198+
const std::vector<int>& lastCellIdHost, // temporary host vector
199+
const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
200+
std::vector<int>& updatedCellIdHost, // temporary host vector
201+
std::vector<CellSeed>& updatedCellSeedHost, // temporary host vector
202+
const int nBlocks,
203+
const int nThreads);
182204

183205
void trackSeedHandler(CellSeed* trackSeeds,
184206
const TrackingFrameInfo** foundTrackingFrameInfo,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
306306
}
307307

308308
template <int nLayers>
309-
void TimeFrameGPU<nLayers>::createNeighboursDevice()
309+
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
310310
{
311-
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
311+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
312+
// Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
313+
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
314+
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
315+
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
312316
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
313317
LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
314318
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
315319
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
316320
}
317-
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
318-
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
319-
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
321+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
322+
}
323+
324+
template <int nLayers>
325+
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
326+
{
327+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
328+
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
329+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
330+
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
320331
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
321332
}
322333

@@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
400411
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
401412
mCellsNeighbours[layer].clear();
402413
mCellsNeighbours[layer].resize(neighbours.size());
414+
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
415+
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
416+
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
403417
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
404-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
405-
checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
418+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
406419
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
407420
}
408421

409422
template <int nLayers>
410-
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
423+
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
411424
{
412-
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
413-
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
414-
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
415-
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
425+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
426+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
427+
checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
416428
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
417429
}
418430

@@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
459471
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
460472
LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
461473
// TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
462-
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
474+
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
463475
}
464476

465477
template <int nLayers>

0 commit comments

Comments
 (0)