Skip to content

Commit e7179fc

Browse files
authored
ITS-GPU: Cleanup for some host code (#13907)
* Cleanup * Fix nCells printout
1 parent 0a74715 commit e7179fc

File tree

5 files changed

+81
-65
lines changed

5 files changed

+81
-65
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ class TimeFrameGPU : public TimeFrame
7777
void createCellsDevice();
7878
void createCellsLUTDevice();
7979
void createNeighboursIndexTablesDevice();
80-
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
80+
void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours);
81+
void createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours);
8182
void createNeighboursLUTDevice(const int, const unsigned int);
8283
void createNeighboursDeviceArray();
8384
void createTrackITSExtDevice(std::vector<CellSeed>&);
@@ -151,6 +152,9 @@ class TimeFrameGPU : public TimeFrame
151152
gsl::span<Tracklet*> getDeviceTracklet() { return mTrackletsDevice; }
152153
gsl::span<CellSeed*> getDeviceCells() { return mCellsDevice; }
153154

155+
// Overridden getters
156+
int getNumberOfCells() const;
157+
154158
private:
155159
void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
156160
bool mHostRegistered = false;
@@ -252,6 +256,12 @@ inline std::vector<unsigned int> TimeFrameGPU<nLayers>::getClusterSizes()
252256
return sizes;
253257
}
254258

259+
template <int nLayers>
260+
inline int TimeFrameGPU<nLayers>::getNumberOfCells() const
261+
{
262+
return std::accumulate(mNCells.begin(), mNCells.end(), 0);
263+
}
264+
255265
} // namespace gpu
256266
} // namespace its
257267
} // namespace o2

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -148,19 +148,19 @@ void computeCellsHandler(const Cluster** sortedClusters,
148148
const int nBlocks,
149149
const int nThreads);
150150

151-
void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
152-
int* neighboursLUTs,
153-
int** cellsLUTs,
154-
gpuPair<int, int>* cellNeighbours,
155-
int* neighboursIndexTable,
156-
const float maxChi2ClusterAttachment,
157-
const float bz,
158-
const int layerIndex,
159-
const unsigned int nCells,
160-
const unsigned int nCellsNext,
161-
const int maxCellNeighbours,
162-
const int nBlocks,
163-
const int nThreads);
151+
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
152+
int* neighboursLUTs,
153+
int** cellsLUTs,
154+
gpuPair<int, int>* cellNeighbours,
155+
int* neighboursIndexTable,
156+
const float maxChi2ClusterAttachment,
157+
const float bz,
158+
const int layerIndex,
159+
const unsigned int nCells,
160+
const unsigned int nCellsNext,
161+
const int maxCellNeighbours,
162+
const int nBlocks,
163+
const int nThreads);
164164

165165
void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
166166
int* neighboursLUTs,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,19 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(std::vector<CellSeed>& seeds)
406406
}
407407

408408
template <int nLayers>
409-
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours)
409+
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
410+
{
411+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
412+
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
413+
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
414+
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
415+
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
416+
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
417+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
418+
}
419+
420+
template <int nLayers>
421+
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
410422
{
411423
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
412424
mCellsNeighbours[layer].clear();

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -212,36 +212,30 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
212212
{
213213
mTimeFrameGPU->createNeighboursIndexTablesDevice();
214214
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
215-
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
216215
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
217216
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
218-
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
219-
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
220217

221-
// if (mTimeFrameGPU->getCells()[iLayer + 1].empty() ||
222-
// mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) {
223-
// mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
224-
// continue;
225-
// }
218+
if (!nextLayerCellsNum) {
219+
continue;
220+
}
226221

227222
mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
228-
countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
229-
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
230-
mTimeFrameGPU->getDeviceArrayCellsLUT(),
231-
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
232-
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
233-
mTrkParams[0].MaxChi2ClusterAttachment,
234-
mBz,
235-
iLayer,
236-
mTimeFrameGPU->getNCells()[iLayer],
237-
nextLayerCellsNum,
238-
1e2,
239-
conf.nBlocks,
240-
conf.nThreads);
241-
mTimeFrameGPU->downloadNeighboursLUTDevice(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer);
242-
// Get the number of found cells from LUT
243-
cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back());
244-
mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]);
223+
unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
224+
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
225+
mTimeFrameGPU->getDeviceArrayCellsLUT(),
226+
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
227+
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
228+
mTrkParams[0].MaxChi2ClusterAttachment,
229+
mBz,
230+
iLayer,
231+
mTimeFrameGPU->getNCells()[iLayer],
232+
nextLayerCellsNum,
233+
1e2,
234+
conf.nBlocks,
235+
conf.nThreads);
236+
237+
mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh);
238+
245239
computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
246240
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
247241
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -255,13 +249,11 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
255249
1e2,
256250
conf.nBlocks,
257251
conf.nThreads);
258-
mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
259-
mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());
260252

261253
filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
262254
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
263255
mTimeFrameGPU->getDeviceNeighbours(iLayer),
264-
cellsNeighboursLayer[iLayer].size());
256+
nNeigh);
265257
}
266258
mTimeFrameGPU->createNeighboursDeviceArray();
267259
mTimeFrameGPU->unregisterRest();

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,19 +1066,19 @@ void computeCellsHandler(
10661066
nSigmaCut); // const float
10671067
}
10681068

1069-
void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
1070-
int* neighboursLUT,
1071-
int** cellsLUTs,
1072-
gpuPair<int, int>* cellNeighbours,
1073-
int* neighboursIndexTable,
1074-
const float maxChi2ClusterAttachment,
1075-
const float bz,
1076-
const int layerIndex,
1077-
const unsigned int nCells,
1078-
const unsigned int nCellsNext,
1079-
const int maxCellNeighbours,
1080-
const int nBlocks,
1081-
const int nThreads)
1069+
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
1070+
int* neighboursLUT,
1071+
int** cellsLUTs,
1072+
gpuPair<int, int>* cellNeighbours,
1073+
int* neighboursIndexTable,
1074+
const float maxChi2ClusterAttachment,
1075+
const float bz,
1076+
const int layerIndex,
1077+
const unsigned int nCells,
1078+
const unsigned int nCellsNext,
1079+
const int maxCellNeighbours,
1080+
const int nBlocks,
1081+
const int nThreads)
10821082
{
10831083
gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
10841084
cellsLayersDevice,
@@ -1091,8 +1091,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
10911091
layerIndex,
10921092
nCells,
10931093
maxCellNeighbours);
1094-
// gpuCheckError(cudaPeekAtLastError());
1095-
// gpuCheckError(cudaDeviceSynchronize());
1094+
10961095
void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
10971096
size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
10981097
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
@@ -1102,28 +1101,31 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
11021101
nCellsNext)); // num_items
11031102

11041103
discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
1105-
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
1106-
temp_storage_bytes, // temp_storage_bytes
1107-
neighboursLUT, // d_in
1108-
neighboursLUT, // d_out
1109-
nCellsNext)); // num_items
1104+
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
1105+
temp_storage_bytes, // temp_storage_bytes
1106+
neighboursLUT, // d_in
1107+
neighboursLUT, // d_out
1108+
nCellsNext)); // num_items
1109+
11101110
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
11111111
temp_storage_bytes_2, // temp_storage_bytes
11121112
neighboursIndexTable, // d_in
11131113
neighboursIndexTable, // d_out
11141114
nCells + 1, // num_items
11151115
0)); // NOLINT: this is the offset of the sum, not a pointer
1116+
11161117
discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
11171118
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
11181119
temp_storage_bytes_2, // temp_storage_bytes
11191120
neighboursIndexTable, // d_in
11201121
neighboursIndexTable, // d_out
11211122
nCells + 1, // num_items
11221123
0)); // NOLINT: this is the offset of the sum, not a pointer
1124+
unsigned int nNeighbours;
1125+
gpuCheckError(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost));
11231126
gpuCheckError(cudaFree(d_temp_storage));
11241127
gpuCheckError(cudaFree(d_temp_storage_2));
1125-
gpuCheckError(cudaPeekAtLastError());
1126-
gpuCheckError(cudaDeviceSynchronize());
1128+
return nNeighbours;
11271129
}
11281130

11291131
void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,

0 commit comments

Comments
 (0)