Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ class TimeFrameGPU : public TimeFrame
void createCellsDevice();
void createCellsLUTDevice();
void createNeighboursIndexTablesDevice();
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours);
void createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours);
void createNeighboursLUTDevice(const int, const unsigned int);
void createNeighboursDeviceArray();
void createTrackITSExtDevice(std::vector<CellSeed>&);
Expand Down Expand Up @@ -151,6 +152,9 @@ class TimeFrameGPU : public TimeFrame
gsl::span<Tracklet*> getDeviceTracklet() { return mTrackletsDevice; }
gsl::span<CellSeed*> getDeviceCells() { return mCellsDevice; }

// Overridden getters
int getNumberOfCells() const;

private:
void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
bool mHostRegistered = false;
Expand Down Expand Up @@ -252,6 +256,12 @@ inline std::vector<unsigned int> TimeFrameGPU<nLayers>::getClusterSizes()
return sizes;
}

template <int nLayers>
inline int TimeFrameGPU<nLayers>::getNumberOfCells() const
{
return std::accumulate(mNCells.begin(), mNCells.end(), 0);
}

} // namespace gpu
} // namespace its
} // namespace o2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,19 @@ void computeCellsHandler(const Cluster** sortedClusters,
const int nBlocks,
const int nThreads);

void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
int* neighboursLUTs,
int** cellsLUTs,
gpuPair<int, int>* cellNeighbours,
int* neighboursIndexTable,
const float maxChi2ClusterAttachment,
const float bz,
const int layerIndex,
const unsigned int nCells,
const unsigned int nCellsNext,
const int maxCellNeighbours,
const int nBlocks,
const int nThreads);
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
int* neighboursLUTs,
int** cellsLUTs,
gpuPair<int, int>* cellNeighbours,
int* neighboursIndexTable,
const float maxChi2ClusterAttachment,
const float bz,
const int layerIndex,
const unsigned int nCells,
const unsigned int nCellsNext,
const int maxCellNeighbours,
const int nBlocks,
const int nThreads);

void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
int* neighboursLUTs,
Expand Down
14 changes: 13 additions & 1 deletion Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,19 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(std::vector<CellSeed>& seeds)
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours)
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
mCellsNeighbours[layer].clear();
Expand Down
48 changes: 20 additions & 28 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -212,36 +212,30 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
{
mTimeFrameGPU->createNeighboursIndexTablesDevice();
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);

// if (mTimeFrameGPU->getCells()[iLayer + 1].empty() ||
// mTimeFrameGPU->getCellsLookupTable()[iLayer].empty()) {
// mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
// continue;
// }
if (!nextLayerCellsNum) {
continue;
}

mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
mTimeFrameGPU->getDeviceArrayCellsLUT(),
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
mTrkParams[0].MaxChi2ClusterAttachment,
mBz,
iLayer,
mTimeFrameGPU->getNCells()[iLayer],
nextLayerCellsNum,
1e2,
conf.nBlocks,
conf.nThreads);
mTimeFrameGPU->downloadNeighboursLUTDevice(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer], iLayer);
// Get the number of found cells from LUT
cellsNeighboursLayer[iLayer].resize(mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].back());
mTimeFrameGPU->createNeighboursDevice(iLayer, cellsNeighboursLayer[iLayer]);
unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
mTimeFrameGPU->getDeviceArrayCellsLUT(),
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
mTrkParams[0].MaxChi2ClusterAttachment,
mBz,
iLayer,
mTimeFrameGPU->getNCells()[iLayer],
nextLayerCellsNum,
1e2,
conf.nBlocks,
conf.nThreads);

mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh);

computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
mTimeFrameGPU->getDeviceArrayCellsLUT(),
Expand All @@ -255,13 +249,11 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
1e2,
conf.nBlocks,
conf.nThreads);
mTimeFrameGPU->getCellsNeighbours()[iLayer].clear();
mTimeFrameGPU->getCellsNeighbours()[iLayer].reserve(cellsNeighboursLayer[iLayer].size());

filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer],
mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
mTimeFrameGPU->getDeviceNeighbours(iLayer),
cellsNeighboursLayer[iLayer].size());
nNeigh);
}
mTimeFrameGPU->createNeighboursDeviceArray();
mTimeFrameGPU->unregisterRest();
Expand Down
46 changes: 24 additions & 22 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1066,19 +1066,19 @@ void computeCellsHandler(
nSigmaCut); // const float
}

void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
int* neighboursLUT,
int** cellsLUTs,
gpuPair<int, int>* cellNeighbours,
int* neighboursIndexTable,
const float maxChi2ClusterAttachment,
const float bz,
const int layerIndex,
const unsigned int nCells,
const unsigned int nCellsNext,
const int maxCellNeighbours,
const int nBlocks,
const int nThreads)
unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
int* neighboursLUT,
int** cellsLUTs,
gpuPair<int, int>* cellNeighbours,
int* neighboursIndexTable,
const float maxChi2ClusterAttachment,
const float bz,
const int layerIndex,
const unsigned int nCells,
const unsigned int nCellsNext,
const int maxCellNeighbours,
const int nBlocks,
const int nThreads)
{
gpu::computeLayerCellNeighboursKernel<true><<<nBlocks, nThreads>>>(
cellsLayersDevice,
Expand All @@ -1091,8 +1091,7 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
layerIndex,
nCells,
maxCellNeighbours);
// gpuCheckError(cudaPeekAtLastError());
// gpuCheckError(cudaDeviceSynchronize());

void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr;
size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0;
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
Expand All @@ -1102,28 +1101,31 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
nCellsNext)); // num_items

discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
temp_storage_bytes, // temp_storage_bytes
neighboursLUT, // d_in
neighboursLUT, // d_out
nCellsNext)); // num_items
gpuCheckError(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage
temp_storage_bytes, // temp_storage_bytes
neighboursLUT, // d_in
neighboursLUT, // d_out
nCellsNext)); // num_items

gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
temp_storage_bytes_2, // temp_storage_bytes
neighboursIndexTable, // d_in
neighboursIndexTable, // d_out
nCells + 1, // num_items
0)); // NOLINT: this is the offset of the sum, not a pointer

discardResult(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2));
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage
temp_storage_bytes_2, // temp_storage_bytes
neighboursIndexTable, // d_in
neighboursIndexTable, // d_out
nCells + 1, // num_items
0)); // NOLINT: this is the offset of the sum, not a pointer
unsigned int nNeighbours;
gpuCheckError(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost));
gpuCheckError(cudaFree(d_temp_storage));
gpuCheckError(cudaFree(d_temp_storage_2));
gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
return nNeighbours;
}

void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
Expand Down
Loading