Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ class TimeFrameGPU : public TimeFrame
void createCellsBuffers(const int);
void createCellsDevice();
void createCellsLUTDevice();
void createNeighboursDevice();
void createNeighboursIndexTablesDevice();
void createNeighboursDevice(const unsigned int& layer, std::vector<std::pair<int, int>>& neighbours);
void createNeighboursLUTDevice(const int, const unsigned int);
void createNeighboursDeviceArray();
void createTrackITSExtDevice(std::vector<CellSeed>&);
void downloadTrackITSExtDevice(std::vector<CellSeed>&);
void downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>&, const int);
Expand Down Expand Up @@ -113,7 +114,10 @@ class TimeFrameGPU : public TimeFrame
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
TrackITSExt* getDeviceTrackITSExt() { return mTrackITSExtDevice; }
int* getDeviceNeighboursLUT(const int layer) { return mNeighboursLUTDevice[layer]; }
gpuPair<int, int>* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
gsl::span<int*> getDeviceNeighboursLUTs() { return mNeighboursLUTDevice; }
gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
Expand Down Expand Up @@ -195,7 +199,9 @@ class TimeFrameGPU : public TimeFrame

Road<nLayers - 2>* mRoadsDevice;
TrackITSExt* mTrackITSExtDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighboursDevice;
std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
std::array<int*, nLayers - 2> mNeighboursDevice;
int** mNeighboursDeviceArray;
std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -176,9 +176,31 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
const int nBlocks,
const int nThreads);

void filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
unsigned int);
int filterCellNeighboursHandler(std::vector<int>&,
gpuPair<int, int>*,
int*,
unsigned int);

template <int nLayers = 7>
void processNeighboursHandler(const int startLayer,
const int startLevel,
CellSeed** allCellSeeds,
CellSeed* currentCellSeeds,
const unsigned int nCurrentCells,
const unsigned char** usedClusters,
int* neighbours,
gsl::span<int*> neighboursDeviceLUTs,
const TrackingFrameInfo** foundTrackingFrameInfo,
const float bz,
const float MaxChi2ClusterAttachment,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType,
const std::vector<int>& lastCellIdHost, // temporary host vector
const std::vector<CellSeed>& lastCellSeedHost, // temporary host vector
std::vector<int>& updatedCellIdHost, // temporary host vector
std::vector<CellSeed>& updatedCellSeedHost, // temporary host vector
const int nBlocks,
const int nThreads);

void trackSeedHandler(CellSeed* trackSeeds,
const TrackingFrameInfo** foundTrackingFrameInfo,
Expand Down
38 changes: 25 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,28 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursDevice()
void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
// Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
}
allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, getExtAllocator());
checkGPUError(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

Expand Down Expand Up @@ -400,19 +411,20 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int& layer, st
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
mCellsNeighbours[layer].clear();
mCellsNeighbours[layer].resize(neighbours.size());
LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemsetAsync(mNeighboursDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), &(mGpuStreams[0]), getExtAllocator());
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

template <int nLayers>
void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
{
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, getExtAllocator()); // We need one element more to move exc -> inc
checkGPUError(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), &(mGpuStreams[0]), getExtAllocator());
checkGPUError(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
}

Expand Down Expand Up @@ -459,7 +471,7 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vecto
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
// TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighboursDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
checkGPUError(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
}

template <int nLayers>
Expand Down
Loading
Loading