ITS: GPU: put cell neighbour finding on different streams

f3sch · f3sch · commit 853e48dd3895 · 2025-08-09T16:51:33.000+02:00
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -62,17 +62,15 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   void createCellsDevice();
   void createCellsLUTDevice();
   void createNeighboursIndexTablesDevice();
-  void createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours);
-  void createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours);
+  void createNeighboursDevice(const unsigned int layer);
   void createNeighboursLUTDevice(const int, const unsigned int);
-  void createNeighboursDeviceArray();
   void createTrackITSExtDevice(bounded_vector<CellSeed>&);
   void downloadTrackITSExtDevice(bounded_vector<CellSeed>&);
   void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
   void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
   void downloadCellsDevice();
   void downloadCellsLUTDevice();
-  void unregisterRest();
+  auto& getStream(const size_t stream) { return mGpuStreams[stream]; }
   auto& getStreams() { return mGpuStreams; }
   void syncStream(const size_t stream);
   void syncStreams();
@@ -96,7 +94,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   gpuPair<int, int>* getDeviceNeighbourPairs(const int layer) { return mNeighbourPairsDevice[layer]; }
   std::array<int*, nLayers - 2>& getDeviceNeighboursAll() { return mNeighboursDevice; }
   int* getDeviceNeighbours(const int layer) { return mNeighboursDevice[layer]; }
-  int** getDeviceNeighboursArray() { return mNeighboursDeviceArray; }
+  int** getDeviceNeighboursArray() { return mNeighboursDevice.data(); }
   TrackingFrameInfo* getDeviceTrackingFrameInfo(const int);
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
@@ -109,7 +107,7 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
   int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
   int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
-  CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
+  CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); }
   CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
   o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
   float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
@@ -176,7 +174,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   std::array<CellSeed*, nLayers - 2> mCellsDevice;
   std::array<int*, nLayers - 2> mNeighboursIndexTablesDevice;
   CellSeed* mTrackSeedsDevice;
-  CellSeed** mCellsDeviceArray;
   std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
   o2::track::TrackParCovF** mCellSeedsDeviceArray;
   std::array<float*, nLayers - 2> mCellSeedsChi2Device;
@@ -186,7 +183,6 @@ class TimeFrameGPU : public TimeFrame<nLayers>
   TrackITSExt* mTrackITSExtDevice;
   std::array<gpuPair<int, int>*, nLayers - 2> mNeighbourPairsDevice;
   std::array<int*, nLayers - 2> mNeighboursDevice;
-  int** mNeighboursDeviceArray;
   std::array<TrackingFrameInfo*, nLayers> mTrackingFrameInfoDevice;
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -162,21 +162,22 @@ void computeCellsHandler(const Cluster** sortedClusters,
                          const int nThreads,
                          gpu::Streams& streams);
 
-unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice,
-                                        int* neighboursLUTs,
-                                        int** cellsLUTs,
-                                        gpuPair<int, int>* cellNeighbours,
-                                        int* neighboursIndexTable,
-                                        const Tracklet** tracklets,
-                                        const int deltaROF,
-                                        const float maxChi2ClusterAttachment,
-                                        const float bz,
-                                        const int layerIndex,
-                                        const unsigned int nCells,
-                                        const unsigned int nCellsNext,
-                                        const int maxCellNeighbours,
-                                        const int nBlocks,
-                                        const int nThreads);
+void countCellNeighboursHandler(CellSeed** cellsLayersDevice,
+                                int* neighboursLUTs,
+                                int** cellsLUTs,
+                                gpuPair<int, int>* cellNeighbours,
+                                int* neighboursIndexTable,
+                                const Tracklet** tracklets,
+                                const int deltaROF,
+                                const float maxChi2ClusterAttachment,
+                                const float bz,
+                                const int layerIndex,
+                                const unsigned int nCells,
+                                const unsigned int nCellsNext,
+                                const int maxCellNeighbours,
+                                const int nBlocks,
+                                const int nThreads,
+                                gpu::Stream& stream);
 
 void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   int* neighboursLUTs,
@@ -192,11 +193,13 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   const unsigned int nCellsNext,
                                   const int maxCellNeighbours,
                                   const int nBlocks,
-                                  const int nThreads);
+                                  const int nThreads,
+                                  gpu::Stream& stream);
 
 int filterCellNeighboursHandler(gpuPair<int, int>*,
                                 int*,
                                 unsigned int,
+                                gpu::Stream&,
                                 o2::its::ExternalAllocator* = nullptr);
 
 template <int nLayers = 7>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -349,26 +349,20 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
 {
   GPUTimer timer(mGpuStreams[0], "creating cells neighbours");
   // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
-  GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
-  GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
-    if (iLayer < nLayers - 3) {
-      mNNeighbours[iLayer] = 0;
-    }
+    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
+    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
 {
-  GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT");
+  GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT");
   GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc
-  GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); // We need one element more to move exc -> inc
+  GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get()));
 }
 
 template <int nLayers>
@@ -382,8 +376,6 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
     GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
     GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
-  GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
 }
 
 template <int nLayers>
@@ -441,35 +433,15 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
+void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
 {
-  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
+  GPUTimer timer(mGpuStreams[layer], "reserving neighbours");
+  GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
   GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
-  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
+  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer], this->getExtAllocator());
+  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer].get()));
   GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
-{
-  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
-  this->mCellsNeighbours[layer].clear();
-  this->mCellsNeighbours[layer].resize(neighbours.size());
-  GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
-  GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
-  GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-}
-
-template <int nLayers>
-void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
-{
-  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
-  GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator());
 }
 
 template <int nLayers>
@@ -532,15 +504,6 @@ void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>&
   GPUChkErrS(cudaHostUnregister(seeds.data()));
 }
 
-template <int nLayers>
-void TimeFrameGPU<nLayers>::unregisterRest()
-{
-  GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory");
-  GPULog("unregistering rest of the host memory...");
-  GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
-  // GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
-}
-
 template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
 {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -180,15 +180,27 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
                         conf.nThreadsLayerCells[iteration],
                         mTimeFrameGPU->getStreams());
   }
-  mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
 }
 
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
 {
   mTimeFrameGPU->createNeighboursIndexTablesDevice();
   const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
+
+  std::vector<bool> isCellStreamSynched(this->mTrkParams[iteration].TrackletsPerRoad() - 1);
+  auto syncOnce = [&](const int iLayer) {
+    if (!isCellStreamSynched[iLayer]) {
+      mTimeFrameGPU->syncStream(iLayer);
+      isCellStreamSynched[iLayer] = true;
+    }
+  };
+
   for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
+    // ensure that celling is done for both iLayer and iLayer+1
+    syncOnce(iLayer);
+    syncOnce(iLayer + 1);
+
     const int currentLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer])};
     const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
     if (!nextLayerCellsNum || !currentLayerCellsNum) {
@@ -197,24 +209,23 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
     }
 
     mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum);
-    unsigned int nNeigh = countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
-                                                     mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
-                                                     mTimeFrameGPU->getDeviceArrayCellsLUT(),
-                                                     mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
-                                                     mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
-                                                     (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
-                                                     this->mTrkParams[0].DeltaROF,
-                                                     this->mTrkParams[0].MaxChi2ClusterAttachment,
-                                                     this->mBz,
-                                                     iLayer,
-                                                     currentLayerCellsNum,
-                                                     nextLayerCellsNum,
-                                                     1e2,
-                                                     conf.nBlocksFindNeighbours[iteration],
-                                                     conf.nThreadsFindNeighbours[iteration]);
-
-    mTimeFrameGPU->createNeighboursDevice(iLayer, nNeigh);
-
+    countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
+                               mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here.
+                               mTimeFrameGPU->getDeviceArrayCellsLUT(),
+                               mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
+                               mTimeFrameGPU->getDeviceNeighboursIndexTables(iLayer),
+                               (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(),
+                               this->mTrkParams[0].DeltaROF,
+                               this->mTrkParams[0].MaxChi2ClusterAttachment,
+                               this->mBz,
+                               iLayer,
+                               currentLayerCellsNum,
+                               nextLayerCellsNum,
+                               1e2,
+                               conf.nBlocksFindNeighbours[iteration],
+                               conf.nThreadsFindNeighbours[iteration],
+                               mTimeFrameGPU->getStream(iLayer));
+    mTimeFrameGPU->createNeighboursDevice(iLayer);
     computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(),
                                  mTimeFrameGPU->getDeviceNeighboursLUT(iLayer),
                                  mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -229,16 +240,15 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
                                  nextLayerCellsNum,
                                  1e2,
                                  conf.nBlocksFindNeighbours[iteration],
-                                 conf.nThreadsFindNeighbours[iteration]);
-
-    nNeigh = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
-                                         mTimeFrameGPU->getDeviceNeighbours(iLayer),
-                                         nNeigh,
-                                         mTimeFrameGPU->getExternalAllocator());
-    mTimeFrameGPU->getArrayNNeighbours()[iLayer] = nNeigh;
+                                 conf.nThreadsFindNeighbours[iteration],
+                                 mTimeFrameGPU->getStream(iLayer));
+    mTimeFrameGPU->getArrayNNeighbours()[iLayer] = filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
+                                                                               mTimeFrameGPU->getDeviceNeighbours(iLayer),
+                                                                               mTimeFrameGPU->getArrayNNeighbours()[iLayer],
+                                                                               mTimeFrameGPU->getStream(iLayer),
+                                                                               mTimeFrameGPU->getExternalAllocator());
   }
-  mTimeFrameGPU->createNeighboursDeviceArray();
-  mTimeFrameGPU->unregisterRest();
+  mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed
 };
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu