Checkpointing

mconcas · mconcas · commit 81e1e5efb057 · 2024-11-20T18:21:50.000+01:00
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame
   void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
   void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
   void initDeviceSAFitting();
+  void loadIndexTableUtils(const int);
   void loadTrackingFrameInfoDevice(const int);
   void loadUnsortedClustersDevice(const int);
   void loadClustersDevice(const int);
-  void loadROframeClustersDevice(const int iteration);
+  void loadClustersIndexTables(const int iteration);
+  void createUsedClustersDevice(const int);
+  void loadUsedClustersDevice();
+  void loadROframeClustersDevice(const int);
   void loadMultiplicityCutMask(const int);
   void loadVertices(const int);
 
@@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
   const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
+  const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
+  const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
   const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
   const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
   const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
@@ -148,17 +154,20 @@ class TimeFrameGPU : public TimeFrame
   // Device pointers
   StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
   IndexTableUtils* mIndexTableUtilsDevice;
-  std::array<unsigned char*, nLayers> mUsedClustersDevice;
 
   // Hybrid pref
   uint8_t* mMultMaskDevice;
   Vertex* mPrimaryVerticesDevice;
   int* mROFramesPVDevice;
   std::array<Cluster*, nLayers> mClustersDevice;
   std::array<Cluster*, nLayers> mUnsortedClustersDevice;
+  std::array<int*, nLayers> mClustersIndexTablesDevice;
+  std::array<unsigned char*, nLayers> mUsedClustersDevice;
   std::array<int*, nLayers> mROFramesClustersDevice;
   const Cluster** mClustersDeviceArray;
   const Cluster** mUnsortedClustersDeviceArray;
+  const int** mClustersIndexTablesDeviceArray;
+  const unsigned char** mUsedClustersDeviceArray;
   const int** mROFrameClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
   const Tracklet** mTrackletsDeviceArray;
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel(
 } // namespace gpu
 
 template <int nLayers = 7>
-void computeTrackletsInRofsHandler(const uint8_t* multMask,
+void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                   const uint8_t* multMask,
                                    const int startROF,
                                    const int endROF,
                                    const int maxROF,
@@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
                                    const int nVertices,
                                    const Cluster** clusters,
                                    const int** ROFClusters,
+                                   const unsigned char** usedClusters,
+                                   const int** clustersIndexTables,
+                                   const int iteration,
+                                   const float NSigmaCut,
+                                   std::vector<float>& phiCuts,
+                                   const float resolutionPV,
+                                   std::vector<float>& minR,
+                                   std::vector<float>& maxR,
+                                   std::vector<float>& resolutions,
                                    std::vector<float>& radii,
                                    std::vector<float>& mulScatAng,
                                    const int nBlocks,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -39,9 +39,9 @@ struct gpuSpan {
   using ref = T&;
 
   GPUd() gpuSpan() : _data(nullptr), _size(0) {}
-  GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
-  GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
-  GPUd() std::size_t size() const { return _size; }
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
   GPUd() bool empty() const { return _size == 0; }
   GPUd() ref front() const { return _data[0]; }
   GPUd() ref back() const { return _data[_size - 1]; }
@@ -50,7 +50,7 @@ struct gpuSpan {
 
  protected:
   ptr _data;
-  std::size_t _size;
+  unsigned int _size;
 };
 
 template <typename T>
@@ -60,10 +60,10 @@ struct gpuSpan<const T> {
   using ref = const T&;
 
   GPUd() gpuSpan() : _data(nullptr), _size(0) {}
-  GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
   GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
-  GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
-  GPUd() std::size_t size() const { return _size; }
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
   GPUd() bool empty() const { return _size == 0; }
   GPUd() ref front() const { return _data[0]; }
   GPUd() ref back() const { return _data[_size - 1]; }
@@ -72,7 +72,7 @@ struct gpuSpan<const T> {
 
  protected:
   ptr _data;
-  std::size_t _size;
+  unsigned int _size;
 };
 
 enum class Task {
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -92,6 +92,19 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
   mPropagatorDevice = propagator;
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
+  if (!iteration) {
+    LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
+  }
+  LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
+  checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
@@ -128,13 +141,56 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
   }
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
+      checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator());
+      checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB);
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  }
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
-      LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
+      LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
       checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     }
@@ -167,7 +223,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
-    LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
+    LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
@@ -179,10 +235,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
     START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
-    LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
+    LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
+    LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
     checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -31,10 +31,13 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
   mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
   mTimeFrameGPU->loadClustersDevice(iteration);
   mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
+  mTimeFrameGPU->loadClustersIndexTables(iteration);
   mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
   mTimeFrameGPU->loadMultiplicityCutMask(iteration);
   mTimeFrameGPU->loadVertices(iteration);
   mTimeFrameGPU->loadROframeClustersDevice(iteration);
+  mTimeFrameGPU->createUsedClustersDevice(iteration);
+  mTimeFrameGPU->loadIndexTableUtils(iteration);
 }
 
 template <int nLayers>
@@ -95,7 +98,8 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
   int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
   int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
 
-  computeTrackletsInRofsHandler<nLayers>(mTimeFrameGPU->getDeviceMultCutMask(),
+  computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                         mTimeFrameGPU->getDeviceMultCutMask(),
                                          startROF,
                                          endROF,
                                          mTimeFrameGPU->getNrof(),
@@ -106,6 +110,15 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
                                          mTimeFrameGPU->getPrimaryVerticesNum(),
                                          mTimeFrameGPU->getDeviceArrayClusters(),
                                          mTimeFrameGPU->getDeviceROframeClusters(),
+                                         mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                         mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                         iteration,
+                                         mTrkParams[iteration].NSigmaCut,
+                                         mTimeFrameGPU->getPhiCuts(),
+                                         mTrkParams[iteration].PVres,
+                                         mTimeFrameGPU->getMinRs(),
+                                         mTimeFrameGPU->getMaxRs(),
+                                         mTimeFrameGPU->getPositionResolutions(),
                                          mTrkParams[iteration].LayerRadii,
                                          mTimeFrameGPU->getMSangles(),
                                          conf.nBlocks,
@@ -324,6 +337,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
     }
   }
+  mTimeFrameGPU->loadUsedClustersDevice();
   if (iteration == mTrkParams.size() - 1) {
     mTimeFrameGPU->unregisterHostMemory(0);
   }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h