AliceO2Group
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Lines changed: 9 additions & 10 deletions
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h
Lines changed: 5 additions & 5 deletions
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt
Lines changed: 7 additions & 4 deletions
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Lines changed: 44 additions & 42 deletions
@@ -41,8 +41,7 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
   void popMemoryStack(const int);
   void registerHostMemory(const int);
   void unregisterHostMemory(const int);
-  void initialise(const int, const TrackingParameters&, const int, IndexTableUtilsN* utils = nullptr);
-  void initDeviceSAFitting();
+  void initialise(const int, const TrackingParameters&, const int);
   void loadIndexTableUtils(const int);
   void loadTrackingFrameInfoDevice(const int, const int);
   void createTrackingFrameInfoDeviceArray(const int);
@@ -59,8 +58,8 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
   void createROFrameClustersDeviceArray(const int);
   void loadMultiplicityCutMask(const int);
   void loadVertices(const int);
-  void loadROFOverlapTable();
-  void loadROFVertexLookupTable();
+  void loadROFOverlapTable(const int);
+  void loadROFVertexLookupTable(const int);
   void updateROFVertexLookupTable(const int);
 
   ///
@@ -174,9 +173,9 @@ class TimeFrameGPU final : public TimeFrame<NLayers>
   gsl::span<CellSeedN*> getDeviceCells() { return mCellsDevice; }
 
   // Overridden getters
-  int getNumberOfTracklets() const final;
-  int getNumberOfCells() const final;
-  int getNumberOfNeighbours() const final;
+  size_t getNumberOfTracklets() const final;
+  size_t getNumberOfCells() const final;
+  size_t getNumberOfNeighbours() const final;
 
  private:
   void allocMemAsync(void**, size_t, Stream&, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on specific stream
@@ -275,19 +274,19 @@ inline std::vector<unsigned int> TimeFrameGPU<NLayers>::getClusterSizes()
 }
 
 template <int NLayers>
-inline int TimeFrameGPU<NLayers>::getNumberOfTracklets() const
+inline size_t TimeFrameGPU<NLayers>::getNumberOfTracklets() const
 {
-  return std::accumulate(mNTracklets.begin(), mNTracklets.end(), 0);
+  return std::accumulate(mNTracklets.begin(), mNTracklets.end(), size_t{0}); // total over all mNTracklets entries; size_t seed so the sum is carried in the return type, not in int
 }
 
 template <int NLayers>
-inline int TimeFrameGPU<NLayers>::getNumberOfCells() const
+inline size_t TimeFrameGPU<NLayers>::getNumberOfCells() const
 {
-  return std::accumulate(mNCells.begin(), mNCells.end(), 0);
+  return std::accumulate(mNCells.begin(), mNCells.end(), size_t{0}); // total over all mNCells entries; size_t seed so the sum is carried in the return type, not in int
 }
 
 template <int NLayers>
-inline int TimeFrameGPU<NLayers>::getNumberOfNeighbours() const
+inline size_t TimeFrameGPU<NLayers>::getNumberOfNeighbours() const
 {
-  return std::accumulate(mNNeighbours.begin(), mNNeighbours.end(), 0);
+  return std::accumulate(mNNeighbours.begin(), mNNeighbours.end(), size_t{0}); // total over all mNNeighbours entries; size_t seed so the sum is carried in the return type, not in int
 }
 
@@ -19,16 +19,16 @@
 namespace o2::its
 {
 
-template <int nLayers = 7>
-class TrackerTraitsGPU final : public TrackerTraits<nLayers>
+template <int NLayers = 7>
+class TrackerTraitsGPU final : public TrackerTraits<NLayers>
 {
-  using typename TrackerTraits<nLayers>::IndexTableUtilsN;
+  using typename TrackerTraits<NLayers>::IndexTableUtilsN;
 
  public:
   TrackerTraitsGPU() = default;
   ~TrackerTraitsGPU() final = default;
 
-  void adoptTimeFrame(TimeFrame<nLayers>* tf) final;
+  void adoptTimeFrame(TimeFrame<NLayers>* tf) final;
   void initialiseTimeFrame(const int iteration) final;
 
   void computeLayerTracklets(const int iteration, int) final;
@@ -48,7 +48,7 @@ class TrackerTraitsGPU final : public TrackerTraits<nLayers>
 
  private:
   IndexTableUtilsN* mDeviceIndexTableUtils;
-  gpu::TimeFrameGPU<nLayers>* mTimeFrameGPU;
+  gpu::TimeFrameGPU<NLayers>* mTimeFrameGPU;
 };
 
 } // namespace o2::its
 
@@ -13,9 +13,6 @@
 if(CUDA_ENABLED)
     find_package(CUDAToolkit)
     message(STATUS "Building ITS CUDA tracker")
-    # add_compile_options(-O0 -g -lineinfo -fPIC -DGPU_FORCE_DEVICE_ASSERTS=ON)
-    # add_compile_definitions(ITS_MEASURE_GPU_TIME)
-    # add_compile_definitions(ITS_GPU_LOG)
     o2_add_library(ITStrackingCUDA
                    SOURCES TrackerTraitsGPU.cxx
                            TimeFrameGPU.cu
@@ -29,7 +26,13 @@ if(CUDA_ENABLED)
                    PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider
                    TARGETVARNAME targetName)
 
+    set_target_gpu_arch("CUDA" ${targetName})
+    # Enable relocatable device code (needed for separable compilation + debugging)
     set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+    # target_compile_options(${targetName} PRIVATE
+    #     $<$<COMPILE_LANGUAGE:CUDA>:-G;-O0;-Xptxas=-O0>
+    #     $<$<COMPILE_LANGUAGE:CXX>:-O0;-g>
+    # )
+    # target_compile_definitions(${targetName} PRIVATE ITS_MEASURE_GPU_TIME ITS_GPU_LOG)
     target_compile_definitions(${targetName} PRIVATE $<TARGET_PROPERTY:O2::ITStracking,COMPILE_DEFINITIONS>)
-    set_target_gpu_arch("CUDA" ${targetName})
 endif()
@@ -265,57 +265,58 @@ void TimeFrameGPU<NLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer("loading seeding vertices");
-    // GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB);
-    // allocMem(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->hasFrameworkAllocator());
-    // GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice));
     GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB);
     allocMem(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice));
   }
 }
 
 template <int NLayers>
-void TimeFrameGPU<NLayers>::loadROFOverlapTable()
+void TimeFrameGPU<NLayers>::loadROFOverlapTable(const int iteration) // copies the host ROFOverlapTable arrays to the device and stores the resulting view
 {
-  GPUTimer timer("initialising device view of ROFOverlapTable");
-  const auto& hostTable = this->getROFOverlapTable();
-  const auto& hostView = this->getROFOverlapTableView();
-  using TableEntry = ROFOverlapTable<NLayers>::TableEntry;
-  using TableIndex = ROFOverlapTable<NLayers>::TableIndex;
-  using LayerTiming = o2::its::LayerTiming;
-  TableEntry* d_flatTable{nullptr};
-  TableIndex* d_indices{nullptr};
-  LayerTiming* d_layers{nullptr};
-  size_t flatTableSize = hostTable.getFlatTableSize();
-  allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
-  allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
-  allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
-  mDeviceROFOverlapTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
+  if (!iteration) { // only iteration 0 allocates and copies; any other iteration is a no-op
+    GPUTimer timer("initialising device view of ROFOverlapTable");
+    const auto& hostTable = this->getROFOverlapTable();
+    const auto& hostView = this->getROFOverlapTableView();
+    using TableEntry = ROFOverlapTable<NLayers>::TableEntry;
+    using TableIndex = ROFOverlapTable<NLayers>::TableIndex;
+    using LayerTiming = o2::its::LayerTiming;
+    TableEntry* d_flatTable{nullptr}; // device pointers, filled in by the allocMem calls below
+    TableIndex* d_indices{nullptr};
+    LayerTiming* d_layers{nullptr};
+    size_t flatTableSize = hostTable.getFlatTableSize();
+    allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator()); // flat table: allocate ...
+    GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator()); // index array: allocate ...
+    GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator()); // one LayerTiming per layer: allocate ...
+    GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    mDeviceROFOverlapTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers); // view built from the three device pointers
+  }
 }
 
 template <int NLayers>
-void TimeFrameGPU<NLayers>::loadROFVertexLookupTable()
+void TimeFrameGPU<NLayers>::loadROFVertexLookupTable(const int iteration) // copies the host ROFVertexLookupTable arrays to the device and stores the resulting view
 {
-  GPUTimer timer("initialising device view of ROFVertexLookupTable");
-  const auto& hostTable = this->getROFVertexLookupTable();
-  const auto& hostView = this->getROFVertexLookupTableView();
-  using TableEntry = ROFVertexLookupTable<NLayers>::TableEntry;
-  using TableIndex = ROFVertexLookupTable<NLayers>::TableIndex;
-  using LayerTiming = o2::its::LayerTiming;
-  TableEntry* d_flatTable{nullptr};
-  TableIndex* d_indices{nullptr};
-  LayerTiming* d_layers{nullptr};
-  size_t flatTableSize = hostTable.getFlatTableSize();
-  allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice));
-  allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice));
-  allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator());
-  GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice));
-  mDeviceROFVertexLookupTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers);
+  if (!iteration) { // only iteration 0 allocates and copies; any other iteration is a no-op
+    GPUTimer timer("initialising device view of ROFVertexLookupTable");
+    const auto& hostTable = this->getROFVertexLookupTable();
+    const auto& hostView = this->getROFVertexLookupTableView();
+    using TableEntry = ROFVertexLookupTable<NLayers>::TableEntry;
+    using TableIndex = ROFVertexLookupTable<NLayers>::TableIndex;
+    using LayerTiming = o2::its::LayerTiming;
+    TableEntry* d_flatTable{nullptr}; // device pointers, filled in by the allocMem calls below
+    TableIndex* d_indices{nullptr};
+    LayerTiming* d_layers{nullptr};
+    size_t flatTableSize = hostTable.getFlatTableSize();
+    allocMem(reinterpret_cast<void**>(&d_flatTable), flatTableSize * sizeof(TableEntry), this->hasFrameworkAllocator()); // flat table: allocate ...
+    GPUChkErrS(cudaMemcpy(d_flatTable, hostView.mFlatTable, flatTableSize * sizeof(TableEntry), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    allocMem(reinterpret_cast<void**>(&d_indices), hostTable.getIndicesSize() * sizeof(TableIndex), this->hasFrameworkAllocator()); // index array: allocate ...
+    GPUChkErrS(cudaMemcpy(d_indices, hostView.mIndices, hostTable.getIndicesSize() * sizeof(TableIndex), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    allocMem(reinterpret_cast<void**>(&d_layers), NLayers * sizeof(LayerTiming), this->hasFrameworkAllocator()); // one LayerTiming per layer: allocate ...
+    GPUChkErrS(cudaMemcpy(d_layers, hostView.mLayers, NLayers * sizeof(LayerTiming), cudaMemcpyHostToDevice)); // ... then copy host -> device
+    mDeviceROFVertexLookupTableView = hostTable.getDeviceView(d_flatTable, d_indices, d_layers); // view built from the three device pointers
+  }
 }
 
 template <int NLayers>
@@ -373,6 +374,7 @@ void TimeFrameGPU<NLayers>::createTrackletsBuffers(const int layer)
   mGpuStreams[layer].sync(); // ensure number of tracklets is correct
   GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
+  GPUChkErrS(cudaMemsetAsync(mTrackletsDevice[layer], 0, mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer].get()));
   GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
 }
 
@@ -468,6 +470,7 @@ void TimeFrameGPU<NLayers>::createCellsBuffers(const int layer)
   mGpuStreams[layer].sync(); // ensure number of cells is correct
   GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeedN) / constants::MB);
   allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
+  GPUChkErrS(cudaMemsetAsync(mCellsDevice[layer], 0, mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer].get()));
   GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeedN*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
 }
 
@@ -637,11 +640,10 @@ void TimeFrameGPU<NLayers>::popMemoryStack(const int iteration)
 template <int NLayers>
 void TimeFrameGPU<NLayers>::initialise(const int iteration,
                                        const TrackingParameters& trkParam,
-                                       const int maxLayers,
-                                       IndexTableUtilsN* utils)
+                                       const int maxLayers) // sizes mGpuStreams to NLayers, then delegates to the base TimeFrame::initialise
 {
   mGpuStreams.resize(NLayers);
-  o2::its::TimeFrame<NLayers>::initialise(iteration, trkParam, maxLayers);
+  o2::its::TimeFrame<NLayers>::initialise(iteration, trkParam, maxLayers, false); // NOTE(review): the trailing 'false' is new in this hunk; what it switches off is defined by TimeFrame::initialise, not visible here - confirm
 }
 
 template <int NLayers>