@@ -60,18 +60,23 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
6060 int startROF{this ->mTrkParams [iteration].nROFsPerIterations > 0 ? iROFslice * this ->mTrkParams [iteration].nROFsPerIterations : 0 };
6161 int endROF{o2::gpu::CAMath::Min (this ->mTrkParams [iteration].nROFsPerIterations > 0 ? (iROFslice + 1 ) * this ->mTrkParams [iteration].nROFsPerIterations + this ->mTrkParams [iteration].DeltaROF : mTimeFrameGPU ->getNrof (), mTimeFrameGPU ->getNrof ())};
6262
63- for ( int iLayer = 0 ; iLayer < nLayers; ++iLayer) {
64- // TODO lazy loading of essential data on separate streams
63+ // start by queuing the loading of the data needed for the last two layers
64+ for ( int iLayer{nLayers}; iLayer-- > nLayers - 2 ;) {
6565 mTimeFrameGPU ->createUsedClustersDevice (iteration, iLayer);
6666 mTimeFrameGPU ->loadClustersDevice (iteration, iLayer);
67- mTimeFrameGPU ->loadUnsortedClustersDevice (iteration, iLayer);
6867 mTimeFrameGPU ->loadClustersIndexTables (iteration, iLayer);
6968 mTimeFrameGPU ->loadROFrameClustersDevice (iteration, iLayer);
7069 mTimeFrameGPU ->recordEvent (iLayer);
7170 }
7271
73- // processing starts here
74- for (int iLayer{0 }; iLayer < this ->mTrkParams [iteration].TrackletsPerRoad (); ++iLayer) {
72+ for (int iLayer{this ->mTrkParams [iteration].TrackletsPerRoad ()}; iLayer--;) {
73+ if (iLayer) { // queue loading of the next layer's data in parallel, so that the copies overlap with the computation kernels
74+ mTimeFrameGPU ->createUsedClustersDevice (iteration, iLayer - 1 );
75+ mTimeFrameGPU ->loadClustersDevice (iteration, iLayer - 1 );
76+ mTimeFrameGPU ->loadClustersIndexTables (iteration, iLayer - 1 );
77+ mTimeFrameGPU ->loadROFrameClustersDevice (iteration, iLayer - 1 );
78+ mTimeFrameGPU ->recordEvent (iLayer - 1 );
79+ }
7580 mTimeFrameGPU ->createTrackletsLUTDevice (iteration, iLayer);
7681 mTimeFrameGPU ->waitEvent (iLayer, iLayer + 1 ); // wait stream until all data is available
7782 countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU ->getDeviceIndexTableUtils (),
@@ -91,7 +96,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
9196 mTimeFrameGPU ->getDeviceArrayUsedClusters (),
9297 mTimeFrameGPU ->getDeviceArrayClustersIndexTables (),
9398 mTimeFrameGPU ->getDeviceArrayTrackletsLUT (),
94- mTimeFrameGPU ->getDeviceTrackletsLUTs (), // Required for the exclusive sums
99+ mTimeFrameGPU ->getDeviceTrackletsLUTs (),
95100 iteration,
96101 this ->mTrkParams [iteration].NSigmaCut ,
97102 mTimeFrameGPU ->getPhiCuts (),
@@ -107,7 +112,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
107112 mTimeFrameGPU ->getStreams ());
108113 mTimeFrameGPU ->createTrackletsBuffers (iLayer);
109114 if (mTimeFrameGPU ->getNTracklets ()[iLayer] == 0 ) {
110- return ;
115+ continue ;
111116 }
112117 computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU ->getDeviceIndexTableUtils (),
113118 mTimeFrameGPU ->getDeviceMultCutMask (),
@@ -151,18 +156,25 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
151156{
152157 auto & conf = o2::its::ITSGpuTrackingParamConfig::Instance ();
153158
154- for (int iLayer = 0 ; iLayer < nLayers; ++iLayer) {
155- // TODO lazy loading of essential data on separate streams
159+ // start by queuing the loading of the data needed for the last three layers
160+ for (int iLayer{nLayers}; iLayer-- > nLayers - 3 ;) {
161+ mTimeFrameGPU ->loadUnsortedClustersDevice (iteration, iLayer);
156162 mTimeFrameGPU ->loadTrackingFrameInfoDevice (iteration, iLayer);
157163 mTimeFrameGPU ->recordEvent (iLayer);
158164 }
159165
160- for (int iLayer{0 }; iLayer < this ->mTrkParams [iteration].CellsPerRoad (); ++iLayer) {
166+ for (int iLayer{this ->mTrkParams [iteration].CellsPerRoad ()}; iLayer--;) {
167+ if (iLayer) {
168+ mTimeFrameGPU ->loadUnsortedClustersDevice (iteration, iLayer - 1 );
169+ mTimeFrameGPU ->loadTrackingFrameInfoDevice (iteration, iLayer - 1 );
170+ mTimeFrameGPU ->recordEvent (iLayer - 1 );
171+ }
172+
161173 // if there are no tracklets skip entirely
162174 const int currentLayerTrackletsNum{static_cast <int >(mTimeFrameGPU ->getNTracklets ()[iLayer])};
163175 if (!currentLayerTrackletsNum || !mTimeFrameGPU ->getNTracklets ()[iLayer + 1 ]) {
164176 mTimeFrameGPU ->getNCells ()[iLayer] = 0 ;
165- return ;
177+ continue ;
166178 }
167179
168180 mTimeFrameGPU ->createCellsLUTDevice (iLayer);
@@ -189,7 +201,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
189201 mTimeFrameGPU ->getStreams ());
190202 mTimeFrameGPU ->createCellsBuffers (iLayer);
191203 if (mTimeFrameGPU ->getNCells ()[iLayer] == 0 ) {
192- return ;
204+ continue ;
193205 }
194206 computeCellsHandler (mTimeFrameGPU ->getDeviceArrayClusters (),
195207 mTimeFrameGPU ->getDeviceArrayUnsortedClusters (),
0 commit comments