Skip to content

Commit 8eebfb5

Browse files
f3schdavidrohr
authored andcommitted
ITS: GPU: reduce TrackITS allocation
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 2d37a89 commit 8eebfb5

File tree

5 files changed

+234
-92
lines changed

5 files changed

+234
-92
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
8080
void createNeighboursIndexTablesDevice(const int);
8181
void createNeighboursDevice(const unsigned int layer);
8282
void createNeighboursLUTDevice(const int, const unsigned int);
83-
void createTrackITSExtDevice(bounded_vector<CellSeedN>&);
84-
void downloadTrackITSExtDevice(bounded_vector<CellSeedN>&);
83+
void createTrackITSExtDevice(const size_t);
84+
void downloadTrackITSExtDevice();
8585
void downloadCellsNeighboursDevice(std::vector<bounded_vector<std::pair<int, int>>>&, const int);
8686
void downloadNeighboursLUTDevice(bounded_vector<int>&, const int);
8787
void downloadCellsDevice();
@@ -140,6 +140,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
140140
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
141141
CellSeedN** getDeviceArrayCells() { return mCellsDeviceArray; }
142142
CellSeedN* getDeviceTrackSeeds() { return mTrackSeedsDevice; }
143+
int* getDeviceTrackSeedsLUT() { return mTrackSeedsLUTDevice; }
144+
auto getNTrackSeeds() const { return mNTracks; }
143145
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
144146
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
145147
int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
@@ -219,6 +221,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
219221
CellSeedN** mCellsDeviceArray;
220222
std::array<int*, nLayers - 3> mNeighboursIndexTablesDevice;
221223
CellSeedN* mTrackSeedsDevice{nullptr};
224+
int* mTrackSeedsLUTDevice{nullptr};
225+
unsigned int mNTracks{0};
222226
std::array<o2::track::TrackParCovF*, nLayers - 2> mCellSeedsDevice;
223227
o2::track::TrackParCovF** mCellSeedsDeviceArray;
224228
std::array<float*, nLayers - 2> mCellSeedsChi2Device;

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -207,23 +207,48 @@ void processNeighboursHandler(const int startLayer,
207207
const int nThreads);
208208

209209
template <int nLayers = 7>
210-
void trackSeedHandler(CellSeed<nLayers>* trackSeeds,
211-
const TrackingFrameInfo** foundTrackingFrameInfo,
212-
const Cluster** unsortedClusters,
213-
o2::its::TrackITSExt* tracks,
214-
const std::vector<float>& layerRadiiHost,
215-
const std::vector<float>& minPtsHost,
216-
const unsigned int nSeeds,
217-
const float Bz,
218-
const int startLevel,
219-
const float maxChi2ClusterAttachment,
220-
const float maxChi2NDF,
221-
const int reseedIfShorter,
222-
const bool repeatRefitOut,
223-
const bool shiftRefToCluster,
224-
const o2::base::Propagator* propagator,
225-
const o2::base::PropagatorF::MatCorrType matCorrType,
226-
const int nBlocks,
227-
const int nThreads);
210+
void countTrackSeedHandler(CellSeed<nLayers>* trackSeeds,
211+
const TrackingFrameInfo** foundTrackingFrameInfo,
212+
const Cluster** unsortedClusters,
213+
int* seedLUT,
214+
const std::vector<float>& layerRadiiHost,
215+
const std::vector<float>& minPtsHost,
216+
const unsigned int nSeeds,
217+
const float Bz,
218+
const int startLevel,
219+
const float maxChi2ClusterAttachment,
220+
const float maxChi2NDF,
221+
const int reseedIfShorter,
222+
const bool repeatRefitOut,
223+
const bool shiftRefToCluster,
224+
const o2::base::Propagator* propagator,
225+
const o2::base::PropagatorF::MatCorrType matCorrType,
226+
o2::its::ExternalAllocator* alloc,
227+
const int nBlocks,
228+
const int nThreads);
229+
230+
template <int nLayers = 7>
231+
void computeTrackSeedHandler(CellSeed<nLayers>* trackSeeds,
232+
const TrackingFrameInfo** foundTrackingFrameInfo,
233+
const Cluster** unsortedClusters,
234+
o2::its::TrackITSExt* tracks,
235+
const int* seedLUT,
236+
const std::vector<float>& layerRadiiHost,
237+
const std::vector<float>& minPtsHost,
238+
const unsigned int nSeeds,
239+
const unsigned int nTracks,
240+
const float Bz,
241+
const int startLevel,
242+
const float maxChi2ClusterAttachment,
243+
const float maxChi2NDF,
244+
const int reseedIfShorter,
245+
const bool repeatRefitOut,
246+
const bool shiftRefToCluster,
247+
const o2::base::Propagator* propagator,
248+
const o2::base::PropagatorF::MatCorrType matCorrType,
249+
o2::its::ExternalAllocator* alloc,
250+
const int nBlocks,
251+
const int nThreads);
252+
228253
} // namespace o2::its
229254
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -439,8 +439,10 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeedN>& seed
439439
GPUTimer timer("loading track seeds");
440440
GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeedN) / constants::MB);
441441
allocMem(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
442-
GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeedN), cudaHostRegisterPortable));
443442
GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeedN), cudaMemcpyHostToDevice));
443+
GPULog("gpu-transfer: creating {} track seeds LUT, for {:.2f} MB.", seeds.size() + 1, (seeds.size() + 1) * sizeof(int) / constants::MB);
444+
allocMem(reinterpret_cast<void**>(&mTrackSeedsLUTDevice), (seeds.size() + 1) * sizeof(int), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
445+
GPUChkErrS(cudaMemset(mTrackSeedsLUTDevice, 0, (seeds.size() + 1) * sizeof(int)));
444446
}
445447

446448
template <int nLayers>
@@ -458,14 +460,15 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
458460
}
459461

460462
template <int nLayers>
461-
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
463+
void TimeFrameGPU<nLayers>::createTrackITSExtDevice(const size_t nSeeds)
462464
{
463465
GPUTimer timer("reserving tracks");
464-
mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
465-
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
466-
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
467-
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt)));
468-
GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
466+
mNTracks = 0;
467+
GPUChkErrS(cudaMemcpy(&mNTracks, mTrackSeedsLUTDevice + nSeeds, sizeof(int), cudaMemcpyDeviceToHost));
468+
GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", mNTracks, mNTracks * sizeof(o2::its::TrackITSExt) / constants::MB);
469+
mTrackITSExt = bounded_vector<TrackITSExt>(mNTracks, {}, this->getMemoryPool().get());
470+
allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), mNTracks * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
471+
GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, mNTracks * sizeof(o2::its::TrackITSExt)));
469472
}
470473

471474
template <int nLayers>
@@ -588,13 +591,11 @@ void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut
588591
}
589592

590593
template <int nLayers>
591-
void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeedN>& seeds)
594+
void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice()
592595
{
593596
GPUTimer timer("downloading tracks");
594597
GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
595-
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
596-
GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
597-
GPUChkErrS(cudaHostUnregister(seeds.data()));
598+
GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, mTrackITSExt.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost));
598599
}
599600

600601
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -322,29 +322,52 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
322322
LOGP(debug, "No track seeds found, skipping track finding");
323323
continue;
324324
}
325-
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
326325
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);
327326

328-
trackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed*
329-
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo**
330-
mTimeFrameGPU->getDeviceArrayUnsortedClusters(), // Cluster**
331-
mTimeFrameGPU->getDeviceTrackITSExt(), // o2::its::TrackITSExt*
332-
this->mTrkParams[iteration].LayerRadii, // const std::vector<float>&
333-
this->mTrkParams[iteration].MinPt, // const std::vector<float>&
334-
trackSeeds.size(), // const size_t nSeeds
335-
this->mBz, // const float Bz
336-
startLevel, // const int startLevel,
337-
this->mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment
338-
this->mTrkParams[0].MaxChi2NDF, // float maxChi2NDF
339-
this->mTrkParams[0].RepeatRefitOut,
340-
this->mTrkParams[0].ReseedIfShorter,
341-
this->mTrkParams[0].ShiftRefToCluster,
342-
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
343-
this->mTrkParams[0].CorrType, // o2::base::PropagatorImpl<float>::MatCorrType
344-
conf.nBlocksTracksSeeds[iteration],
345-
conf.nThreadsTracksSeeds[iteration]);
346-
347-
mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds);
327+
// Since TrackITSExt is an enourmous class it is better to first count how many
328+
// successfull fits we do and only then allocate
329+
countTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
330+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
331+
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
332+
mTimeFrameGPU->getDeviceTrackSeedsLUT(),
333+
this->mTrkParams[iteration].LayerRadii,
334+
this->mTrkParams[iteration].MinPt,
335+
trackSeeds.size(),
336+
this->mBz,
337+
startLevel,
338+
this->mTrkParams[0].MaxChi2ClusterAttachment,
339+
this->mTrkParams[0].MaxChi2NDF,
340+
this->mTrkParams[0].RepeatRefitOut,
341+
this->mTrkParams[0].ReseedIfShorter,
342+
this->mTrkParams[0].ShiftRefToCluster,
343+
mTimeFrameGPU->getDevicePropagator(),
344+
this->mTrkParams[0].CorrType,
345+
mTimeFrameGPU->getFrameworkAllocator(),
346+
conf.nBlocksTracksSeeds[iteration],
347+
conf.nThreadsTracksSeeds[iteration]);
348+
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds.size());
349+
computeTrackSeedHandler(mTimeFrameGPU->getDeviceTrackSeeds(),
350+
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
351+
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
352+
mTimeFrameGPU->getDeviceTrackITSExt(),
353+
mTimeFrameGPU->getDeviceTrackSeedsLUT(),
354+
this->mTrkParams[iteration].LayerRadii,
355+
this->mTrkParams[iteration].MinPt,
356+
trackSeeds.size(),
357+
mTimeFrameGPU->getNTrackSeeds(),
358+
this->mBz,
359+
startLevel,
360+
this->mTrkParams[0].MaxChi2ClusterAttachment,
361+
this->mTrkParams[0].MaxChi2NDF,
362+
this->mTrkParams[0].RepeatRefitOut,
363+
this->mTrkParams[0].ReseedIfShorter,
364+
this->mTrkParams[0].ShiftRefToCluster,
365+
mTimeFrameGPU->getDevicePropagator(),
366+
this->mTrkParams[0].CorrType,
367+
mTimeFrameGPU->getFrameworkAllocator(),
368+
conf.nBlocksTracksSeeds[iteration],
369+
conf.nThreadsTracksSeeds[iteration]);
370+
mTimeFrameGPU->downloadTrackITSExtDevice();
348371

349372
auto& tracks = mTimeFrameGPU->getTrackITSExt();
350373

0 commit comments

Comments
 (0)