Skip to content

Commit ddcccec

Browse files
f3schdavidrohr
authored andcommitted
ITS: GPU: free artefacts from memory after iteration
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent 2439bbe commit ddcccec

File tree

10 files changed

+158
-128
lines changed

10 files changed

+158
-128
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
3535
~TimeFrameGPU() = default;
3636

3737
/// Most relevant operations
38+
void pushMemoryStack(const int);
39+
void popMemoryStack(const int);
3840
void registerHostMemory(const int);
3941
void unregisterHostMemory(const int);
4042
void initialise(const int, const TrackingParameters&, const int, IndexTableUtilsN* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
@@ -177,8 +179,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
177179
int getNumberOfNeighbours() const final;
178180

179181
private:
180-
void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream
181-
void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream
182+
void allocMemAsync(void**, size_t, Stream&, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on specific stream
183+
void allocMem(void**, size_t, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on default stream
182184
TimeFrameGPUParameters mGpuParams;
183185

184186
// Host-available device buffer sizes

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ struct TypedAllocator {
313313

314314
pointer allocate(size_type n)
315315
{
316-
void* raw = mInternalAllocator->allocate(n * sizeof(T));
316+
void* raw = mInternalAllocator->allocateStack(n * sizeof(T));
317317
return thrust::device_pointer_cast(static_cast<T*>(raw));
318318
}
319319

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 74 additions & 55 deletions
Large diffs are not rendered by default.

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@ template <int nLayers>
2828
void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
2929
{
3030
mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers);
31-
3231
// on default stream
3332
mTimeFrameGPU->loadVertices(iteration);
3433
mTimeFrameGPU->loadIndexTableUtils(iteration);
@@ -45,6 +44,8 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
4544
mTimeFrameGPU->createTrackletsBuffersArray(iteration);
4645
mTimeFrameGPU->createCellsBuffersArray(iteration);
4746
mTimeFrameGPU->createCellsLUTDeviceArray(iteration);
47+
// push every create artefact on the stack
48+
mTimeFrameGPU->pushMemoryStack(iteration);
4849
}
4950

5051
template <int nLayers>
@@ -108,7 +109,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
108109
mTimeFrameGPU->getPositionResolutions(),
109110
this->mTrkParams[iteration].LayerRadii,
110111
mTimeFrameGPU->getMSangles(),
111-
mTimeFrameGPU->getExternalDeviceAllocator(),
112+
mTimeFrameGPU->getFrameworkAllocator(),
112113
conf.nBlocksLayerTracklets[iteration],
113114
conf.nThreadsLayerTracklets[iteration],
114115
mTimeFrameGPU->getStreams());
@@ -146,7 +147,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
146147
mTimeFrameGPU->getPositionResolutions(),
147148
this->mTrkParams[iteration].LayerRadii,
148149
mTimeFrameGPU->getMSangles(),
149-
mTimeFrameGPU->getExternalDeviceAllocator(),
150+
mTimeFrameGPU->getFrameworkAllocator(),
150151
conf.nBlocksLayerTracklets[iteration],
151152
conf.nThreadsLayerTracklets[iteration],
152153
mTimeFrameGPU->getStreams());
@@ -197,7 +198,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
197198
this->mTrkParams[iteration].MaxChi2ClusterAttachment,
198199
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
199200
this->mTrkParams[iteration].NSigmaCut,
200-
mTimeFrameGPU->getExternalDeviceAllocator(),
201+
mTimeFrameGPU->getFrameworkAllocator(),
201202
conf.nBlocksLayerCells[iteration],
202203
conf.nThreadsLayerCells[iteration],
203204
mTimeFrameGPU->getStreams());
@@ -253,7 +254,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
253254
currentLayerCellsNum,
254255
nextLayerCellsNum,
255256
1e2,
256-
mTimeFrameGPU->getExternalDeviceAllocator(),
257+
mTimeFrameGPU->getFrameworkAllocator(),
257258
conf.nBlocksFindNeighbours[iteration],
258259
conf.nThreadsFindNeighbours[iteration],
259260
mTimeFrameGPU->getStream(iLayer));
@@ -281,7 +282,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
281282
mTimeFrameGPU->getDeviceNeighbours(iLayer),
282283
mTimeFrameGPU->getArrayNNeighbours()[iLayer],
283284
mTimeFrameGPU->getStream(iLayer),
284-
mTimeFrameGPU->getExternalDeviceAllocator());
285+
mTimeFrameGPU->getFrameworkAllocator());
285286
}
286287
mTimeFrameGPU->syncStreams(false);
287288
}
@@ -312,7 +313,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
312313
this->mTrkParams[0].MaxChi2NDF,
313314
mTimeFrameGPU->getDevicePropagator(),
314315
this->mTrkParams[0].CorrType,
315-
mTimeFrameGPU->getExternalDeviceAllocator(),
316+
mTimeFrameGPU->getFrameworkAllocator(),
316317
conf.nBlocksProcessNeighbours[iteration],
317318
conf.nThreadsProcessNeighbours[iteration]);
318319
}
@@ -386,6 +387,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
386387
}
387388
mTimeFrameGPU->loadUsedClustersDevice();
388389
}
390+
// wipe the artefact memory
391+
mTimeFrameGPU->popMemoryStack(iteration);
389392
};
390393

391394
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -644,23 +644,6 @@ GPUg() void __launch_bounds__(256, 1) processNeighboursKernel(
644644
}
645645
}
646646

647-
GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
648-
{
649-
if (alloc) {
650-
*p = alloc->allocate(bytes);
651-
} else {
652-
GPUChkErrS(cudaMallocAsync(p, bytes, stream));
653-
}
654-
}
655-
656-
GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
657-
{
658-
if (alloc) {
659-
alloc->deallocate(reinterpret_cast<char*>(p), bytes);
660-
} else {
661-
GPUChkErrS(cudaFreeAsync(p, stream));
662-
}
663-
}
664647
} // namespace gpu
665648

666649
template <int nLayers>

Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,39 @@
1717
#define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
1818

1919
#include <memory_resource>
20+
#include "GPUO2ExternalUser.h"
21+
#include "Base/GPUMemoryResource.h"
2022

2123
namespace o2::its
2224
{
2325

2426
class ExternalAllocator
2527
{
28+
using Type = std::underlying_type_t<o2::gpu::GPUMemoryResource::MemoryType>;
29+
2630
public:
27-
virtual void* allocate(size_t) = 0;
2831
virtual void deallocate(char*, size_t) = 0;
32+
virtual void* allocate(size_t) = 0;
33+
void* allocate(size_t s, Type type)
34+
{
35+
auto old = mType;
36+
mType = type;
37+
void* p = allocate(s);
38+
mType = old;
39+
return p;
40+
}
41+
void* allocateStack(size_t s)
42+
{
43+
return allocate(s, (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
44+
}
45+
virtual void pushTagOnStack(uint64_t) = 0;
46+
virtual void popTagOffStack(uint64_t) = 0;
47+
48+
void setType(Type t) noexcept { mType = t; }
49+
Type getType() const noexcept { return mType; }
50+
51+
protected:
52+
Type mType;
2953
};
3054

3155
class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
@@ -36,7 +60,7 @@ class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
3660
protected:
3761
void* do_allocate(size_t bytes, size_t alignment) override
3862
{
39-
void* p = mAlloc->allocate(bytes);
63+
void* p = mAlloc->allocate(bytes, o2::gpu::GPUMemoryResource::MemoryType::MEMORY_HOST);
4064
if (!p) {
4165
throw std::bad_alloc();
4266
}

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "ITStracking/IndexTableUtils.h"
3434
#include "ITStracking/ExternalAllocator.h"
3535
#include "ITStracking/BoundedAllocator.h"
36-
3736
#include "SimulationDataFormat/MCCompLabel.h"
3837
#include "SimulationDataFormat/MCTruthContainer.h"
3938

@@ -235,23 +234,14 @@ struct TimeFrame {
235234
void setBz(float bz) { mBz = bz; }
236235
float getBz() const { return mBz; }
237236

238-
/// State if memory will be externally managed.
239-
// device
240-
ExternalAllocator* mExtDeviceAllocator{nullptr};
241-
void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; }
242-
ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; }
243-
bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; }
244-
// host
245-
ExternalAllocator* mExtHostAllocator{nullptr};
246-
void setExternalHostAllocator(ExternalAllocator* allocator)
247-
{
248-
mExtHostAllocator = allocator;
249-
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExtHostAllocator);
250-
}
251-
ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; }
252-
bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; }
253-
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool;
254-
std::pmr::memory_resource* getMaybeExternalHostResource(bool forceHost = false) { return (hasExternalHostAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); }
237+
/// State if memory will be externally managed by the GPU framework
238+
ExternalAllocator* mExternalAllocator{nullptr};
239+
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool; // host memory pool managed by the framework
240+
auto getFrameworkAllocator() { return mExternalAllocator; };
241+
void setFrameworkAllocator(ExternalAllocator* ext);
242+
bool hasFrameworkAllocator() const noexcept { return mExternalAllocator != nullptr; }
243+
std::pmr::memory_resource* getMaybeFrameworkHostResource(bool forceHost = false) { return (hasFrameworkAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); }
244+
255245
// Propagator
256246
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
257247
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) {};

Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -177,9 +177,9 @@ template <int nLayers>
177177
void TimeFrame<nLayers>::resetROFrameData(size_t nRofs)
178178
{
179179
for (int iLayer{0}; iLayer < nLayers; ++iLayer) {
180-
deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource());
181-
deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource());
182-
clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource());
180+
deepVectorClear(mUnsortedClusters[iLayer], getMaybeFrameworkHostResource());
181+
deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeFrameworkHostResource());
182+
clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeFrameworkHostResource());
183183
deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get());
184184

185185
if (iLayer < 2) {
@@ -302,11 +302,11 @@ void TimeFrame<nLayers>::initialise(const int iteration, const TrackingParameter
302302
clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get());
303303
deepVectorClear(mTrackletClusters);
304304
for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) {
305-
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
306-
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
305+
clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
306+
clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
307307
mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]);
308308
}
309-
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeExternalHostResource(maxLayers != nLayers));
309+
clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeFrameworkHostResource(maxLayers != nLayers));
310310
clearResizeBoundedVector(mLines, mNrof, mMemoryPool.get());
311311
clearResizeBoundedVector(mTrackletClusters, mNrof, mMemoryPool.get());
312312

@@ -574,6 +574,7 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
574574
initVector(v, useExternal);
575575
}
576576
};
577+
577578
// these will only reside on the host for the cpu part
578579
initVector(mTotVertPerIteration);
579580
initContainers(mClusterExternalIndices);
@@ -603,12 +604,19 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
603604
initVector(mRoadLabels);
604605
initContainers(mTracksLabel);
605606
// these will use possibly an externally provided allocator
606-
initContainers(mClusters, hasExternalHostAllocator());
607-
initContainers(mUsedClusters, hasExternalHostAllocator());
608-
initContainers(mUnsortedClusters, hasExternalHostAllocator());
609-
initContainers(mIndexTables, hasExternalHostAllocator());
610-
initContainers(mTrackingFrameInfo, hasExternalHostAllocator());
611-
initContainers(mROFramesClusters, hasExternalHostAllocator());
607+
initContainers(mClusters, hasFrameworkAllocator());
608+
initContainers(mUsedClusters, hasFrameworkAllocator());
609+
initContainers(mUnsortedClusters, hasFrameworkAllocator());
610+
initContainers(mIndexTables, hasFrameworkAllocator());
611+
initContainers(mTrackingFrameInfo, hasFrameworkAllocator());
612+
initContainers(mROFramesClusters, hasFrameworkAllocator());
613+
}
614+
615+
template <int nLayers>
616+
void TimeFrame<nLayers>::setFrameworkAllocator(ExternalAllocator* ext)
617+
{
618+
mExternalAllocator = ext;
619+
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExternalAllocator);
612620
}
613621

614622
template <int nLayers>
@@ -639,7 +647,7 @@ void TimeFrame<nLayers>::wipe()
639647
deepVectorClear(mLines);
640648
// if we use the external host allocator then the assumption is that we
641649
// don't clear the memory ourself
642-
if (!hasExternalHostAllocator()) {
650+
if (!hasFrameworkAllocator()) {
643651
deepVectorClear(mClusters);
644652
deepVectorClear(mUsedClusters);
645653
deepVectorClear(mUnsortedClusters);

GPU/GPUTracking/Global/GPUChainITS.cxx

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,23 @@ namespace o2::its
2525
class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
2626
{
2727
public:
28-
GPUFrameworkExternalAllocator(GPUMemoryResource::MemoryType type) : mType(type) {}
29-
30-
void* allocate(size_t size) override
28+
void* allocate(size_t size) final
3129
{
3230
return mFWReco->AllocateDirectMemory(size, mType);
3331
}
34-
void deallocate(char* ptr, size_t size) override {}
32+
void deallocate(char* ptr, size_t size) final {} // this is a simple no-op
33+
void pushTagOnStack(uint64_t tag)
34+
{
35+
mFWReco->PushNonPersistentMemory(tag);
36+
}
37+
void popTagOffStack(uint64_t tag)
38+
{
39+
mFWReco->PopNonPersistentMemory(GPUDataTypes::RecoStep::ITSTracking, tag);
40+
}
3541
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
3642

3743
private:
3844
o2::gpu::GPUReconstruction* mFWReco;
39-
GPUMemoryResource::MemoryType mType;
4045
};
4146
} // namespace o2::its
4247

@@ -73,12 +78,9 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe()
7378
}
7479
#if !defined(GPUCA_STANDALONE)
7580
if (mITSTimeFrame->isGPU()) {
76-
mFrameworkDeviceAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_GPU));
77-
mFrameworkDeviceAllocator->setReconstructionFramework(rec());
78-
mITSTimeFrame->setExternalDeviceAllocator(mFrameworkDeviceAllocator.get());
79-
mFrameworkHostAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_HOST));
80-
mFrameworkHostAllocator->setReconstructionFramework(rec());
81-
mITSTimeFrame->setExternalHostAllocator(mFrameworkHostAllocator.get());
81+
mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator());
82+
mFrameworkAllocator->setReconstructionFramework(rec());
83+
mITSTimeFrame->setFrameworkAllocator(mFrameworkAllocator.get());
8284
}
8385
#endif
8486
return mITSTimeFrame.get();

GPU/GPUTracking/Global/GPUChainITS.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,7 @@ class GPUChainITS final : public GPUChain
5353
std::unique_ptr<o2::its::TrackerTraits<7>> mITSTrackerTraits;
5454
std::unique_ptr<o2::its::VertexerTraits<7>> mITSVertexerTraits;
5555
std::unique_ptr<o2::its::TimeFrame<7>> mITSTimeFrame;
56-
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkDeviceAllocator;
57-
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkHostAllocator;
56+
std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkAllocator;
5857
};
5958
} // namespace o2::gpu
6059

0 commit comments

Comments
 (0)