Skip to content

Commit 1b04a9a

Browse files
f3sch and Felix Schlepper authored
ITS: GPU: improve mm and add tests for resource (#14681)
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
Co-authored-by: Felix Schlepper <fschlepp@aliceml.cern.ch>
1 parent: bf44a8f · commit: 1b04a9a

File tree

18 files changed

+474 -246 lines changed

18 files changed

+474 -246 lines changed

Detectors/ITSMFT/ITS/tracking/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -56,3 +56,5 @@ o2_target_root_dictionary(ITStracking
5656
if(CUDA_ENABLED OR HIP_ENABLED)
5757
add_subdirectory(GPU)
5858
endif()
59+
60+
add_subdirectory(test)

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 77 additions & 65 deletions
Large diffs are not rendered by default.

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -33,12 +33,14 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
3333
mTimeFrameGPU->loadVertices(iteration);
3434
mTimeFrameGPU->loadIndexTableUtils(iteration);
3535
mTimeFrameGPU->loadMultiplicityCutMask(iteration);
36+
// pinned on host
3637
mTimeFrameGPU->createUsedClustersDeviceArray(iteration);
3738
mTimeFrameGPU->createClustersDeviceArray(iteration);
3839
mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration);
3940
mTimeFrameGPU->createClustersIndexTablesArray(iteration);
4041
mTimeFrameGPU->createTrackingFrameInfoDeviceArray(iteration);
4142
mTimeFrameGPU->createROFrameClustersDeviceArray(iteration);
43+
// device array
4244
mTimeFrameGPU->createTrackletsLUTDeviceArray(iteration);
4345
mTimeFrameGPU->createTrackletsBuffersArray(iteration);
4446
mTimeFrameGPU->createCellsBuffersArray(iteration);
@@ -106,7 +108,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
106108
mTimeFrameGPU->getPositionResolutions(),
107109
this->mTrkParams[iteration].LayerRadii,
108110
mTimeFrameGPU->getMSangles(),
109-
mTimeFrameGPU->getExternalAllocator(),
111+
mTimeFrameGPU->getExternalDeviceAllocator(),
110112
conf.nBlocksLayerTracklets[iteration],
111113
conf.nThreadsLayerTracklets[iteration],
112114
mTimeFrameGPU->getStreams());
@@ -144,7 +146,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
144146
mTimeFrameGPU->getPositionResolutions(),
145147
this->mTrkParams[iteration].LayerRadii,
146148
mTimeFrameGPU->getMSangles(),
147-
mTimeFrameGPU->getExternalAllocator(),
149+
mTimeFrameGPU->getExternalDeviceAllocator(),
148150
conf.nBlocksLayerTracklets[iteration],
149151
conf.nThreadsLayerTracklets[iteration],
150152
mTimeFrameGPU->getStreams());
@@ -195,7 +197,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
195197
this->mTrkParams[iteration].MaxChi2ClusterAttachment,
196198
this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
197199
this->mTrkParams[iteration].NSigmaCut,
198-
mTimeFrameGPU->getExternalAllocator(),
200+
mTimeFrameGPU->getExternalDeviceAllocator(),
199201
conf.nBlocksLayerCells[iteration],
200202
conf.nThreadsLayerCells[iteration],
201203
mTimeFrameGPU->getStreams());
@@ -251,7 +253,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
251253
currentLayerCellsNum,
252254
nextLayerCellsNum,
253255
1e2,
254-
mTimeFrameGPU->getExternalAllocator(),
256+
mTimeFrameGPU->getExternalDeviceAllocator(),
255257
conf.nBlocksFindNeighbours[iteration],
256258
conf.nThreadsFindNeighbours[iteration],
257259
mTimeFrameGPU->getStream(iLayer));
@@ -279,7 +281,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
279281
mTimeFrameGPU->getDeviceNeighbours(iLayer),
280282
mTimeFrameGPU->getArrayNNeighbours()[iLayer],
281283
mTimeFrameGPU->getStream(iLayer),
282-
mTimeFrameGPU->getExternalAllocator());
284+
mTimeFrameGPU->getExternalDeviceAllocator());
283285
}
284286
mTimeFrameGPU->syncStreams(false);
285287
}
@@ -310,7 +312,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
310312
this->mTrkParams[0].MaxChi2NDF,
311313
mTimeFrameGPU->getDevicePropagator(),
312314
this->mTrkParams[0].CorrType,
313-
mTimeFrameGPU->getExternalAllocator(),
315+
mTimeFrameGPU->getExternalDeviceAllocator(),
314316
conf.nBlocksProcessNeighbours[iteration],
315317
conf.nThreadsProcessNeighbours[iteration]);
316318
}

Detectors/ITSMFT/ITS/tracking/include/ITStracking/BoundedAllocator.h

Lines changed: 35 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@
2222
#include <new>
2323
#include <vector>
2424

25+
#include "ITStracking/ExternalAllocator.h"
26+
2527
#include "GPUCommonLogger.h"
2628

2729
namespace o2::its
@@ -56,6 +58,7 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
5658

5759
BoundedMemoryResource(size_t maxBytes = std::numeric_limits<size_t>::max(), std::pmr::memory_resource* upstream = std::pmr::get_default_resource())
5860
: mMaxMemory(maxBytes), mUpstream(upstream) {}
61+
BoundedMemoryResource(ExternalAllocator* alloc) : mAdaptor(std::make_unique<ExternalAllocatorAdaptor>(alloc)), mUpstream(mAdaptor.get()) {}
5962

6063
void* do_allocate(size_t bytes, size_t alignment) final
6164
{
@@ -69,7 +72,14 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
6972
} while (!mUsedMemory.compare_exchange_weak(current_used, new_used,
7073
std::memory_order_acq_rel,
7174
std::memory_order_relaxed));
72-
return mUpstream->allocate(bytes, alignment);
75+
void* p{nullptr};
76+
try {
77+
p = mUpstream->allocate(bytes, alignment);
78+
} catch (...) {
79+
mUsedMemory.fetch_sub(bytes, std::memory_order_relaxed);
80+
throw;
81+
}
82+
return p;
7383
}
7484

7585
void do_deallocate(void* p, size_t bytes, size_t alignment) final
@@ -87,11 +97,12 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
8797
size_t getMaxMemory() const noexcept { return mMaxMemory; }
8898
void setMaxMemory(size_t max)
8999
{
90-
if (mUsedMemory > max) {
100+
size_t used = mUsedMemory.load(std::memory_order_acquire);
101+
if (used > max) {
91102
++mCountThrow;
92-
throw MemoryLimitExceeded(0, mUsedMemory, max);
103+
throw MemoryLimitExceeded(0, used, max);
93104
}
94-
mMaxMemory = max;
105+
mMaxMemory.store(max, std::memory_order_release);
95106
}
96107

97108
void print() const
@@ -106,76 +117,74 @@ class BoundedMemoryResource final : public std::pmr::memory_resource
106117
}
107118

108119
private:
109-
size_t mMaxMemory{std::numeric_limits<size_t>::max()};
120+
std::atomic<size_t> mMaxMemory{std::numeric_limits<size_t>::max()};
110121
std::atomic<size_t> mCountThrow{0};
111122
std::atomic<size_t> mUsedMemory{0};
112-
std::pmr::memory_resource* mUpstream;
123+
std::unique_ptr<ExternalAllocatorAdaptor> mAdaptor{nullptr};
124+
std::pmr::memory_resource* mUpstream{nullptr};
113125
};
114126

115127
template <typename T>
116128
using bounded_vector = std::pmr::vector<T>;
117129

118130
template <typename T>
119-
void deepVectorClear(std::vector<T>& vec)
131+
inline void deepVectorClear(std::vector<T>& vec)
120132
{
121133
std::vector<T>().swap(vec);
122134
}
123135

124136
template <typename T>
125-
inline void deepVectorClear(bounded_vector<T>& vec, BoundedMemoryResource* bmr = nullptr)
137+
inline void deepVectorClear(bounded_vector<T>& vec, std::pmr::memory_resource* mr = nullptr)
126138
{
139+
std::pmr::memory_resource* tmr = (mr != nullptr) ? mr : vec.get_allocator().resource();
127140
vec.~bounded_vector<T>();
128-
if (bmr == nullptr) {
129-
auto alloc = vec.get_allocator().resource();
130-
new (&vec) bounded_vector<T>(alloc);
131-
} else {
132-
new (&vec) bounded_vector<T>(bmr);
133-
}
141+
new (&vec) bounded_vector<T>(std::pmr::polymorphic_allocator<T>{tmr});
134142
}
135143

136144
template <typename T>
137-
void deepVectorClear(std::vector<bounded_vector<T>>& vec, BoundedMemoryResource* bmr = nullptr)
145+
inline void deepVectorClear(std::vector<bounded_vector<T>>& vec, std::pmr::memory_resource* mr = nullptr)
138146
{
139147
for (auto& v : vec) {
140-
deepVectorClear(v, bmr);
148+
deepVectorClear(v, mr);
141149
}
142150
}
143151

144152
template <typename T, size_t S>
145-
void deepVectorClear(std::array<bounded_vector<T>, S>& arr, BoundedMemoryResource* bmr = nullptr)
153+
inline void deepVectorClear(std::array<bounded_vector<T>, S>& arr, std::pmr::memory_resource* mr = nullptr)
146154
{
147155
for (size_t i{0}; i < S; ++i) {
148-
deepVectorClear(arr[i], bmr);
156+
deepVectorClear(arr[i], mr);
149157
}
150158
}
151159

152160
template <typename T>
153-
void clearResizeBoundedVector(bounded_vector<T>& vec, size_t size, BoundedMemoryResource* bmr, T def = T())
161+
inline void clearResizeBoundedVector(bounded_vector<T>& vec, size_t sz, std::pmr::memory_resource* mr = nullptr, T def = T())
154162
{
163+
std::pmr::memory_resource* tmr = (mr != nullptr) ? mr : vec.get_allocator().resource();
155164
vec.~bounded_vector<T>();
156-
new (&vec) bounded_vector<T>(size, def, bmr);
165+
new (&vec) bounded_vector<T>(sz, def, std::pmr::polymorphic_allocator<T>{tmr});
157166
}
158167

159168
template <typename T>
160-
void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, BoundedMemoryResource* bmr)
169+
inline void clearResizeBoundedVector(std::vector<bounded_vector<T>>& vec, size_t size, std::pmr::memory_resource* mr)
161170
{
162171
vec.clear();
163172
vec.reserve(size);
164-
for (size_t i{0}; i < size; ++i) {
165-
vec.emplace_back(bmr);
173+
for (size_t i = 0; i < size; ++i) {
174+
vec.emplace_back(std::pmr::polymorphic_allocator<bounded_vector<T>>{mr});
166175
}
167176
}
168177

169178
template <typename T, size_t S>
170-
void clearResizeBoundedArray(std::array<bounded_vector<T>, S>& arr, size_t size, BoundedMemoryResource* bmr, T def = T())
179+
inline void clearResizeBoundedArray(std::array<bounded_vector<T>, S>& arr, size_t size, std::pmr::memory_resource* mr = nullptr, T def = T())
171180
{
172181
for (size_t i{0}; i < S; ++i) {
173-
clearResizeBoundedVector(arr[i], size, bmr, def);
182+
clearResizeBoundedVector(arr[i], size, mr, def);
174183
}
175184
}
176185

177186
template <typename T>
178-
std::vector<T> toSTDVector(const bounded_vector<T>& b)
187+
inline std::vector<T> toSTDVector(const bounded_vector<T>& b)
179188
{
180189
std::vector<T> t(b.size());
181190
std::copy(b.cbegin(), b.cend(), t.begin());

Detectors/ITSMFT/ITS/tracking/include/ITStracking/Cluster.h

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,9 @@
1616
#ifndef TRACKINGITSU_INCLUDE_CACLUSTER_H_
1717
#define TRACKINGITSU_INCLUDE_CACLUSTER_H_
1818

19+
#include <array>
20+
#include "ITStracking/Constants.h"
1921
#include "GPUCommonRtypes.h"
20-
#include "GPUCommonArray.h"
2122

2223
namespace o2::its
2324
{
@@ -47,8 +48,8 @@ struct Cluster final {
4748
float zCoordinate{-999.f};
4849
float phi{-999.f};
4950
float radius{-999.f};
50-
int clusterId{-1};
51-
int indexTableBinIndex{-1};
51+
int clusterId{constants::UnusedIndex};
52+
int indexTableBinIndex{constants::UnusedIndex};
5253

5354
ClassDefNV(Cluster, 1);
5455
};
@@ -70,7 +71,7 @@ struct TrackingFrameInfo final {
7071
float zCoordinate{-999.f};
7172
float xTrackingFrame{-999.f};
7273
float alphaTrackingFrame{-999.f};
73-
std::array<float, 2> positionTrackingFrame = {-1., -1.};
74+
std::array<float, 2> positionTrackingFrame = {constants::UnusedIndex, constants::UnusedIndex};
7475
std::array<float, 3> covarianceTrackingFrame = {999., 999., 999.};
7576

7677
ClassDefNV(TrackingFrameInfo, 1);

Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,8 @@
1616
#ifndef TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
1717
#define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
1818

19+
#include <memory_resource>
20+
1921
namespace o2::its
2022
{
2123

@@ -25,6 +27,36 @@ class ExternalAllocator
2527
virtual void* allocate(size_t) = 0;
2628
virtual void deallocate(char*, size_t) = 0;
2729
};
30+
31+
class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
32+
{
33+
public:
34+
explicit ExternalAllocatorAdaptor(ExternalAllocator* alloc) : mAlloc(alloc) {}
35+
36+
protected:
37+
void* do_allocate(size_t bytes, size_t alignment) override
38+
{
39+
void* p = mAlloc->allocate(bytes);
40+
if (!p) {
41+
throw std::bad_alloc();
42+
}
43+
return p;
44+
}
45+
46+
void do_deallocate(void* p, size_t bytes, size_t) override
47+
{
48+
mAlloc->deallocate(static_cast<char*>(p), bytes);
49+
}
50+
51+
bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override
52+
{
53+
return this == &other;
54+
}
55+
56+
private:
57+
ExternalAllocator* mAlloc;
58+
};
59+
2860
} // namespace o2::its
2961

3062
#endif

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h

Lines changed: 21 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -69,8 +69,8 @@ struct TimeFrame {
6969
using CellSeedN = CellSeed<nLayers>;
7070
friend class gpu::TimeFrameGPU<nLayers>;
7171

72-
TimeFrame();
73-
virtual ~TimeFrame();
72+
TimeFrame() = default;
73+
virtual ~TimeFrame() = default;
7474

7575
const Vertex& getPrimaryVertex(const int ivtx) const { return mPrimaryVertices[ivtx]; }
7676
gsl::span<const Vertex> getPrimaryVertices(int rofId) const;
@@ -95,7 +95,7 @@ struct TimeFrame {
9595
gsl::span<const unsigned char>::iterator& pattIt,
9696
const itsmft::TopologyDictionary* dict,
9797
const dataformats::MCTruthContainer<MCCompLabel>* mcLabels = nullptr);
98-
void resetROFrameData();
98+
void resetROFrameData(size_t nROFs);
9999

100100
int getTotalClusters() const;
101101
auto& getTotVertIteration() { return mTotVertPerIteration; }
@@ -188,7 +188,7 @@ struct TimeFrame {
188188
auto getNumberOfUsedExtendedClusters() const { return mNExtendedUsedClusters; }
189189

190190
/// memory management
191-
void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool);
191+
void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool);
192192
auto& getMemoryPool() const noexcept { return mMemoryPool; }
193193
bool checkMemory(unsigned long max) { return getArtefactsMemory() < max; }
194194
unsigned long getArtefactsMemory() const;
@@ -233,33 +233,33 @@ struct TimeFrame {
233233
void setBz(float bz) { mBz = bz; }
234234
float getBz() const { return mBz; }
235235

236-
void setExternalAllocator(ExternalAllocator* allocator)
236+
/// State if memory will be externally managed.
237+
// device
238+
ExternalAllocator* mExtDeviceAllocator{nullptr};
239+
void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; }
240+
ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; }
241+
bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; }
242+
// host
243+
ExternalAllocator* mExtHostAllocator{nullptr};
244+
void setExternalHostAllocator(ExternalAllocator* allocator)
237245
{
238-
if (isGPU()) {
239-
LOGP(debug, "Setting timeFrame allocator to external");
240-
mAllocator = allocator;
241-
} else {
242-
LOGP(fatal, "External allocator is currently only supported for GPU");
243-
}
246+
mExtHostAllocator = allocator;
247+
mExtMemoryPool = std::make_shared<BoundedMemoryResource>(mExtHostAllocator);
244248
}
245-
246-
ExternalAllocator* getExternalAllocator() { return mAllocator; }
247-
248-
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
249-
{
250-
return;
251-
};
249+
ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; }
250+
bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; }
251+
std::shared_ptr<BoundedMemoryResource> mExtMemoryPool;
252+
std::pmr::memory_resource* getMaybeExternalHostResource(bool forceHost = false) { return (hasExternalHostAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); }
253+
// Propagator
252254
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
255+
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) {};
253256

254257
template <typename... T>
255258
void addClusterToLayer(int layer, T&&... args);
256259
template <typename... T>
257260
void addTrackingFrameInfoToLayer(int layer, T&&... args);
258261
void addClusterExternalIndexToLayer(int layer, const int idx) { mClusterExternalIndices[layer].push_back(idx); }
259262

260-
void resetVectors();
261-
void resetTracklets();
262-
263263
/// Debug and printing
264264
void checkTrackletLUTs();
265265
void printROFoffsets();
@@ -290,10 +290,6 @@ struct TimeFrame {
290290
bounded_vector<int> mROFramesPV;
291291
bounded_vector<Vertex> mPrimaryVertices;
292292

293-
// State if memory will be externally managed.
294-
ExternalAllocator* mAllocator = nullptr;
295-
bool getExtAllocator() const noexcept { return mAllocator != nullptr; }
296-
297293
std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;
298294
std::vector<bounded_vector<Tracklet>> mTracklets;
299295
std::vector<bounded_vector<CellSeedN>> mCells;

Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ class Tracker
6666
const LogFunc& = [](const std::string& s) { std::cerr << s << '\n'; });
6767

6868
void setParameters(const std::vector<TrackingParameters>& p) { mTrkParams = p; }
69-
void setMemoryPool(std::shared_ptr<BoundedMemoryResource>& pool) { mMemoryPool = pool; }
69+
void setMemoryPool(std::shared_ptr<BoundedMemoryResource> pool) { mMemoryPool = pool; }
7070
std::vector<TrackingParameters>& getParameters() { return mTrkParams; }
7171
void setBz(float bz) { mTraits->setBz(bz); }
7272
bool isMatLUT() const { return mTraits->isMatLUT(); }

0 commit comments

Comments (0)