Skip to content

Commit c587bac

Browse files
authored
Use typed custom allocator for some thrust containers (#14371)
1 parent 8e06932 commit c587bac

File tree

6 files changed

+74
-29
lines changed

6 files changed

+74
-29
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
namespace o2::its
2020
{
2121
class CellSeed;
22+
class ExternalAllocator;
2223
namespace gpu
2324
{
2425
#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler
@@ -178,7 +179,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
178179

179180
int filterCellNeighboursHandler(gpuPair<int, int>*,
180181
int*,
181-
unsigned int);
182+
unsigned int,
183+
o2::its::ExternalAllocator* = nullptr);
182184

183185
template <int nLayers = 7>
184186
void processNeighboursHandler(const int startLayer,
@@ -191,6 +193,7 @@ void processNeighboursHandler(const int startLayer,
191193
gsl::span<int*> neighboursDeviceLUTs,
192194
const TrackingFrameInfo** foundTrackingFrameInfo,
193195
bounded_vector<CellSeed>& seedsHost,
196+
o2::its::ExternalAllocator*,
194197
const float bz,
195198
const float MaxChi2ClusterAttachment,
196199
const float maxChi2NDF,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "ITStrackingGPU/TrackerTraitsGPU.h"
1919
#include "ITStrackingGPU/TrackingKernels.h"
2020
#include "ITStracking/TrackingConfigParam.h"
21+
2122
namespace o2::its
2223
{
2324
constexpr int UnusedIndex{-1};
@@ -209,7 +210,8 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
209210

210211
filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer),
211212
mTimeFrameGPU->getDeviceNeighbours(iLayer),
212-
nNeigh);
213+
nNeigh,
214+
mTimeFrameGPU->getExternalAllocator());
213215
}
214216
mTimeFrameGPU->createNeighboursDeviceArray();
215217
mTimeFrameGPU->unregisterRest();
@@ -236,6 +238,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
236238
mTimeFrameGPU->getDeviceNeighboursLUTs(),
237239
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
238240
trackSeeds,
241+
mTimeFrameGPU->getExternalAllocator(),
239242
this->mBz,
240243
this->mTrkParams[0].MaxChi2ClusterAttachment,
241244
this->mTrkParams[0].MaxChi2NDF,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,14 @@
2828
#include "ITStracking/Constants.h"
2929
#include "ITStracking/IndexTableUtils.h"
3030
#include "ITStracking/MathUtils.h"
31+
#include "ITStracking/ExternalAllocator.h"
3132
#include "DataFormatsITS/TrackITS.h"
3233
#include "ReconstructionDataFormats/Vertex.h"
3334

3435
#include "ITStrackingGPU/TrackerTraitsGPU.h"
3536
#include "ITStrackingGPU/TrackingKernels.h"
3637
#include "ITStrackingGPU/Utils.h"
3738

38-
#include "GPUCommonHelpers.h"
39-
4039
#ifndef __HIPCC__
4140
#define THRUST_NAMESPACE thrust::cuda
4241
#else
@@ -64,6 +63,37 @@ GPUdii() float Sq(float v)
6463
namespace gpu
6564
{
6665

66+
template <typename T>
67+
class TypedAllocator : public thrust::device_allocator<T>
68+
{
69+
public:
70+
using value_type = T;
71+
using pointer = T*;
72+
73+
template <typename U>
74+
struct rebind {
75+
using other = TypedAllocator<U>;
76+
};
77+
78+
explicit TypedAllocator(ExternalAllocator* allocPtr)
79+
: mInternalAllocator(allocPtr) {}
80+
81+
T* allocate(size_t n)
82+
{
83+
return reinterpret_cast<T*>(mInternalAllocator->allocate(n * sizeof(T)));
84+
}
85+
86+
void deallocate(T* p, size_t n)
87+
{
88+
char* raw_ptr = reinterpret_cast<char*>(p);
89+
size_t bytes = n * sizeof(T);
90+
mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op.
91+
}
92+
93+
private:
94+
ExternalAllocator* mInternalAllocator;
95+
};
96+
6797
GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
6898
const o2::its::IndexTableUtils& utils,
6999
const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
@@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
11171147
11181148
int filterCellNeighboursHandler(gpuPair<int, int>* cellNeighbourPairs,
11191149
int* cellNeighbours,
1120-
unsigned int nNeigh)
1150+
unsigned int nNeigh,
1151+
o2::its::ExternalAllocator* allocator)
11211152
{
11221153
thrust::device_ptr<gpuPair<int, int>> neighVectorPairs(cellNeighbourPairs);
11231154
thrust::device_ptr<int> validNeighs(cellNeighbours);
@@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer,
11401171
gsl::span<int*> neighboursDeviceLUTs,
11411172
const TrackingFrameInfo** foundTrackingFrameInfo,
11421173
bounded_vector<CellSeed>& seedsHost,
1174+
o2::its::ExternalAllocator* allocator,
11431175
const float bz,
11441176
const float maxChi2ClusterAttachment,
11451177
const float maxChi2NDF,
@@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer,
11481180
const int nBlocks,
11491181
const int nThreads)
11501182
{
1151-
thrust::device_vector<int> foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
1152-
// TODO: fix this.
1183+
auto allocInt = gpu::TypedAllocator<int>(allocator);
1184+
auto allocCellSeed = gpu::TypedAllocator<CellSeed>(allocator);
1185+
thrust::device_vector<int, gpu::TypedAllocator<int>> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency.
1186+
// TODO: fix this.
11531187
11541188
gpu::processNeighboursKernel<true><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
11551189
o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
@@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer,
11721206
matCorrType);
11731207
gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1);
11741208
1175-
thrust::device_vector<int> updatedCellId(foundSeedsTable.back());
1176-
thrust::device_vector<CellSeed> updatedCellSeed(foundSeedsTable.back());
1209+
thrust::device_vector<int, gpu::TypedAllocator<int>> updatedCellId(foundSeedsTable.back(), 0, allocInt);
1210+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> updatedCellSeed(foundSeedsTable.back(), allocCellSeed);
11771211
gpu::processNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
11781212
o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
11791213
startLayer,
@@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer,
11951229
matCorrType);
11961230
11971231
int level = startLevel;
1198-
thrust::device_vector<int> lastCellId;
1199-
thrust::device_vector<CellSeed> lastCellSeed;
1232+
thrust::device_vector<int, gpu::TypedAllocator<int>> lastCellId(allocInt);
1233+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> lastCellSeed(allocCellSeed);
12001234
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
12011235
lastCellSeed.swap(updatedCellSeed);
12021236
lastCellId.swap(updatedCellId);
1203-
thrust::device_vector<CellSeed>().swap(updatedCellSeed);
1204-
thrust::device_vector<int>().swap(updatedCellId);
1237+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>>(allocCellSeed).swap(updatedCellSeed);
1238+
thrust::device_vector<int, gpu::TypedAllocator<int>>(allocInt).swap(updatedCellId);
12051239
auto lastCellSeedSize{lastCellSeed.size()};
12061240
foundSeedsTable.resize(lastCellSeedSize + 1);
12071241
thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0);
@@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer,
12531287
propagator,
12541288
matCorrType);
12551289
}
1256-
1257-
thrust::device_vector<CellSeed> outSeeds(updatedCellSeed.size());
1290+
thrust::device_vector<CellSeed, gpu::TypedAllocator<CellSeed>> outSeeds(updatedCellSeed.size(), allocCellSeed);
12581291
auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5)));
12591292
auto s{end - outSeeds.begin()};
12601293
seedsHost.reserve(seedsHost.size() + s);
@@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer,
13671400
gsl::span<int*> neighboursDeviceLUTs,
13681401
const TrackingFrameInfo** foundTrackingFrameInfo,
13691402
bounded_vector<CellSeed>& seedsHost,
1403+
o2::its::ExternalAllocator*,
13701404
const float bz,
13711405
const float maxChi2ClusterAttachment,
13721406
const float maxChi2NDF,

Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ class ExternalAllocator
2323
{
2424
public:
2525
virtual void* allocate(size_t) = 0;
26+
virtual void deallocate(char*, size_t) = 0;
2627
};
27-
2828
} // namespace o2::its
2929

3030
#endif

Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,23 @@ struct TimeFrame {
230230
void setBz(float bz) { mBz = bz; }
231231
float getBz() const { return mBz; }
232232

233-
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*) { return; }
233+
void setExternalAllocator(ExternalAllocator* allocator)
234+
{
235+
if (mIsGPU) {
236+
LOGP(debug, "Setting timeFrame allocator to external");
237+
mAllocator = allocator;
238+
mExtAllocator = true; // to be removed
239+
} else {
240+
LOGP(fatal, "External allocator is currently only supported for GPU");
241+
}
242+
}
243+
244+
ExternalAllocator* getExternalAllocator() { return mAllocator; }
245+
246+
virtual void setDevicePropagator(const o2::base::PropagatorImpl<float>*)
247+
{
248+
return;
249+
};
234250
const o2::base::PropagatorImpl<float>* getDevicePropagator() const { return mPropagatorDevice; }
235251

236252
template <typename... T>
@@ -277,17 +293,6 @@ struct TimeFrame {
277293
// State if memory will be externally managed.
278294
bool mExtAllocator = false;
279295
ExternalAllocator* mAllocator = nullptr;
280-
void setExternalAllocator(ExternalAllocator* allocator)
281-
{
282-
if (mIsGPU) {
283-
LOGP(debug, "Setting timeFrame allocator to external");
284-
mAllocator = allocator;
285-
mExtAllocator = true; // to be removed
286-
} else {
287-
LOGP(fatal, "External allocator is currently only supported for GPU");
288-
}
289-
}
290-
void setExtAllocator(bool ext) { mExtAllocator = ext; }
291296
bool getExtAllocator() const { return mExtAllocator; }
292297

293298
std::array<bounded_vector<Cluster>, nLayers> mUnsortedClusters;

GPU/GPUTracking/Global/GPUChainITS.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
3030
{
3131
return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
3232
}
33-
33+
void deallocate(char* ptr, size_t) override {}
3434
void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
3535

3636
private:

0 commit comments

Comments
 (0)