Skip to content

Commit 81e1e5e

Browse files
committed
Checkpointing
1 parent a0a57b6 commit 81e1e5e

File tree

7 files changed

+269
-152
lines changed

7 files changed

+269
-152
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,14 @@ class TimeFrameGPU : public TimeFrame
5151
void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
5252
void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
5353
void initDeviceSAFitting();
54+
void loadIndexTableUtils(const int);
5455
void loadTrackingFrameInfoDevice(const int);
5556
void loadUnsortedClustersDevice(const int);
5657
void loadClustersDevice(const int);
57-
void loadROframeClustersDevice(const int iteration);
58+
void loadClustersIndexTables(const int iteration);
59+
void createUsedClustersDevice(const int);
60+
void loadUsedClustersDevice();
61+
void loadROframeClustersDevice(const int);
5862
void loadMultiplicityCutMask(const int);
5963
void loadVertices(const int);
6064

@@ -112,6 +116,8 @@ class TimeFrameGPU : public TimeFrame
112116
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
113117
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
114118
const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
119+
const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
120+
const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
115121
const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
116122
const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
117123
const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
@@ -148,17 +154,20 @@ class TimeFrameGPU : public TimeFrame
148154
// Device pointers
149155
StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
150156
IndexTableUtils* mIndexTableUtilsDevice;
151-
std::array<unsigned char*, nLayers> mUsedClustersDevice;
152157

153158
// Hybrid pref
154159
uint8_t* mMultMaskDevice;
155160
Vertex* mPrimaryVerticesDevice;
156161
int* mROFramesPVDevice;
157162
std::array<Cluster*, nLayers> mClustersDevice;
158163
std::array<Cluster*, nLayers> mUnsortedClustersDevice;
164+
std::array<int*, nLayers> mClustersIndexTablesDevice;
165+
std::array<unsigned char*, nLayers> mUsedClustersDevice;
159166
std::array<int*, nLayers> mROFramesClustersDevice;
160167
const Cluster** mClustersDeviceArray;
161168
const Cluster** mUnsortedClustersDeviceArray;
169+
const int** mClustersIndexTablesDeviceArray;
170+
const unsigned char** mUsedClustersDeviceArray;
162171
const int** mROFrameClustersDeviceArray;
163172
std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
164173
const Tracklet** mTrackletsDeviceArray;

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ GPUg() void fitTrackSeedsKernel(
5151
} // namespace gpu
5252

5353
template <int nLayers = 7>
54-
void computeTrackletsInRofsHandler(const uint8_t* multMask,
54+
void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
55+
const uint8_t* multMask,
5556
const int startROF,
5657
const int endROF,
5758
const int maxROF,
@@ -62,6 +63,15 @@ void computeTrackletsInRofsHandler(const uint8_t* multMask,
6263
const int nVertices,
6364
const Cluster** clusters,
6465
const int** ROFClusters,
66+
const unsigned char** usedClusters,
67+
const int** clustersIndexTables,
68+
const int iteration,
69+
const float NSigmaCut,
70+
std::vector<float>& phiCuts,
71+
const float resolutionPV,
72+
std::vector<float>& minR,
73+
std::vector<float>& maxR,
74+
std::vector<float>& resolutions,
6575
std::vector<float>& radii,
6676
std::vector<float>& mulScatAng,
6777
const int nBlocks,

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ struct gpuSpan {
3939
using ref = T&;
4040

4141
GPUd() gpuSpan() : _data(nullptr), _size(0) {}
42-
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
43-
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
44-
GPUd() std::size_t size() const { return _size; }
42+
GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
43+
GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
44+
GPUd() unsigned int size() const { return _size; }
4545
GPUd() bool empty() const { return _size == 0; }
4646
GPUd() ref front() const { return _data[0]; }
4747
GPUd() ref back() const { return _data[_size - 1]; }
@@ -50,7 +50,7 @@ struct gpuSpan {
5050

5151
protected:
5252
ptr _data;
53-
std::size_t _size;
53+
unsigned int _size;
5454
};
5555

5656
template <typename T>
@@ -60,10 +60,10 @@ struct gpuSpan<const T> {
6060
using ref = const T&;
6161

6262
GPUd() gpuSpan() : _data(nullptr), _size(0) {}
63-
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
63+
GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
6464
GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
65-
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
66-
GPUd() std::size_t size() const { return _size; }
65+
GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
66+
GPUd() unsigned int size() const { return _size; }
6767
GPUd() bool empty() const { return _size == 0; }
6868
GPUd() ref front() const { return _data[0]; }
6969
GPUd() ref back() const { return _data[_size - 1]; }
@@ -72,7 +72,7 @@ struct gpuSpan<const T> {
7272

7373
protected:
7474
ptr _data;
75-
std::size_t _size;
75+
unsigned int _size;
7676
};
7777

7878
enum class Task {

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,19 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
9292
mPropagatorDevice = propagator;
9393
}
9494

95+
template <int nLayers>
96+
void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
97+
{
98+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
99+
if (!iteration) {
100+
LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
101+
allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
102+
}
103+
LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
104+
checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
105+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
106+
}
107+
95108
template <int nLayers>
96109
void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
97110
{
@@ -128,13 +141,56 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
128141
}
129142
}
130143

144+
template <int nLayers>
145+
void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
146+
{
147+
if (!iteration) {
148+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
149+
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
150+
LOGP(info, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
151+
allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
152+
checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
153+
}
154+
allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator());
155+
checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
156+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
157+
}
158+
}
159+
160+
template <int nLayers>
161+
void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
162+
{
163+
if (!iteration) {
164+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
165+
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
166+
LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
167+
allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator());
168+
checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
169+
}
170+
allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
171+
checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
172+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
173+
}
174+
}
175+
176+
template <int nLayers>
177+
void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
178+
{
179+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
180+
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
181+
LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB);
182+
checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
183+
}
184+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
185+
}
186+
131187
template <int nLayers>
132188
void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
133189
{
134190
if (!iteration) {
135191
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
136192
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
137-
LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
193+
LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
138194
allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
139195
checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
140196
}
@@ -167,7 +223,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
167223
{
168224
if (!iteration) {
169225
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
170-
LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
226+
LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
171227
allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
172228
checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
173229
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
@@ -179,10 +235,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
179235
{
180236
if (!iteration) {
181237
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
182-
LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
238+
LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
183239
allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
184240
checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
185-
LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
241+
LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
186242
allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
187243
checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
188244
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,13 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
3131
mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
3232
mTimeFrameGPU->loadClustersDevice(iteration);
3333
mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
34+
mTimeFrameGPU->loadClustersIndexTables(iteration);
3435
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
3536
mTimeFrameGPU->loadMultiplicityCutMask(iteration);
3637
mTimeFrameGPU->loadVertices(iteration);
3738
mTimeFrameGPU->loadROframeClustersDevice(iteration);
39+
mTimeFrameGPU->createUsedClustersDevice(iteration);
40+
mTimeFrameGPU->loadIndexTableUtils(iteration);
3841
}
3942

4043
template <int nLayers>
@@ -95,7 +98,8 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
9598
int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
9699
int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
97100

98-
computeTrackletsInRofsHandler<nLayers>(mTimeFrameGPU->getDeviceMultCutMask(),
101+
computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
102+
mTimeFrameGPU->getDeviceMultCutMask(),
99103
startROF,
100104
endROF,
101105
mTimeFrameGPU->getNrof(),
@@ -106,6 +110,15 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
106110
mTimeFrameGPU->getPrimaryVerticesNum(),
107111
mTimeFrameGPU->getDeviceArrayClusters(),
108112
mTimeFrameGPU->getDeviceROframeClusters(),
113+
mTimeFrameGPU->getDeviceArrayUsedClusters(),
114+
mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
115+
iteration,
116+
mTrkParams[iteration].NSigmaCut,
117+
mTimeFrameGPU->getPhiCuts(),
118+
mTrkParams[iteration].PVres,
119+
mTimeFrameGPU->getMinRs(),
120+
mTimeFrameGPU->getMaxRs(),
121+
mTimeFrameGPU->getPositionResolutions(),
109122
mTrkParams[iteration].LayerRadii,
110123
mTimeFrameGPU->getMSangles(),
111124
conf.nBlocks,
@@ -324,6 +337,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
324337
mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
325338
}
326339
}
340+
mTimeFrameGPU->loadUsedClustersDevice();
327341
if (iteration == mTrkParams.size() - 1) {
328342
mTimeFrameGPU->unregisterHostMemory(0);
329343
}

0 commit comments

Comments
 (0)