Skip to content

Commit a0a57b6

Browse files
committed
Debugging getSpan
1 parent 5b99ec7 commit a0a57b6

File tree

8 files changed

+303
-298
lines changed

8 files changed

+303
-298
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,10 @@ class TimeFrameGPU : public TimeFrame
5454
void loadTrackingFrameInfoDevice(const int);
5555
void loadUnsortedClustersDevice(const int);
5656
void loadClustersDevice(const int);
57+
void loadROframeClustersDevice(const int iteration);
5758
void loadMultiplicityCutMask(const int);
59+
void loadVertices(const int);
60+
5861
///
5962
void loadTrackletsDevice();
6063
void loadTrackletsLUTDevice();
@@ -95,7 +98,7 @@ class TimeFrameGPU : public TimeFrame
9598
std::vector<std::vector<o2::MCCompLabel>>& getLabelsInChunks() { return mLabelsInChunks; }
9699
int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones.
97100
StaticTrackingParameters<nLayers>* getDeviceTrackingParameters() { return mTrackingParamsDevice; }
98-
Vertex* getDeviceVertices() { return mVerticesDevice; }
101+
Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; }
99102
int* getDeviceROFramesPV() { return mROFramesPVDevice; }
100103
unsigned char* getDeviceUsedClusters(const int);
101104
const o2::base::Propagator* getChainPropagator();
@@ -109,6 +112,7 @@ class TimeFrameGPU : public TimeFrame
109112
const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
110113
const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
111114
const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
115+
const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
112116
const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
113117
const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
114118
int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
@@ -118,7 +122,7 @@ class TimeFrameGPU : public TimeFrame
118122
o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
119123
float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
120124
int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
121-
bool* getDevicemMultMask() { return mMultMaskDevice; }
125+
uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; }
122126

123127
void setDevicePropagator(const o2::base::PropagatorImpl<float>*) override;
124128

@@ -144,17 +148,18 @@ class TimeFrameGPU : public TimeFrame
144148
// Device pointers
145149
StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
146150
IndexTableUtils* mIndexTableUtilsDevice;
147-
std::array<int*, nLayers> mROFramesClustersDevice;
148151
std::array<unsigned char*, nLayers> mUsedClustersDevice;
149-
Vertex* mVerticesDevice;
150-
int* mROFramesPVDevice;
151152

152153
// Hybrid pref
153-
bool* mMultMaskDevice;
154+
uint8_t* mMultMaskDevice;
155+
Vertex* mPrimaryVerticesDevice;
156+
int* mROFramesPVDevice;
154157
std::array<Cluster*, nLayers> mClustersDevice;
155158
std::array<Cluster*, nLayers> mUnsortedClustersDevice;
159+
std::array<int*, nLayers> mROFramesClustersDevice;
156160
const Cluster** mClustersDeviceArray;
157161
const Cluster** mUnsortedClustersDeviceArray;
162+
const int** mROFrameClustersDeviceArray;
158163
std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
159164
const Tracklet** mTrackletsDeviceArray;
160165
const int** mTrackletsLUTDeviceArray;

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,19 @@ GPUg() void fitTrackSeedsKernel(
5151
} // namespace gpu
5252

5353
template <int nLayers = 7>
54-
void computeTrackletsInRofsHandler(const int startROF,
54+
void computeTrackletsInRofsHandler(const uint8_t* multMask,
55+
const int startROF,
5556
const int endROF,
57+
const int maxROF,
58+
const int deltaROF,
5659
const int vertexId,
60+
const Vertex* vertices,
61+
const int* rofPV,
62+
const int nVertices,
63+
const Cluster** clusters,
64+
const int** ROFClusters,
65+
std::vector<float>& radii,
66+
std::vector<float>& mulScatAng,
5767
const int nBlocks,
5868
const int nThreads);
5969

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,51 @@ struct gpuPair {
2929
T2 second;
3030
};
3131

32+
namespace gpu
33+
{
34+
// Poor man implementation of a span-like struct. It is very limited.
3235
template <typename T>
3336
struct gpuSpan {
37+
using value_type = T;
38+
using ptr = T*;
39+
using ref = T&;
40+
3441
GPUd() gpuSpan() : _data(nullptr), _size(0) {}
35-
GPUd() gpuSpan(T* data, size_t size) : _data(data), _size(size) {}
36-
GPUd() gpuSpan(const T* data, size_t size) : _data(data), _size(size) {}
37-
GPUd() T& operator[](size_t idx) const { return _data[idx]; }
38-
GPUd() size_t size() const { return _size; }
42+
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
43+
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
44+
GPUd() std::size_t size() const { return _size; }
3945
GPUd() bool empty() const { return _size == 0; }
40-
GPUd() T& front() const { return _data[0]; }
41-
GPUd() T& back() const { return _data[_size - 1]; }
42-
GPUd() T* begin() const { return _data; }
43-
GPUd() T* end() const { return _data + _size; }
46+
GPUd() ref front() const { return _data[0]; }
47+
GPUd() ref back() const { return _data[_size - 1]; }
48+
GPUd() ptr begin() const { return _data; }
49+
GPUd() ptr end() const { return _data + _size; }
4450

4551
protected:
46-
const T* _data;
47-
size_t _size;
52+
ptr _data;
53+
std::size_t _size;
4854
};
4955

50-
namespace gpu
51-
{
56+
template <typename T>
57+
struct gpuSpan<const T> {
58+
using value_type = T;
59+
using ptr = const T*;
60+
using ref = const T&;
61+
62+
GPUd() gpuSpan() : _data(nullptr), _size(0) {}
63+
GPUd() gpuSpan(ptr data, std::size_t dim) : _data(data), _size(dim) {}
64+
GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
65+
GPUd() ref operator[](std::size_t idx) const { return _data[idx]; }
66+
GPUd() std::size_t size() const { return _size; }
67+
GPUd() bool empty() const { return _size == 0; }
68+
GPUd() ref front() const { return _data[0]; }
69+
GPUd() ref back() const { return _data[_size - 1]; }
70+
GPUd() ptr begin() const { return _data; }
71+
GPUd() ptr end() const { return _data + _size; }
72+
73+
protected:
74+
ptr _data;
75+
std::size_t _size;
76+
};
5277

5378
enum class Task {
5479
Tracker = 0,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,22 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
128128
}
129129
}
130130

131+
template <int nLayers>
132+
void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
133+
{
134+
if (!iteration) {
135+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
136+
for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
137+
LOGP(info, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
138+
allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
139+
checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
140+
}
141+
allocMemAsync(reinterpret_cast<void**>(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator());
142+
checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
143+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
144+
}
145+
}
146+
131147
template <int nLayers>
132148
void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
133149
{
@@ -149,14 +165,30 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
149165
template <int nLayers>
150166
void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
151167
{
152-
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
153168
if (!iteration) {
169+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
154170
LOGP(info, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
155-
allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(bool), nullptr, getExtAllocator());
171+
allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
156172
checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
173+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
157174
}
158-
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
159175
}
176+
177+
template <int nLayers>
178+
void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
179+
{
180+
if (!iteration) {
181+
START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
182+
LOGP(info, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
183+
allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
184+
checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
185+
LOGP(info, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
186+
allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
187+
checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
188+
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
189+
}
190+
}
191+
160192
template <int nLayers>
161193
void TimeFrameGPU<nLayers>::loadTrackletsDevice()
162194
{

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
3333
mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
3434
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
3535
mTimeFrameGPU->loadMultiplicityCutMask(iteration);
36+
mTimeFrameGPU->loadVertices(iteration);
37+
mTimeFrameGPU->loadROframeClustersDevice(iteration);
3638
}
3739

3840
template <int nLayers>
@@ -92,9 +94,20 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
9294
gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
9395
int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
9496
int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
95-
computeTrackletsInRofsHandler<nLayers>(startROF,
97+
98+
computeTrackletsInRofsHandler<nLayers>(mTimeFrameGPU->getDeviceMultCutMask(),
99+
startROF,
96100
endROF,
101+
mTimeFrameGPU->getNrof(),
102+
mTrkParams[iteration].DeltaROF,
97103
iVertex,
104+
mTimeFrameGPU->getDeviceVertices(),
105+
mTimeFrameGPU->getDeviceROFramesPV(),
106+
mTimeFrameGPU->getPrimaryVerticesNum(),
107+
mTimeFrameGPU->getDeviceArrayClusters(),
108+
mTimeFrameGPU->getDeviceROframeClusters(),
109+
mTrkParams[iteration].LayerRadii,
110+
mTimeFrameGPU->getMSangles(),
98111
conf.nBlocks,
99112
conf.nThreads);
100113
}

0 commit comments

Comments
 (0)