Skip to content

Commit ff486d4

Browse files
committed
tracklets on gpu
1 parent 208ea84 commit ff486d4

File tree

6 files changed

+66
-70
lines changed

6 files changed

+66
-70
lines changed

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,13 +138,14 @@ class TimeFrameGPU : public TimeFrame
138138
// Host-specific getters
139139
gsl::span<int> getHostNTracklets(const int chunkId);
140140
gsl::span<int> getHostNCells(const int chunkId);
141+
gsl::span<int, nLayers - 1> getNTracklets() { return mNTracklets; }
142+
gsl::span<int, nLayers - 2> getNCells() { return mNCells; }
141143

142144
// Host-available device getters
143145
gsl::span<int*> getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; }
144146
gsl::span<int*> getDeviceCellLUTs() { return mCellsLUTDevice; }
145147
gsl::span<Tracklet*> getDeviceTracklet() { return mTrackletsDevice; }
146148
gsl::span<CellSeed*> getDeviceCells() { return mCellsDevice; }
147-
gsl::span<int, nLayers - 2> getNCellsDevice() { return mNCells; }
148149

149150
private:
150151
void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations

Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
9797
const unsigned char** usedClusters,
9898
const int** clustersIndexTables,
9999
Tracklet** tracklets,
100+
gsl::span<Tracklet*> spanTracklets,
101+
gsl::span<int> nTracklets,
100102
int** trackletsLUTs,
103+
gsl::span<int*> trackletsLUTsHost,
101104
const int iteration,
102105
const float NSigmaCut,
103106
std::vector<float>& phiCuts,

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -489,13 +489,6 @@ void TimeFrameGPU<nLayers>::unregisterRest()
489489
LOGP(debug, "unregistering rest of the host memory...");
490490
checkGPUError(cudaHostUnregister(mCellsDevice.data()));
491491
checkGPUError(cudaHostUnregister(mTrackletsDevice.data()));
492-
checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data()));
493-
for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
494-
if (iLayer < nLayers - 2) {
495-
checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data()));
496-
}
497-
checkGPUError(cudaHostUnregister(mTracklets[iLayer].data()));
498-
}
499492
STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
500493
}
501494

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,10 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
144144
mTimeFrameGPU->getDeviceArrayUsedClusters(),
145145
mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
146146
mTimeFrameGPU->getDeviceArrayTracklets(),
147+
mTimeFrameGPU->getDeviceTracklet(),
148+
mTimeFrameGPU->getNTracklets(),
147149
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
150+
mTimeFrameGPU->getDeviceTrackletsLUTs(),
148151
iteration,
149152
mTrkParams[iteration].NSigmaCut,
150153
mTimeFrameGPU->getPhiCuts(),
@@ -161,25 +164,22 @@ void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int
161164
template <int nLayers>
162165
void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
163166
{
164-
mTimeFrameGPU->loadTrackletsDevice();
165-
mTimeFrameGPU->loadTrackletsLUTDevice();
166167
mTimeFrameGPU->createCellsLUTDevice();
167168
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
168169

169-
// #pragma omp parallel for num_threads(nLayers)
170170
for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
171-
if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() ||
172-
mTimeFrameGPU->getTracklets()[iLayer].empty()) {
171+
if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
172+
LOGP(info, "continuing here");
173173
continue;
174174
}
175-
176-
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getTracklets()[iLayer].size())};
175+
LOGP(info, "+> {}", mTimeFrameGPU->getNTracklets()[iLayer]);
176+
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
177177
countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
178178
mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
179179
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
180180
mTimeFrameGPU->getDeviceArrayTracklets(),
181181
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
182-
mTimeFrameGPU->getTracklets()[iLayer].size(),
182+
mTimeFrameGPU->getNTracklets()[iLayer],
183183
iLayer,
184184
nullptr,
185185
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -196,7 +196,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
196196
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
197197
mTimeFrameGPU->getDeviceArrayTracklets(),
198198
mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
199-
mTimeFrameGPU->getTracklets()[iLayer].size(),
199+
mTimeFrameGPU->getNTracklets()[iLayer],
200200
iLayer,
201201
mTimeFrameGPU->getDeviceCells()[iLayer],
202202
mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -220,7 +220,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
220220
auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
221221
std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
222222
for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
223-
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])};
223+
const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
224224
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
225225
mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
226226

@@ -283,7 +283,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
283283
std::vector<int> lastCellId, updatedCellId;
284284
std::vector<CellSeed> lastCellSeed, updatedCellSeed;
285285

286-
processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
286+
processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
287287

288288
int level = startLevel;
289289
for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
@@ -337,8 +337,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
337337
if (track.getClusterIndex(iLayer) == UnusedIndex) {
338338
continue;
339339
}
340-
nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
341-
isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
340+
nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
341+
isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
342342
}
343343

344344
if (nShared > mTrkParams[0].ClusterSharing) {
@@ -350,8 +350,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
350350
if (track.getClusterIndex(iLayer) == UnusedIndex) {
351351
continue;
352352
}
353-
mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
354-
int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer));
353+
mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
354+
int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer));
355355
for (int iR{0}; iR < 3; ++iR) {
356356
if (rofs[iR] == INT_MAX) {
357357
rofs[iR] = currentROF;
@@ -367,7 +367,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
367367
if (rofs[1] != INT_MAX) {
368368
track.setNextROFbit();
369369
}
370-
mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
370+
mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
371371
}
372372
}
373373
mTimeFrameGPU->loadUsedClustersDevice();

Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
192192
0.f, 0.f, 0.f, 0.f, sg2q2pt});
193193
}
194194

195+
// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
196+
// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };
197+
198+
struct sort_tracklets {
199+
GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
200+
};
201+
202+
struct equal_tracklets {
203+
GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }
204+
};
205+
195206
template <typename T1, typename T2>
196207
struct pair_to_first : public thrust::unary_function<gpuPair<T1, T2>, T1> {
197208
GPUhd() int operator()(const gpuPair<T1, T2>& a) const
@@ -686,10 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
686697
const int nTracklets)
687698
{
688699
for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
689-
auto& tracklet{tracklets[currentTrackletIndex]};
690-
if (tracklet.firstClusterIndex >= 0) {
691-
atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1);
692-
}
700+
atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
693701
}
694702
}
695703

@@ -808,7 +816,10 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
808816
const unsigned char** usedClusters,
809817
const int** clustersIndexTables,
810818
Tracklet** tracklets,
819+
gsl::span<Tracklet*> spanTracklets,
820+
gsl::span<int> nTracklets,
811821
int** trackletsLUTs,
822+
gsl::span<int*> trackletsLUTsHost,
812823
const int iteration,
813824
const float NSigmaCut,
814825
std::vector<float>& phiCuts,
@@ -848,8 +859,31 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
848859
resolutions[iLayer],
849860
radii[iLayer + 1] - radii[iLayer],
850861
mulScatAng[iLayer]);
851-
gpuCheckError(cudaPeekAtLastError());
852-
gpuCheckError(cudaDeviceSynchronize());
862+
thrust::device_ptr<Tracklet> tracklets_ptr(spanTracklets[iLayer]);
863+
thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
864+
auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
865+
nTracklets[iLayer] = unique_end - tracklets_ptr;
866+
LOGP(info, "=> {} {}", nTracklets[iLayer], unique_end - tracklets_ptr);
867+
if (iLayer > 0) {
868+
gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
869+
gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
870+
void* d_temp_storage = nullptr;
871+
size_t temp_storage_bytes = 0;
872+
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage
873+
temp_storage_bytes, // temp_storage_bytes
874+
trackletsLUTsHost[iLayer], // d_in
875+
trackletsLUTsHost[iLayer], // d_out
876+
nClusters[iLayer] + 1, // num_items
877+
0)); // NOLINT: this is the offset of the sum, not a pointer
878+
discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
879+
gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage
880+
temp_storage_bytes, // temp_storage_bytes
881+
trackletsLUTsHost[iLayer], // d_in
882+
trackletsLUTsHost[iLayer], // d_out
883+
nClusters[iLayer] + 1, // num_items
884+
0)); // NOLINT: this is the offset of the sum, not a pointer
885+
gpuCheckError(cudaFree(d_temp_storage));
886+
}
853887
}
854888
}
855889

@@ -1127,7 +1161,10 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
11271161
const unsigned char** usedClusters,
11281162
const int** clustersIndexTables,
11291163
Tracklet** tracklets,
1164+
gsl::span<Tracklet*> spanTracklets,
1165+
gsl::span<int> nTracklets,
11301166
int** trackletsLUTs,
1167+
gsl::span<int*> trackletsLUTsHost,
11311168
const int iteration,
11321169
const float NSigmaCut,
11331170
std::vector<float>& phiCuts,

Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -193,56 +193,17 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
193193
}
194194
}
195195
}
196-
// if (rof0 == 81) {
197-
// printf("CPU layer: %d -> %f %f %f %f %f %f %f %f\n",
198-
// iLayer,
199-
// mTrkParams[iteration].NSigmaCut,
200-
// tf->getPhiCut(iLayer),
201-
// mTrkParams[iteration].PVres,
202-
// tf->getMinR(iLayer + 1),
203-
// tf->getMaxR(iLayer + 1),
204-
// tf->getPositionResolution(iLayer),
205-
// meanDeltaR,
206-
// tf->getMSangle(iLayer));
207-
// }
208196
}
209197
}
210198
if (!tf->checkMemory(mTrkParams[iteration].MaxMemory)) {
211199
return;
212200
}
213201

214-
// for (auto iLayer{0}; iLayer < tf->getTracklets().size(); ++iLayer) {
215-
// std::cout << "tracklets layer " << iLayer << ": " << tf->getTracklets()[iLayer].size() << std::endl;
216-
// }
217-
218-
// for (auto iLayer{0}; iLayer < tf->getTrackletsLookupTable().size(); ++iLayer) {
219-
// auto lut = tf->getTrackletsLookupTable()[iLayer];
220-
// for (unsigned int iC{0}; iC < lut.size(); ++iC) {
221-
// if (!(iC % 150)) {
222-
// printf("\n row %d: ===> %d/%d\t", iLayer, iC, (int)lut.size());
223-
// }
224-
// printf("%d\t", lut[iC]);
225-
// }
226-
// }
227-
228-
// for (auto rofId{0}; rofId < 2304; ++rofId) {
229-
// int nClus = tf->getClustersOnLayer(rofId, 1).size();
230-
// if (!nClus) {
231-
// continue;
232-
// }
233-
// printf("rof: %d (%d) ==> ", rofId, nClus);
234-
235-
// for (int iC{0}; iC < nClus; ++iC) {
236-
// int nT = tf->getTrackletsLookupTable()[0][tf->getSortedIndex(rofId, 1, iC)];
237-
// printf("%d\t", nT);
238-
// }
239-
// printf("\n");
240-
// }
241-
242202
#pragma omp parallel for num_threads(mNThreads)
243203
for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
244204
/// Sort tracklets
245205
auto& trkl{tf->getTracklets()[iLayer + 1]};
206+
auto oldsize{trkl.size()};
246207
std::sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) {
247208
return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex);
248209
});
@@ -265,6 +226,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
265226
/// Compute LUT
266227
std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0);
267228
lut.push_back(trkl.size());
229+
LOGP(info, "CPU layer {} -> old size: {} - new size: {}", iLayer, oldsize, trkl.size());
268230
}
269231
/// Layer 0 is done outside the loop
270232
std::sort(tf->getTracklets()[0].begin(), tf->getTracklets()[0].end(), [](const Tracklet& a, const Tracklet& b) {

0 commit comments

Comments
 (0)