Skip to content

Commit 8327107

Browse files
committed
ITS: recover single threaded performance in computeLayerCells
Signed-off-by: Felix Schlepper <felix.schlepper@cern.ch>
1 parent fe3894b commit 8327107

File tree

1 file changed

+107
-158
lines changed

1 file changed

+107
-158
lines changed

Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx

Lines changed: 107 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
///
1515

1616
#include <algorithm>
17-
#include <cwctype>
1817
#include <iostream>
1918
#include <iterator>
2019
#include <ranges>
@@ -284,183 +283,133 @@ void TrackerTraits<nLayers>::computeLayerCells(const int iteration)
284283
}
285284

286285
mTaskArena->execute([&] {
287-
tbb::parallel_for(
288-
tbb::blocked_range<int>(0, mTrkParams[iteration].CellsPerRoad()),
289-
[&](const tbb::blocked_range<int>& Layers) {
290-
for (int iLayer = Layers.begin(); iLayer < Layers.end(); ++iLayer) {
291-
292-
if (mTimeFrame->getTracklets()[iLayer + 1].empty() ||
293-
mTimeFrame->getTracklets()[iLayer].empty()) {
294-
continue;
295-
}
296-
297-
#ifdef OPTIMISATION_OUTPUT
298-
float resolution{o2::gpu::CAMath::Sqrt(0.5f * (mTrkParams[iteration].SystErrorZ2[iLayer] + mTrkParams[iteration].SystErrorZ2[iLayer + 1] + mTrkParams[iteration].SystErrorZ2[iLayer + 2] + mTrkParams[iteration].SystErrorY2[iLayer] + mTrkParams[iteration].SystErrorY2[iLayer + 1] + mTrkParams[iteration].SystErrorY2[iLayer + 2])) / mTrkParams[iteration].LayerResolution[iLayer]};
299-
resolution = resolution > 1.e-12 ? resolution : 1.f;
300-
#endif
301-
302-
// count number of cells found
303-
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrame->getTracklets()[iLayer].size())};
304-
bounded_vector<int> perTrackletCount(currentLayerTrackletsNum + 1, 0, mMemoryPool.get());
305-
tbb::parallel_for(
306-
tbb::blocked_range<int>(0, currentLayerTrackletsNum),
307-
[&](const tbb::blocked_range<int>& Tracklets) {
308-
for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) {
309-
const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]};
310-
const int nextLayerClusterIndex{currentTracklet.secondClusterIndex};
311-
const int nextLayerFirstTrackletIndex{
312-
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]};
313-
const int nextLayerLastTrackletIndex{
314-
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]};
315-
316-
if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) {
317-
continue;
318-
}
319-
320-
int foundCells{0};
321-
for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) {
322-
if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) {
323-
break;
324-
}
325-
const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]};
326-
const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)};
286+
auto forTrackletCells = [&](auto Tag, int iLayer, bounded_vector<CellSeed>& layerCells, int iTracklet, int offset = 0) -> int {
287+
const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]};
288+
const int nextLayerClusterIndex{currentTracklet.secondClusterIndex};
289+
const int nextLayerFirstTrackletIndex{
290+
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]};
291+
const int nextLayerLastTrackletIndex{
292+
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]};
293+
294+
int foundCells{0};
295+
for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) {
296+
if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) {
297+
break;
298+
}
299+
const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]};
300+
const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)};
327301

328302
#ifdef OPTIMISATION_OUTPUT
329-
bool good{mTimeFrame->getTrackletsLabel(iLayer)[iTracklet] == mTimeFrame->getTrackletsLabel(iLayer + 1)[iNextTracklet]};
330-
float signedDelta{currentTracklet.tanLambda - nextTracklet.tanLambda};
331-
off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl;
303+
float resolution{o2::gpu::CAMath::Sqrt(0.5f * (mTrkParams[iteration].SystErrorZ2[iLayer] + mTrkParams[iteration].SystErrorZ2[iLayer + 1] + mTrkParams[iteration].SystErrorZ2[iLayer + 2] + mTrkParams[iteration].SystErrorY2[iLayer] + mTrkParams[iteration].SystErrorY2[iLayer + 1] + mTrkParams[iteration].SystErrorY2[iLayer + 2])) / mTrkParams[iteration].LayerResolution[iLayer]};
304+
resolution = resolution > 1.e-12 ? resolution : 1.f;
305+
bool good{mTimeFrame->getTrackletsLabel(iLayer)[iTracklet] == mTimeFrame->getTrackletsLabel(iLayer + 1)[iNextTracklet]};
306+
float signedDelta{currentTracklet.tanLambda - nextTracklet.tanLambda};
307+
off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl;
332308
#endif
333309

334-
if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) {
310+
if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) {
335311

336-
/// Track seed preparation. Clusters are numbered progressively from the innermost going outward.
337-
const int clusId[3]{
338-
mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId,
339-
mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId,
340-
mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId};
341-
const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]];
342-
const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]];
343-
const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]];
344-
auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)};
312+
/// Track seed preparation. Clusters are numbered progressively from the innermost going outward.
313+
const int clusId[3]{
314+
mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId,
315+
mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId,
316+
mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId};
317+
const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]];
318+
const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]];
319+
const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]];
320+
auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)};
345321

346-
float chi2{0.f};
347-
bool good{false};
348-
for (int iC{2}; iC--;) {
349-
const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]];
322+
float chi2{0.f};
323+
bool good{false};
324+
for (int iC{2}; iC--;) {
325+
const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]];
350326

351-
if (!track.rotate(trackingHit.alphaTrackingFrame)) {
352-
break;
353-
}
327+
if (!track.rotate(trackingHit.alphaTrackingFrame)) {
328+
break;
329+
}
354330

355-
if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) {
356-
break;
357-
}
331+
if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) {
332+
break;
333+
}
358334

359-
if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) {
360-
break;
361-
}
335+
if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) {
336+
break;
337+
}
362338

363-
const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)};
364-
if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) {
365-
break;
366-
}
339+
const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)};
340+
if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) {
341+
break;
342+
}
367343

368-
if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) {
369-
break;
370-
}
344+
if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) {
345+
break;
346+
}
371347

372-
good = !iC;
373-
chi2 += predChi2;
374-
}
375-
if (good) {
376-
++foundCells;
377-
}
378-
}
379-
}
380-
perTrackletCount[iTracklet] = foundCells;
381-
}
382-
});
348+
good = !iC;
349+
chi2 += predChi2;
350+
}
351+
if (good) {
352+
if constexpr (decltype(Tag)::value == PassMode::OnePass::value) {
353+
layerCells.emplace_back(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2);
354+
++foundCells;
355+
} else if constexpr (decltype(Tag)::value == PassMode::TwoPassCount::value) {
356+
++foundCells;
357+
} else if constexpr (decltype(Tag)::value == PassMode::TwoPassInsert::value) {
358+
layerCells[offset++] = CellSeed(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2);
359+
} else {
360+
static_assert(false, "Unknown mode!");
361+
}
362+
}
363+
}
364+
}
365+
return foundCells;
366+
};
383367

384-
// calculate offset table and check if any cells where found
385-
std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0);
386-
auto totalCells{perTrackletCount.back()};
387-
if (totalCells == 0) {
368+
tbb::parallel_for(
369+
tbb::blocked_range<int>(0, mTrkParams[iteration].CellsPerRoad()),
370+
[&](const tbb::blocked_range<int>& Layers) {
371+
for (int iLayer = Layers.begin(); iLayer < Layers.end(); ++iLayer) {
372+
if (mTimeFrame->getTracklets()[iLayer + 1].empty() ||
373+
mTimeFrame->getTracklets()[iLayer].empty()) {
388374
continue;
389375
}
390-
auto& layerCells = mTimeFrame->getCells()[iLayer];
391-
layerCells.resize(totalCells);
392376

393-
tbb::parallel_for(
394-
tbb::blocked_range<int>(0, currentLayerTrackletsNum),
395-
[&](const tbb::blocked_range<int>& Tracklets) {
396-
for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) {
397-
if (perTrackletCount[iTracklet] == perTrackletCount[iTracklet + 1]) {
398-
continue;
377+
auto& layerCells = mTimeFrame->getCells()[iLayer];
378+
const int currentLayerTrackletsNum{static_cast<int>(mTimeFrame->getTracklets()[iLayer].size())};
379+
bounded_vector<int> perTrackletCount(currentLayerTrackletsNum + 1, 0, mMemoryPool.get());
380+
if (mTaskArena->max_concurrency() <= 1) {
381+
for (int iTracklet{0}; iTracklet < currentLayerTrackletsNum; ++iTracklet) {
382+
perTrackletCount[iTracklet] = forTrackletCells(PassMode::OnePass{}, iLayer, layerCells, iTracklet);
383+
}
384+
std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0);
385+
} else {
386+
tbb::parallel_for(
387+
tbb::blocked_range<int>(0, currentLayerTrackletsNum),
388+
[&](const tbb::blocked_range<int>& Tracklets) {
389+
for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) {
390+
perTrackletCount[iTracklet] = forTrackletCells(PassMode::TwoPassCount{}, iLayer, layerCells, iTracklet);
399391
}
392+
});
400393

401-
const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]};
402-
const int nextLayerClusterIndex{currentTracklet.secondClusterIndex};
403-
const int nextLayerFirstTrackletIndex{
404-
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]};
405-
const int nextLayerLastTrackletIndex{
406-
mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]};
407-
408-
int position = perTrackletCount[iTracklet];
409-
for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) {
410-
if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) {
411-
break;
412-
}
413-
const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]};
414-
const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)};
415-
416-
if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) {
417-
418-
/// Track seed preparation. Clusters are numbered progressively from the innermost going outward.
419-
const int clusId[3]{
420-
mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId,
421-
mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId,
422-
mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId};
423-
const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]];
424-
const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]];
425-
const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]];
426-
auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)};
427-
428-
float chi2{0.f};
429-
bool good{false};
430-
for (int iC{2}; iC--;) {
431-
const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]];
432-
433-
if (!track.rotate(trackingHit.alphaTrackingFrame)) {
434-
break;
435-
}
436-
437-
if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) {
438-
break;
439-
}
440-
441-
if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) {
442-
break;
443-
}
444-
445-
const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)};
446-
if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) {
447-
break;
448-
}
449-
450-
if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) {
451-
break;
452-
}
453-
454-
good = !iC;
455-
chi2 += predChi2;
456-
}
457-
if (good) {
458-
layerCells[position++] = CellSeed(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2);
459-
}
394+
std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0);
395+
auto totalCells{perTrackletCount.back()};
396+
if (totalCells == 0) {
397+
continue;
398+
}
399+
layerCells.resize(totalCells);
400+
401+
tbb::parallel_for(
402+
tbb::blocked_range<int>(0, currentLayerTrackletsNum),
403+
[&](const tbb::blocked_range<int>& Tracklets) {
404+
for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) {
405+
int offset = perTrackletCount[iTracklet];
406+
if (offset == perTrackletCount[iTracklet + 1]) {
407+
continue;
460408
}
409+
forTrackletCells(PassMode::TwoPassInsert{}, iLayer, layerCells, iTracklet, offset);
461410
}
462-
}
463-
});
411+
});
412+
}
464413

465414
if (iLayer > 0) {
466415
auto& lut = mTimeFrame->getCellsLookupTable()[iLayer - 1];

0 commit comments

Comments
 (0)