Skip to content

Commit 3098510

Browse files
committed
GPU: Deduplicate sort comparisons: Use structs, since both hipcub and rocthrust do not work with lambdas for some reason
1 parent bbca3bb commit 3098510

File tree

3 files changed

+98
-147
lines changed

3 files changed

+98
-147
lines changed

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc

Lines changed: 0 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -14,88 +14,6 @@
1414

1515
#if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS)
1616

17-
namespace o2::gpu::internal
18-
{
19-
namespace // anonymous
20-
{
21-
struct MergeBorderTracks_compMax {
22-
GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
23-
{
24-
return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax);
25-
}
26-
};
27-
struct MergeBorderTracks_compMin {
28-
GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
29-
{
30-
return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin);
31-
}
32-
};
33-
34-
struct GPUTPCGMMergerSortTracks_comp {
35-
const GPUTPCGMMergedTrack* const mCmp;
36-
GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
37-
GPUd() bool operator()(const int32_t aa, const int32_t bb)
38-
{
39-
const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
40-
const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
41-
if (a.CCE() != b.CCE()) {
42-
return a.CCE() > b.CCE();
43-
}
44-
if (a.Legs() != b.Legs()) {
45-
return a.Legs() > b.Legs();
46-
}
47-
GPUCA_DETERMINISTIC_CODE( // clang-format off
48-
if (a.NClusters() != b.NClusters()) {
49-
return a.NClusters() > b.NClusters();
50-
} if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
51-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
52-
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
53-
return a.GetParam().GetY() > b.GetParam().GetY();
54-
}
55-
return aa > bb;
56-
, // !GPUCA_DETERMINISTIC_CODE
57-
return a.NClusters() > b.NClusters();
58-
) // clang-format on
59-
}
60-
};
61-
62-
struct GPUTPCGMMergerSortTracksQPt_comp {
63-
const GPUTPCGMMergedTrack* const mCmp;
64-
GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
65-
GPUd() bool operator()(const int32_t aa, const int32_t bb)
66-
{
67-
const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
68-
const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
69-
GPUCA_DETERMINISTIC_CODE( // clang-format off
70-
if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
71-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
72-
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
73-
return a.GetParam().GetY() > b.GetParam().GetY();
74-
}
75-
return a.GetParam().GetZ() > b.GetParam().GetZ();
76-
, // !GPUCA_DETERMINISTIC_CODE
77-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
78-
) // clang-format on
79-
}
80-
};
81-
82-
struct GPUTPCGMMergerMergeLoopers_comp {
83-
GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b)
84-
{
85-
return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz));
86-
}
87-
};
88-
89-
struct GPUTPCGMO2OutputSort_comp {
90-
GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b)
91-
{
92-
return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y);
93-
}
94-
};
95-
96-
} // anonymous namespace
97-
} // namespace o2::gpu::internal
98-
9917
template <>
10018
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
10119
{

GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx

Lines changed: 83 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,13 @@
5959
#include "SimulationDataFormat/MCCompLabel.h"
6060
#endif
6161

62-
namespace o2::gpu::internal
63-
{
64-
}
62+
static constexpr int32_t kMaxParts = 400;
63+
static constexpr int32_t kMaxClusters = GPUCA_MERGER_MAX_TRACK_CLUSTERS;
64+
6565
using namespace o2::gpu;
66-
using namespace o2::gpu::internal;
6766
using namespace o2::tpc;
6867
using namespace gputpcgmmergertypes;
6968

70-
static constexpr int32_t kMaxParts = 400;
71-
static constexpr int32_t kMaxClusters = GPUCA_MERGER_MAX_TRACK_CLUSTERS;
72-
7369
namespace o2::gpu::internal
7470
{
7571
struct MergeLooperParam {
@@ -78,8 +74,79 @@ struct MergeLooperParam {
7874
float y;
7975
uint32_t id;
8076
};
77+
78+
// Comparator for GPUTPCGMBorderRange entries that orders by fMax using operator<.
// MergeBorderTracks<3> passes it to GPUCommonAlgorithm::sortDeviceDynamic when cmpMax is non-zero.
struct MergeBorderTracks_compMax {
79+
GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
80+
{
81+
// First macro argument: compare fMax and fall back to fId when both fMax values are equal.
// Second macro argument: compare fMax only.
// NOTE(review): presumably GPUCA_DETERMINISTIC_CODE selects the first argument in deterministic builds and the second otherwise - confirm against the macro definition.
return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax);
82+
}
83+
};
84+
// Comparator for GPUTPCGMBorderRange entries that orders by fMin using operator<.
// MergeBorderTracks<3> passes it to GPUCommonAlgorithm::sortDeviceDynamic when cmpMax is zero.
struct MergeBorderTracks_compMin {
85+
GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b)
86+
{
87+
// First macro argument: compare fMin and fall back to fId when both fMin values are equal.
// Second macro argument: compare fMin only.
// NOTE(review): presumably GPUCA_DETERMINISTIC_CODE selects the first argument in deterministic builds and the second otherwise - confirm against the macro definition.
return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin);
88+
}
89+
};
90+
91+
// Index comparator: compares two int32_t indices through the GPUTPCGMMergedTrack entries they select in mCmp.
// GPUTPCGMMerger::SortTracks constructs it from mOutputTracks and uses it to sort mTrackOrderProcess.
// Every criterion uses operator>, so an entry with the larger value compares as "before" the other
// (i.e. descending order, assuming sortDeviceDynamic treats the predicate like std::sort does - TODO confirm).
struct GPUTPCGMMergerSortTracks_comp {
92+
// Track array the compared indices refer to; the comparator only reads from it.
const GPUTPCGMMergedTrack* const mCmp;
93+
GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
94+
GPUd() bool operator()(const int32_t aa, const int32_t bb)
95+
{
96+
const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
97+
const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
98+
// Primary key: CCE(); secondary key: Legs(). Both are applied regardless of the macro branch below.
if (a.CCE() != b.CCE()) {
99+
return a.CCE() > b.CCE();
100+
}
101+
if (a.Legs() != b.Legs()) {
102+
return a.Legs() > b.Legs();
103+
}
104+
// First macro argument: NClusters(), then |QPt|, then Y, and finally the indices themselves (aa > bb) as the last tie-breaker.
// Second macro argument (labelled !GPUCA_DETERMINISTIC_CODE below): NClusters() only.
GPUCA_DETERMINISTIC_CODE( // clang-format off
105+
if (a.NClusters() != b.NClusters()) {
106+
return a.NClusters() > b.NClusters();
107+
} if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
108+
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
109+
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
110+
return a.GetParam().GetY() > b.GetParam().GetY();
111+
}
112+
return aa > bb;
113+
, // !GPUCA_DETERMINISTIC_CODE
114+
return a.NClusters() > b.NClusters();
115+
) // clang-format on
116+
}
117+
};
118+
119+
// Index comparator: compares two int32_t indices through the GPUTPCGMMergedTrack entries they select in mCmp,
// keyed on the absolute value of GetParam().GetQPt().
// GPUTPCGMMerger::SortTracksQPt constructs it from mOutputTracks and uses it to sort mTrackSort.
// Every criterion uses operator>, so an entry with the larger value compares as "before" the other
// (i.e. descending order, assuming sortDeviceDynamic treats the predicate like std::sort does - TODO confirm).
struct GPUTPCGMMergerSortTracksQPt_comp {
120+
// Track array the compared indices refer to; the comparator only reads from it.
const GPUTPCGMMergedTrack* const mCmp;
121+
GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {}
122+
GPUd() bool operator()(const int32_t aa, const int32_t bb)
123+
{
124+
const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa];
125+
const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb];
126+
// First macro argument: |QPt|, then Y, then Z (no index-based tie-breaker in this comparator).
// Second macro argument (labelled !GPUCA_DETERMINISTIC_CODE below): |QPt| only.
GPUCA_DETERMINISTIC_CODE( // clang-format off
127+
if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
128+
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
129+
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
130+
return a.GetParam().GetY() > b.GetParam().GetY();
131+
}
132+
return a.GetParam().GetZ() > b.GetParam().GetZ();
133+
, // !GPUCA_DETERMINISTIC_CODE
134+
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
135+
) // clang-format on
136+
}
137+
};
138+
139+
// Comparator for MergeLooperParam entries that orders by the absolute value of refz using operator<.
// GPUTPCGMMerger::MergeLoopersSort passes it to GPUCommonAlgorithm::sortDeviceDynamic to sort mLooperCandidates.
struct GPUTPCGMMergerMergeLoopers_comp {
140+
GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b)
141+
{
142+
// First macro argument: compare |refz| and fall back to id when both |refz| values are equal.
// Second macro argument: compare |refz| only.
// NOTE(review): presumably GPUCA_DETERMINISTIC_CODE selects the first argument in deterministic builds and the second otherwise - confirm against the macro definition.
return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz));
143+
}
144+
};
145+
81146
} // namespace o2::gpu::internal
82147

148+
using namespace o2::gpu::internal;
149+
83150
#ifndef GPUCA_GPUCODE
84151

85152
#include "GPUQA.h"
@@ -742,11 +809,11 @@ template <>
742809
GPUd() void GPUTPCGMMerger::MergeBorderTracks<3>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUTPCGMBorderRange* range, int32_t N, int32_t cmpMax)
743810
{
744811
#ifndef GPUCA_SPECIALIZE_THRUST_SORTS
745-
if (iThread == 0) {
812+
if (iThread == 0 && iBlock == 0) {
746813
if (cmpMax) {
747-
GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); });
814+
GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, MergeBorderTracks_compMax());
748815
} else {
749-
GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); });
816+
GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, MergeBorderTracks_compMin());
750817
}
751818
}
752819
#endif
@@ -1757,60 +1824,18 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit0(int32_t nBlocks, int32_t nThr
17571824
// Sorts the index range [mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks) with
// GPUCommonAlgorithm::sortDeviceDynamic. Only the invocation with iThread == 0 and iBlock == 0 performs the sort,
// and the whole body is compiled out when GPUCA_SPECIALIZE_THRUST_SORTS is defined.
// nBlocks and nThreads are not used in the visible body.
// Diff note: the '+' lines pass GPUTPCGMMergerSortTracks_comp(mOutputTracks) as comparator; the '-' lines are the
// previous early-return guard and inline lambda, which applied the same sequence of comparisons.
GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread)
17581825
{
17591826
#ifndef GPUCA_SPECIALIZE_THRUST_SORTS
1760-
if (iThread || iBlock) {
1761-
return;
1827+
if (iThread == 0 && iBlock == 0) {
1828+
GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, GPUTPCGMMergerSortTracks_comp(mOutputTracks));
17621829
}
1763-
// TODO: Fix this: Have to duplicate sort comparison: Thrust cannot use the Lambda but OpenCL cannot use the object
1764-
auto comp = [cmp = mOutputTracks](const int32_t aa, const int32_t bb) {
1765-
const GPUTPCGMMergedTrack& GPUrestrict() a = cmp[aa];
1766-
const GPUTPCGMMergedTrack& GPUrestrict() b = cmp[bb];
1767-
if (a.CCE() != b.CCE()) {
1768-
return a.CCE() > b.CCE();
1769-
}
1770-
if (a.Legs() != b.Legs()) {
1771-
return a.Legs() > b.Legs();
1772-
}
1773-
GPUCA_DETERMINISTIC_CODE( // clang-format off
1774-
if (a.NClusters() != b.NClusters()) {
1775-
return a.NClusters() > b.NClusters();
1776-
} if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
1777-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
1778-
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
1779-
return a.GetParam().GetY() > b.GetParam().GetY();
1780-
}
1781-
return aa > bb;
1782-
, // !GPUCA_DETERMINISTIC_CODE
1783-
return a.NClusters() > b.NClusters();
1784-
) // clang-format on
1785-
};
1786-
1787-
GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, comp);
17881830
#endif
17891831
}
17901832

17911833
// Sorts the index range [mTrackSort, mTrackSort + mMemory->nMergedTracks) with
// GPUCommonAlgorithm::sortDeviceDynamic. Only the invocation with iThread == 0 and iBlock == 0 performs the sort,
// and the whole body is compiled out when GPUCA_SPECIALIZE_THRUST_SORTS is defined.
// nBlocks and nThreads are not used in the visible body.
// Diff note: the '+' lines pass GPUTPCGMMergerSortTracksQPt_comp(mOutputTracks) as comparator; the '-' lines are the
// previous early-return guard and inline lambda, which applied the same sequence of comparisons.
GPUd() void GPUTPCGMMerger::SortTracksQPt(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread)
17921834
{
17931835
#ifndef GPUCA_SPECIALIZE_THRUST_SORTS
1794-
if (iThread || iBlock) {
1795-
return;
1836+
if (iThread == 0 && iBlock == 0) {
1837+
GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, GPUTPCGMMergerSortTracksQPt_comp(mOutputTracks));
17961838
}
1797-
// TODO: Fix this: Have to duplicate sort comparison: Thrust cannot use the Lambda but OpenCL cannot use the object
1798-
auto comp = [cmp = mOutputTracks](const int32_t aa, const int32_t bb) {
1799-
const GPUTPCGMMergedTrack& GPUrestrict() a = cmp[aa];
1800-
const GPUTPCGMMergedTrack& GPUrestrict() b = cmp[bb];
1801-
GPUCA_DETERMINISTIC_CODE( // clang-format off
1802-
if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) {
1803-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
1804-
} if (a.GetParam().GetY() != b.GetParam().GetY()) {
1805-
return a.GetParam().GetY() > b.GetParam().GetY();
1806-
}
1807-
return a.GetParam().GetZ() > b.GetParam().GetZ();
1808-
, // !GPUCA_DETERMINISTIC_CODE
1809-
return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt());
1810-
) // clang-format on
1811-
};
1812-
1813-
GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, comp);
18141839
#endif
18151840
}
18161841

@@ -1945,11 +1970,9 @@ GPUd() void GPUTPCGMMerger::MergeLoopersInit(int32_t nBlocks, int32_t nThreads,
19451970
// Sorts the range [mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates) with
// GPUCommonAlgorithm::sortDeviceDynamic. Only the invocation with iThread == 0 and iBlock == 0 performs the sort,
// and the whole body is compiled out when GPUCA_SPECIALIZE_THRUST_SORTS is defined.
// nBlocks and nThreads are not used in the visible body.
// Diff note: the '+' lines pass GPUTPCGMMergerMergeLoopers_comp() as comparator; the '-' lines are the previous
// early-return guard and inline lambda, which evaluated the same expression.
GPUd() void GPUTPCGMMerger::MergeLoopersSort(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread)
19461971
{
19471972
#ifndef GPUCA_SPECIALIZE_THRUST_SORTS
1948-
if (iThread || iBlock) {
1949-
return;
1973+
if (iThread == 0 && iBlock == 0) {
1974+
GPUCommonAlgorithm::sortDeviceDynamic(mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
19501975
}
1951-
auto comp = [](const MergeLooperParam& a, const MergeLooperParam& b) { return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); };
1952-
GPUCommonAlgorithm::sortDeviceDynamic(mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates, comp);
19531976
#endif
19541977
}
19551978

GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,18 @@ using namespace o2::tpc::constants;
3434
// Returns the bitwise OR of GPUTPCGMMergedTrackHit::flagReject and GPUTPCGMMergedTrackHit::flagNotFit as a uint8_t.
GPUdi() static constexpr uint8_t getFlagsReject() { return GPUTPCGMMergedTrackHit::flagReject | GPUTPCGMMergedTrackHit::flagNotFit; }
3535
// Returns gputpcgmmergertypes::attachGoodLeg when rec.tpc.dropSecondaryLegsInOutput is set,
// and gputpcgmmergertypes::attachZero otherwise.
GPUdi() static uint32_t getFlagsRequired(const GPUSettingsRec& rec) { return rec.tpc.dropSecondaryLegsInOutput ? gputpcgmmergertypes::attachGoodLeg : gputpcgmmergertypes::attachZero; }
3636

37+
namespace o2::gpu::internal
38+
{
39+
40+
// Comparator for GPUTPCGMMerger::tmpSort entries that orders by member y using operator>.
// GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::sort> passes it to GPUCommonAlgorithm::sortDeviceDynamic.
struct GPUTPCGMO2OutputSort_comp {
41+
GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b)
42+
{
43+
// First macro argument: compare y and fall back to x (also with operator>) when both y values are equal.
// Second macro argument: compare y only.
// NOTE(review): presumably GPUCA_DETERMINISTIC_CODE selects the first argument in deterministic builds and the second otherwise - confirm against the macro definition.
return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y);
44+
}
45+
};
46+
47+
} // namespace o2::gpu::internal
48+
3749
template <>
3850
GPUdii() void GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::prepare>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger)
3951
{
@@ -84,12 +96,10 @@ template <>
8496
// Sorts the range [merger.TrackSortO2(), merger.TrackSortO2() + merger.Memory()->nO2Tracks) with
// GPUCommonAlgorithm::sortDeviceDynamic. Only the invocation with iThread == 0 and iBlock == 0 performs the sort,
// and the whole body is compiled out when GPUCA_SPECIALIZE_THRUST_SORTS is defined.
// nBlocks, nThreads and smem are not used in the visible body.
// Diff note: the '+' lines pass internal::GPUTPCGMO2OutputSort_comp() as comparator; the '-' lines are the previous
// early-return guard and generic lambda, which evaluated the same expression.
GPUdii() void GPUTPCGMO2Output::Thread<GPUTPCGMO2Output::sort>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger)
8597
{
8698
#ifndef GPUCA_SPECIALIZE_THRUST_SORTS
87-
if (iThread || iBlock) {
88-
return;
99+
if (iThread == 0 && iBlock == 0) {
100+
GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2();
101+
GPUCommonAlgorithm::sortDeviceDynamic(trackSort, trackSort + merger.Memory()->nO2Tracks, internal::GPUTPCGMO2OutputSort_comp());
89102
}
90-
GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2();
91-
auto comp = [](const auto& a, const auto& b) { return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); };
92-
GPUCommonAlgorithm::sortDeviceDynamic(trackSort, trackSort + merger.Memory()->nO2Tracks, comp);
93103
#endif
}
95105

0 commit comments

Comments
 (0)