Skip to content

Commit 33b5b36

Browse files
committed
GPU TPC CF: Split clusterizer CXX functions out into .inc file to be used externally
1 parent 5c6657a commit 33b5b36

File tree

3 files changed

+256
-236
lines changed

3 files changed

+256
-236
lines changed

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx

Lines changed: 3 additions & 232 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
using namespace o2::gpu;
2626
using namespace o2::gpu::tpccf;
2727

28+
#include "GPUTPCCFClusterizer.inc"
29+
2830
template <>
2931
GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
3032
{
@@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
3436
tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
3537

3638
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
37-
}
38-
39-
GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
40-
processorType& clusterer,
41-
const CfFragment& fragment,
42-
GPUSharedMemory& smem,
43-
const Array2D<PackedCharge>& chargeMap,
44-
const ChargePos* filteredPeakPositions,
45-
const GPUSettingsRec& calib,
46-
MCLabelAccumulator* labelAcc,
47-
uint32_t clusternum,
48-
uint32_t maxClusterPerRow,
49-
uint32_t* clusterInRow,
50-
tpc::ClusterNative* clusterByRow,
51-
uint32_t* clusterPosInRow)
52-
{
53-
uint32_t idx = get_global_id(0);
54-
55-
// For certain configurations dummy work items are added, so the total
56-
// number of work items is dividable by 64.
57-
// These dummy items also compute the last cluster but discard the result.
58-
ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
59-
Charge charge = chargeMap[pos].unpack();
60-
61-
ClusterAccumulator pc;
62-
CPU_ONLY(labelAcc->collect(pos, charge));
63-
64-
buildCluster(
65-
calib,
66-
chargeMap,
67-
pos,
68-
smem.posBcast,
69-
smem.buf,
70-
smem.innerAboveThreshold,
71-
&pc,
72-
labelAcc);
73-
74-
if (idx >= clusternum) {
75-
return;
76-
}
77-
if (fragment.isOverlap(pos.time())) {
78-
if (clusterPosInRow) {
79-
clusterPosInRow[idx] = maxClusterPerRow;
80-
}
81-
return;
82-
}
83-
pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
84-
85-
tpc::ClusterNative myCluster;
86-
bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
87-
88-
if (rejectCluster) {
89-
if (clusterPosInRow) {
90-
clusterPosInRow[idx] = maxClusterPerRow;
91-
}
92-
return;
93-
}
94-
95-
uint32_t rowIndex = 0;
96-
if (clusterByRow != nullptr) {
97-
rowIndex = sortIntoBuckets(
98-
clusterer,
99-
myCluster,
100-
pos.row(),
101-
maxClusterPerRow,
102-
clusterInRow,
103-
clusterByRow);
104-
if (clusterPosInRow != nullptr) {
105-
clusterPosInRow[idx] = rowIndex;
106-
}
107-
} else if (clusterPosInRow) {
108-
rowIndex = clusterPosInRow[idx];
109-
}
110-
111-
CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
112-
}
113-
114-
GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
115-
const GPUSettingsRec& calib,
116-
uint16_t lid,
117-
uint16_t N,
118-
const PackedCharge* buf,
119-
const ChargePos& pos,
120-
ClusterAccumulator* cluster,
121-
MCLabelAccumulator* labelAcc,
122-
uint8_t* innerAboveThreshold)
123-
{
124-
uint8_t aboveThreshold = 0;
125-
126-
GPUCA_UNROLL(U(), U())
127-
for (uint16_t i = 0; i < N; i++) {
128-
Delta2 d = cfconsts::InnerNeighbors[i];
129-
130-
PackedCharge p = buf[N * lid + i];
131-
132-
Charge q = cluster->updateInner(p, d);
133-
134-
CPU_ONLY(labelAcc->collect(pos.delta(d), q));
135-
136-
aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
137-
}
138-
139-
innerAboveThreshold[lid] = aboveThreshold;
140-
141-
GPUbarrier();
142-
}
143-
144-
GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
145-
uint16_t lid,
146-
uint16_t N,
147-
uint16_t M,
148-
uint16_t offset,
149-
const PackedCharge* buf,
150-
const ChargePos& pos,
151-
ClusterAccumulator* cluster,
152-
MCLabelAccumulator* labelAcc)
153-
{
154-
GPUCA_UNROLL(U(), U())
155-
for (uint16_t i = offset; i < M + offset; i++) {
156-
PackedCharge p = buf[N * lid + i];
157-
158-
Delta2 d = cfconsts::OuterNeighbors[i];
159-
160-
Charge q = cluster->updateOuter(p, d);
161-
static_cast<void>(q); // Avoid unused varible warning on GPU.
162-
163-
CPU_ONLY(labelAcc->collect(pos.delta(d), q));
164-
}
165-
}
166-
167-
GPUdii() void GPUTPCCFClusterizer::buildCluster(
168-
const GPUSettingsRec& calib,
169-
const Array2D<PackedCharge>& chargeMap,
170-
ChargePos pos,
171-
ChargePos* posBcast,
172-
PackedCharge* buf,
173-
uint8_t* innerAboveThreshold,
174-
ClusterAccumulator* myCluster,
175-
MCLabelAccumulator* labelAcc)
176-
{
177-
uint16_t ll = get_local_id(0);
178-
179-
posBcast[ll] = pos;
180-
GPUbarrier();
181-
182-
CfUtils::blockLoad<PackedCharge>(
183-
chargeMap,
184-
SCRATCH_PAD_WORK_GROUP_SIZE,
185-
SCRATCH_PAD_WORK_GROUP_SIZE,
186-
ll,
187-
0,
188-
8,
189-
cfconsts::InnerNeighbors,
190-
posBcast,
191-
buf);
192-
updateClusterInner(
193-
calib,
194-
ll,
195-
8,
196-
buf,
197-
pos,
198-
myCluster,
199-
labelAcc,
200-
innerAboveThreshold);
201-
202-
uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
203-
204-
bool inGroup1 = ll < wgSizeHalf;
205-
206-
uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
207-
208-
CfUtils::condBlockLoad(
209-
chargeMap,
210-
wgSizeHalf,
211-
SCRATCH_PAD_WORK_GROUP_SIZE,
212-
ll,
213-
0,
214-
16,
215-
cfconsts::OuterNeighbors,
216-
posBcast,
217-
innerAboveThreshold,
218-
buf);
219-
220-
if (inGroup1) {
221-
updateClusterOuter(
222-
llhalf,
223-
16,
224-
16,
225-
0,
226-
buf,
227-
pos,
228-
myCluster,
229-
labelAcc);
230-
}
231-
232-
#if defined(GPUCA_GPUCODE)
233-
CfUtils::condBlockLoad(
234-
chargeMap,
235-
wgSizeHalf,
236-
SCRATCH_PAD_WORK_GROUP_SIZE,
237-
ll,
238-
0,
239-
16,
240-
cfconsts::OuterNeighbors,
241-
posBcast + wgSizeHalf,
242-
innerAboveThreshold + wgSizeHalf,
243-
buf);
244-
if (!inGroup1) {
245-
updateClusterOuter(
246-
llhalf,
247-
16,
248-
16,
249-
0,
250-
buf,
251-
pos,
252-
myCluster,
253-
labelAcc);
254-
}
255-
#endif
256-
}
257-
258-
GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
259-
{
260-
uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
261-
if (index < maxElemsPerBucket) {
262-
buckets[maxElemsPerBucket * row + index] = cluster;
263-
} else {
264-
clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket);
265-
CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
266-
}
267-
return index;
268-
}
39+
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate
5959

6060
static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D<PackedCharge>&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);
6161

62+
static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);
63+
64+
static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);
65+
6266
private:
6367
static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*);
6468

6569
static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*);
66-
67-
static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);
68-
69-
static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);
7070
};
7171

7272
} // namespace o2::gpu

0 commit comments

Comments
 (0)