Skip to content

Commit c3d005f

Browse files
committed
GPU TPC CF: Split clusterizer CXX functions out into .inc file to be used externally
1 parent 5e8b7b5 commit c3d005f

File tree

3 files changed

+253
-233
lines changed

3 files changed

+253
-233
lines changed

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx

Lines changed: 2 additions & 229 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
using namespace o2::gpu;
2626
using namespace o2::gpu::tpccf;
2727

28+
#include "GPUTPCCFClusterizer.inc"
29+
2830
template <>
2931
GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
3032
{
@@ -35,232 +37,3 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
3537

3638
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
3739
}
38-
39-
// One work item per filtered peak: assembles the cluster around the peak,
// converts it to a ClusterNative and stores it into the per-row output bucket.
// NOTE(review): buildCluster() executes GPUbarrier()s, so EVERY work item
// (including out-of-range dummy items) must call it before any early return;
// do not move the `idx >= clusternum` check above the buildCluster() call.
GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
                                                       processorType& clusterer,
                                                       const CfFragment& fragment,
                                                       GPUSharedMemory& smem,
                                                       const Array2D<PackedCharge>& chargeMap,
                                                       const ChargePos* filteredPeakPositions,
                                                       const GPUSettingsRec& calib,
                                                       MCLabelAccumulator* labelAcc,
                                                       uint32_t clusternum,
                                                       uint32_t maxClusterPerRow,
                                                       uint32_t* clusterInRow,
                                                       tpc::ClusterNative* clusterByRow,
                                                       uint32_t* clusterPosInRow)
{
  uint32_t idx = get_global_id(0);

  // For certain configurations dummy work items are added, so the total
  // number of work items is divisible by 64.
  // These dummy items also compute the last cluster but discard the result.
  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
  Charge charge = chargeMap[pos].unpack();

  ClusterAccumulator pc;
  // MC-label bookkeeping is compiled only for the CPU build (CPU_ONLY macro).
  CPU_ONLY(labelAcc->collect(pos, charge));

  // Accumulate the 3x3 inner + outer neighborhood into pc (uses shared memory
  // and barriers — must be reached by all work items, see note above).
  buildCluster(
    calib,
    chargeMap,
    pos,
    smem.posBcast,
    smem.buf,
    smem.innerAboveThreshold,
    &pc,
    labelAcc);

  // Dummy work items stop here, after all barriers have been passed.
  if (idx >= clusternum) {
    return;
  }
  // Peaks inside the fragment overlap region are handled by the neighboring
  // fragment; mark the slot as invalid (maxClusterPerRow sentinel) and bail.
  if (fragment.isOverlap(pos.time())) {
    if (clusterPosInRow) {
      clusterPosInRow[idx] = maxClusterPerRow;
    }
    return;
  }
  tpc::ClusterNative myCluster;
  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), fragment.start, chargeMap);

  if (rejectCluster) {
    // Conversion failed quality cuts: record the sentinel value instead.
    if (clusterPosInRow) {
      clusterPosInRow[idx] = maxClusterPerRow;
    }
    return;
  }

  uint32_t rowIndex = 0;
  if (clusterByRow != nullptr) {
    // Normal path: atomically append the cluster to its row bucket.
    rowIndex = sortIntoBuckets(
      clusterer,
      myCluster,
      pos.row(),
      maxClusterPerRow,
      clusterInRow,
      clusterByRow);
    if (clusterPosInRow != nullptr) {
      clusterPosInRow[idx] = rowIndex;
    }
  } else if (clusterPosInRow) {
    // No output buffer: reuse a previously recorded position (second pass).
    rowIndex = clusterPosInRow[idx];
  }

  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
}
111-
112-
// Folds the N inner (directly adjacent) neighbor charges of one peak into the
// cluster accumulator and publishes, per work item, a bitmask telling which
// inner neighbors exceed the configured charge threshold. Ends with a barrier
// so the mask in shared memory is visible to the whole work group.
GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
  const GPUSettingsRec& calib,
  uint16_t lid,
  uint16_t N,
  const PackedCharge* buf,
  const ChargePos& pos,
  ClusterAccumulator* cluster,
  MCLabelAccumulator* labelAcc,
  uint8_t* innerAboveThreshold)
{
  uint8_t thresholdMask = 0;

  GPUCA_UNROLL(U(), U())
  for (uint16_t n = 0; n < N; n++) {
    Delta2 neighbor = cfconsts::InnerNeighbors[n];

    PackedCharge packed = buf[N * lid + n];

    Charge decoded = cluster->updateInner(packed, neighbor);

    // MC labels are tracked on the CPU build only.
    CPU_ONLY(labelAcc->collect(pos.delta(neighbor), decoded));

    // Bit n is set iff inner neighbor n is above the cluster-finder threshold.
    thresholdMask |= (uint8_t(decoded > calib.tpc.cfInnerThreshold) << n);
  }

  innerAboveThreshold[lid] = thresholdMask;

  GPUbarrier();
}
141-
142-
// Folds M outer-neighbor charges (starting at `offset` in the neighbor table)
// into the cluster accumulator. `buf` holds N prefetched charges per work
// item, indexed by local id `lid`.
GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
  uint16_t lid,
  uint16_t N,
  uint16_t M,
  uint16_t offset,
  const PackedCharge* buf,
  const ChargePos& pos,
  ClusterAccumulator* cluster,
  MCLabelAccumulator* labelAcc)
{
  GPUCA_UNROLL(U(), U())
  for (uint16_t n = offset; n < M + offset; n++) {
    PackedCharge packed = buf[N * lid + n];

    Delta2 neighbor = cfconsts::OuterNeighbors[n];

    Charge decoded = cluster->updateOuter(packed, neighbor);
    static_cast<void>(decoded); // Avoid unused variable warning on GPU.

    // MC labels are tracked on the CPU build only.
    CPU_ONLY(labelAcc->collect(pos.delta(neighbor), decoded));
  }
}
164-
165-
// Gathers the charges around one peak into `myCluster` using cooperative,
// work-group-wide loads through shared memory: first the 8 inner neighbors
// for all work items, then the 16 outer neighbors. Because the shared buffer
// `buf` cannot hold outer charges for the full work group at once, the outer
// pass is done in two halves (group 1 = first half of local ids, group 2 =
// second half). NOTE(review): contains GPUbarrier()s via blockLoad/
// updateClusterInner — every work item of the group must call this function.
GPUdii() void GPUTPCCFClusterizer::buildCluster(
  const GPUSettingsRec& calib,
  const Array2D<PackedCharge>& chargeMap,
  ChargePos pos,
  ChargePos* posBcast,
  PackedCharge* buf,
  uint8_t* innerAboveThreshold,
  ClusterAccumulator* myCluster,
  MCLabelAccumulator* labelAcc)
{
  uint16_t ll = get_local_id(0);

  // Broadcast every work item's peak position into shared memory so the
  // cooperative loads below can fetch neighbors for all peaks of the group.
  posBcast[ll] = pos;
  GPUbarrier();

  // Load the 8 inner neighbors for the whole work group into `buf`.
  CfUtils::blockLoad<PackedCharge>(
    chargeMap,
    SCRATCH_PAD_WORK_GROUP_SIZE,
    SCRATCH_PAD_WORK_GROUP_SIZE,
    ll,
    0,
    8,
    cfconsts::InnerNeighbors,
    posBcast,
    buf);
  // Accumulate inner charges; also fills innerAboveThreshold[ll] with the
  // per-peak threshold bitmask consumed by condBlockLoad below.
  updateClusterInner(
    calib,
    ll,
    8,
    buf,
    pos,
    myCluster,
    labelAcc,
    innerAboveThreshold);

  // Outer neighbors are processed in two halves of the work group.
  uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;

  bool inGroup1 = ll < wgSizeHalf;

  // Local id rebased into [0, wgSizeHalf) for indexing the half-sized buffer.
  uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);

  // Conditionally load the 16 outer neighbors for the first half of the peaks
  // (only where the corresponding inner neighbor was above threshold).
  CfUtils::condBlockLoad(
    chargeMap,
    wgSizeHalf,
    SCRATCH_PAD_WORK_GROUP_SIZE,
    ll,
    0,
    16,
    cfconsts::OuterNeighbors,
    posBcast,
    innerAboveThreshold,
    buf);

  if (inGroup1) {
    updateClusterOuter(
      llhalf,
      16,
      16,
      0,
      buf,
      pos,
      myCluster,
      labelAcc);
  }

// Second half: on GPU the shared buffer is reused for the remaining peaks.
// NOTE(review): the non-GPU build apparently handles the second half inside
// the first condBlockLoad/updateClusterOuter path — confirm against CfUtils.
#if defined(GPUCA_GPUCODE)
  CfUtils::condBlockLoad(
    chargeMap,
    wgSizeHalf,
    SCRATCH_PAD_WORK_GROUP_SIZE,
    ll,
    0,
    16,
    cfconsts::OuterNeighbors,
    posBcast + wgSizeHalf,
    innerAboveThreshold + wgSizeHalf,
    buf);
  if (!inGroup1) {
    updateClusterOuter(
      llhalf,
      16,
      16,
      0,
      buf,
      pos,
      myCluster,
      labelAcc);
  }
#endif
}
255-
256-
// Atomically appends `cluster` to the bucket of `row` and returns the slot
// index that was claimed. On overflow the cluster is dropped, an error is
// raised, and the row counter is clamped back to its capacity.
GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
{
  // Claim the next free slot of this row.
  uint32_t slot = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
  if (slot >= maxElemsPerBucket) {
    // Bucket full: report the overflow and clamp the counter so it does not
    // keep growing past the capacity.
    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, slot, maxElemsPerBucket);
    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
  } else {
    buckets[maxElemsPerBucket * row + slot] = cluster;
  }
  return slot;
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate
5959

6060
static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D<PackedCharge>&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);
6161

62+
static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);
63+
64+
static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);
65+
6266
private:
6367
static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*);
6468

6569
static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*);
66-
67-
static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);
68-
69-
static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);
7070
};
7171

7272
} // namespace o2::gpu

0 commit comments

Comments
 (0)