 using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
+#include "GPUTPCCFClusterizer.inc"
+
 template <>
 GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
 {
@@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
 
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
-}
-
-GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
-                                                       processorType& clusterer,
-                                                       const CfFragment& fragment,
-                                                       GPUSharedMemory& smem,
-                                                       const Array2D<PackedCharge>& chargeMap,
-                                                       const ChargePos* filteredPeakPositions,
-                                                       const GPUSettingsRec& calib,
-                                                       MCLabelAccumulator* labelAcc,
-                                                       uint32_t clusternum,
-                                                       uint32_t maxClusterPerRow,
-                                                       uint32_t* clusterInRow,
-                                                       tpc::ClusterNative* clusterByRow,
-                                                       uint32_t* clusterPosInRow)
-{
-  uint32_t idx = get_global_id(0);
-
-  // For certain configurations dummy work items are added, so the total
-  // number of work items is divisible by 64.
-  // These dummy items also compute the last cluster but discard the result.
-  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
-  Charge charge = chargeMap[pos].unpack();
-
-  ClusterAccumulator pc;
-  CPU_ONLY(labelAcc->collect(pos, charge));
-
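-  // Dummy work items are not filtered out before this call: buildCluster
-  // performs collective shared-memory loads and barriers, so every work
-  // item in the group has to take part.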
-  buildCluster(
-    calib,
-    chargeMap,
-    pos,
-    smem.posBcast,
-    smem.buf,
-    smem.innerAboveThreshold,
-    &pc,
-    labelAcc);
-
-  if (idx >= clusternum) {
-    return;
-  }
-  if (fragment.isOverlap(pos.time())) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
-
-  tpc::ClusterNative myCluster;
-  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
-
-  if (rejectCluster) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-
-  uint32_t rowIndex = 0;
-  if (clusterByRow != nullptr) {
-    rowIndex = sortIntoBuckets(
-      clusterer,
-      myCluster,
-      pos.row(),
-      maxClusterPerRow,
-      clusterInRow,
-      clusterByRow);
-    if (clusterPosInRow != nullptr) {
-      clusterPosInRow[idx] = rowIndex;
-    }
-  } else if (clusterPosInRow) {
-    rowIndex = clusterPosInRow[idx];
-  }
-
-  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
-  const GPUSettingsRec& calib,
-  uint16_t lid,
-  uint16_t N,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc,
-  uint8_t* innerAboveThreshold)
-{
-  uint8_t aboveThreshold = 0;
-
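-  // Accumulate the N inner neighbors and build a per-lane bitmask that
-  // flags which of them are above the charge threshold; the mask later
-  // decides which outer neighbors get loaded at all.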
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = 0; i < N; i++) {
-    Delta2 d = cfconsts::InnerNeighbors[i];
-
-    PackedCharge p = buf[N * lid + i];
-
-    Charge q = cluster->updateInner(p, d);
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-
-    aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
-  }
-
-  innerAboveThreshold[lid] = aboveThreshold;
-
-  GPUbarrier();
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
-  uint16_t lid,
-  uint16_t N,
-  uint16_t M,
-  uint16_t offset,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc)
-{
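-  // Accumulate M outer-neighbor charges for this lane, starting at
-  // 'offset' within its N-entry stripe of the shared buffer.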
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = offset; i < M + offset; i++) {
-    PackedCharge p = buf[N * lid + i];
-
-    Delta2 d = cfconsts::OuterNeighbors[i];
-
-    Charge q = cluster->updateOuter(p, d);
-    static_cast<void>(q); // Avoid unused variable warning on GPU.
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-  }
-}
-
-GPUdii() void GPUTPCCFClusterizer::buildCluster(
-  const GPUSettingsRec& calib,
-  const Array2D<PackedCharge>& chargeMap,
-  ChargePos pos,
-  ChargePos* posBcast,
-  PackedCharge* buf,
-  uint8_t* innerAboveThreshold,
-  ClusterAccumulator* myCluster,
-  MCLabelAccumulator* labelAcc)
-{
-  uint16_t ll = get_local_id(0);
-
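-  // Broadcast each lane's peak position to shared memory so the whole
-  // work group can load neighbor charges cooperatively.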
-  posBcast[ll] = pos;
-  GPUbarrier();
-
-  CfUtils::blockLoad<PackedCharge>(
-    chargeMap,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    8,
-    cfconsts::InnerNeighbors,
-    posBcast,
-    buf);
-  updateClusterInner(
-    calib,
-    ll,
-    8,
-    buf,
-    pos,
-    myCluster,
-    labelAcc,
-    innerAboveThreshold);
-
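-  // The scratch buffer only fits the 16 outer-neighbor charges of half a
-  // work group (wgSizeHalf * 16 == SCRATCH_PAD_WORK_GROUP_SIZE * 8), so
-  // the two halves are processed one after the other.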
-  uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
-
-  bool inGroup1 = ll < wgSizeHalf;
-
-  uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
-
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast,
-    innerAboveThreshold,
-    buf);
-
-  if (inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-
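-  // On the GPU, load and process the outer neighbors of the second
-  // half-group as well.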
-#if defined(GPUCA_GPUCODE)
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast + wgSizeHalf,
-    innerAboveThreshold + wgSizeHalf,
-    buf);
-  if (!inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-#endif
-}
-
-GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
-{
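-  // Atomically reserve the next free slot in this row's bucket; on
-  // overflow the cluster is dropped, an error is raised, and the counter
-  // is clamped back to the bucket capacity.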
-  uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
-  if (index < maxElemsPerBucket) {
-    buckets[maxElemsPerBucket * row + index] = cluster;
-  } else {
-    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket);
-    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
-  }
-  return index;
-}
+}
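
For reference, the per-row bucketing that sortIntoBuckets implements maps onto a standard lock-free pattern: atomically reserve a slot, write if in bounds, clamp on overflow. Below is a minimal host-side C++ sketch of that pattern; it is an illustration only, not the O2 API, and the names in it (insertIntoBucket, Cluster, maxPerRow) are made up for the example.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Cluster {
  float qTot; // illustrative payload
};

// Reserve a slot in the fixed-capacity bucket of 'row' and store the
// cluster there; on overflow, report and clamp the counter so it never
// drifts far past the capacity. Mirrors the structure of sortIntoBuckets.
uint32_t insertIntoBucket(const Cluster& c, uint32_t row, uint32_t maxPerRow,
                          std::vector<std::atomic<uint32_t>>& nInRow,
                          std::vector<Cluster>& buckets)
{
  uint32_t index = nInRow[row].fetch_add(1u); // atomic slot reservation
  if (index < maxPerRow) {
    buckets[maxPerRow * row + index] = c;
  } else {
    std::fprintf(stderr, "row %u overflowed (capacity %u)\n", row, maxPerRow);
    nInRow[row].store(maxPerRow); // clamp, like CAMath::AtomicExch above
  }
  return index;
}

int main()
{
  constexpr uint32_t nRows = 4, maxPerRow = 8;
  std::vector<std::atomic<uint32_t>> nInRow(nRows); // zero-initialized counters
  std::vector<Cluster> buckets(nRows * maxPerRow);
  insertIntoBucket(Cluster{42.f}, 2, maxPerRow, nInRow, buckets);
  std::printf("row 2 now holds %u cluster(s)\n", nInRow[2].load());
}

Clamping the counter keeps later reads of it meaningful even after an overflow, at the cost of losing the exact overflow count; the kernel makes the same trade with CAMath::AtomicExch.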