 using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
+#include "GPUTPCCFClusterizer.inc"
+
 template <>
 GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
 {
@@ -35,232 +37,3 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
 
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }
-
-GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
-                                                       processorType& clusterer,
-                                                       const CfFragment& fragment,
-                                                       GPUSharedMemory& smem,
-                                                       const Array2D<PackedCharge>& chargeMap,
-                                                       const ChargePos* filteredPeakPositions,
-                                                       const GPUSettingsRec& calib,
-                                                       MCLabelAccumulator* labelAcc,
-                                                       uint32_t clusternum,
-                                                       uint32_t maxClusterPerRow,
-                                                       uint32_t* clusterInRow,
-                                                       tpc::ClusterNative* clusterByRow,
-                                                       uint32_t* clusterPosInRow)
-{
-  uint32_t idx = get_global_id(0);
-
-  // For certain configurations dummy work items are added, so the total
-  // number of work items is divisible by 64.
-  // These dummy items also compute the last cluster but discard the result.
-  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
-  Charge charge = chargeMap[pos].unpack();
-
-  ClusterAccumulator pc;
-  CPU_ONLY(labelAcc->collect(pos, charge));
-
-  buildCluster(
-    calib,
-    chargeMap,
-    pos,
-    smem.posBcast,
-    smem.buf,
-    smem.innerAboveThreshold,
-    &pc,
-    labelAcc);
-
-  if (idx >= clusternum) {
-    return;
-  }
-  if (fragment.isOverlap(pos.time())) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-  tpc::ClusterNative myCluster;
-  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), fragment.start, chargeMap);
-
-  if (rejectCluster) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-
-  uint32_t rowIndex = 0;
-  if (clusterByRow != nullptr) {
-    rowIndex = sortIntoBuckets(
-      clusterer,
-      myCluster,
-      pos.row(),
-      maxClusterPerRow,
-      clusterInRow,
-      clusterByRow);
-    if (clusterPosInRow != nullptr) {
-      clusterPosInRow[idx] = rowIndex;
-    }
-  } else if (clusterPosInRow) {
-    rowIndex = clusterPosInRow[idx];
-  }
-
-  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
-  const GPUSettingsRec& calib,
-  uint16_t lid,
-  uint16_t N,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc,
-  uint8_t* innerAboveThreshold)
-{
-  uint8_t aboveThreshold = 0;
-
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = 0; i < N; i++) {
-    Delta2 d = cfconsts::InnerNeighbors[i];
-
-    PackedCharge p = buf[N * lid + i];
-
-    Charge q = cluster->updateInner(p, d);
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-
-    aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
-  }
-
-  innerAboveThreshold[lid] = aboveThreshold;
-
-  GPUbarrier();
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
-  uint16_t lid,
-  uint16_t N,
-  uint16_t M,
-  uint16_t offset,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc)
-{
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = offset; i < M + offset; i++) {
-    PackedCharge p = buf[N * lid + i];
-
-    Delta2 d = cfconsts::OuterNeighbors[i];
-
-    Charge q = cluster->updateOuter(p, d);
-    static_cast<void>(q); // Avoid unused variable warning on GPU.
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-  }
-}
-
-GPUdii() void GPUTPCCFClusterizer::buildCluster(
-  const GPUSettingsRec& calib,
-  const Array2D<PackedCharge>& chargeMap,
-  ChargePos pos,
-  ChargePos* posBcast,
-  PackedCharge* buf,
-  uint8_t* innerAboveThreshold,
-  ClusterAccumulator* myCluster,
-  MCLabelAccumulator* labelAcc)
-{
-  uint16_t ll = get_local_id(0);
-
-  posBcast[ll] = pos;
-  GPUbarrier();
-
-  CfUtils::blockLoad<PackedCharge>(
-    chargeMap,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    8,
-    cfconsts::InnerNeighbors,
-    posBcast,
-    buf);
-  updateClusterInner(
-    calib,
-    ll,
-    8,
-    buf,
-    pos,
-    myCluster,
-    labelAcc,
-    innerAboveThreshold);
-
-  uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
-
-  bool inGroup1 = ll < wgSizeHalf;
-
-  uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
-
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast,
-    innerAboveThreshold,
-    buf);
-
-  if (inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-
-#if defined(GPUCA_GPUCODE)
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast + wgSizeHalf,
-    innerAboveThreshold + wgSizeHalf,
-    buf);
-  if (!inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-#endif
-}
-
-GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
-{
-  uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
-  if (index < maxElemsPerBucket) {
-    buckets[maxElemsPerBucket * row + index] = cluster;
-  } else {
-    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket);
-    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
-  }
-  return index;
-}