 using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
+#include "GPUTPCCFClusterizer.inc"
+
 template <>
 GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
 {
@@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
   tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
 
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
-}
-
-GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
-                                                       processorType& clusterer,
-                                                       const CfFragment& fragment,
-                                                       GPUSharedMemory& smem,
-                                                       const Array2D<PackedCharge>& chargeMap,
-                                                       const ChargePos* filteredPeakPositions,
-                                                       const GPUSettingsRec& calib,
-                                                       MCLabelAccumulator* labelAcc,
-                                                       uint32_t clusternum,
-                                                       uint32_t maxClusterPerRow,
-                                                       uint32_t* clusterInRow,
-                                                       tpc::ClusterNative* clusterByRow,
-                                                       uint32_t* clusterPosInRow)
-{
-  uint32_t idx = get_global_id(0);
-
-  // For certain configurations dummy work items are added, so the total
-  // number of work items is divisible by 64.
-  // These dummy items also compute the last cluster but discard the result.
-  ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
-  Charge charge = chargeMap[pos].unpack();
-
-  ClusterAccumulator pc;
-  CPU_ONLY(labelAcc->collect(pos, charge));
-
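-  // Dummy work items are not filtered out before this call: buildCluster
-  // performs collective shared-memory loads and barriers, so every work
-  // item in the group has to take part.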
-  buildCluster(
-    calib,
-    chargeMap,
-    pos,
-    smem.posBcast,
-    smem.buf,
-    smem.innerAboveThreshold,
-    &pc,
-    labelAcc);
-
-  if (idx >= clusternum) {
-    return;
-  }
-  if (fragment.isOverlap(pos.time())) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-  pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);
-
-  tpc::ClusterNative myCluster;
-  bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());
-
-  if (rejectCluster) {
-    if (clusterPosInRow) {
-      clusterPosInRow[idx] = maxClusterPerRow;
-    }
-    return;
-  }
-
-  uint32_t rowIndex = 0;
-  if (clusterByRow != nullptr) {
-    rowIndex = sortIntoBuckets(
-      clusterer,
-      myCluster,
-      pos.row(),
-      maxClusterPerRow,
-      clusterInRow,
-      clusterByRow);
-    if (clusterPosInRow != nullptr) {
-      clusterPosInRow[idx] = rowIndex;
-    }
-  } else if (clusterPosInRow) {
-    rowIndex = clusterPosInRow[idx];
-  }
-
-  CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
-  const GPUSettingsRec& calib,
-  uint16_t lid,
-  uint16_t N,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc,
-  uint8_t* innerAboveThreshold)
-{
-  uint8_t aboveThreshold = 0;
-
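-  // Accumulate the N inner neighbors and build a per-lane bitmask that
-  // flags which of them are above the charge threshold; the mask later
-  // decides which outer neighbors get loaded at all.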
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = 0; i < N; i++) {
-    Delta2 d = cfconsts::InnerNeighbors[i];
-
-    PackedCharge p = buf[N * lid + i];
-
-    Charge q = cluster->updateInner(p, d);
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-
-    aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
-  }
-
-  innerAboveThreshold[lid] = aboveThreshold;
-
-  GPUbarrier();
-}
-
-GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
-  uint16_t lid,
-  uint16_t N,
-  uint16_t M,
-  uint16_t offset,
-  const PackedCharge* buf,
-  const ChargePos& pos,
-  ClusterAccumulator* cluster,
-  MCLabelAccumulator* labelAcc)
-{
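-  // Accumulate M outer-neighbor charges for this lane, starting at
-  // 'offset' within its N-entry stripe of the shared buffer.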
-  GPUCA_UNROLL(U(), U())
-  for (uint16_t i = offset; i < M + offset; i++) {
-    PackedCharge p = buf[N * lid + i];
-
-    Delta2 d = cfconsts::OuterNeighbors[i];
-
-    Charge q = cluster->updateOuter(p, d);
-    static_cast<void>(q); // Avoid unused variable warning on GPU.
-
-    CPU_ONLY(labelAcc->collect(pos.delta(d), q));
-  }
-}
-
-GPUdii() void GPUTPCCFClusterizer::buildCluster(
-  const GPUSettingsRec& calib,
-  const Array2D<PackedCharge>& chargeMap,
-  ChargePos pos,
-  ChargePos* posBcast,
-  PackedCharge* buf,
-  uint8_t* innerAboveThreshold,
-  ClusterAccumulator* myCluster,
-  MCLabelAccumulator* labelAcc)
-{
-  uint16_t ll = get_local_id(0);
-
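-  // Broadcast each lane's peak position to shared memory so the whole
-  // work group can load neighbor charges cooperatively.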
-  posBcast[ll] = pos;
-  GPUbarrier();
-
-  CfUtils::blockLoad<PackedCharge>(
-    chargeMap,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    8,
-    cfconsts::InnerNeighbors,
-    posBcast,
-    buf);
-  updateClusterInner(
-    calib,
-    ll,
-    8,
-    buf,
-    pos,
-    myCluster,
-    labelAcc,
-    innerAboveThreshold);
-
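-  // The scratch buffer only fits the 16 outer-neighbor charges of half a
-  // work group (wgSizeHalf * 16 == SCRATCH_PAD_WORK_GROUP_SIZE * 8), so
-  // the two halves are processed one after the other.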
-  uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;
-
-  bool inGroup1 = ll < wgSizeHalf;
-
-  uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);
-
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast,
-    innerAboveThreshold,
-    buf);
-
-  if (inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-
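-  // On the GPU, load and process the outer neighbors of the second
-  // half-group as well.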
-#if defined(GPUCA_GPUCODE)
-  CfUtils::condBlockLoad(
-    chargeMap,
-    wgSizeHalf,
-    SCRATCH_PAD_WORK_GROUP_SIZE,
-    ll,
-    0,
-    16,
-    cfconsts::OuterNeighbors,
-    posBcast + wgSizeHalf,
-    innerAboveThreshold + wgSizeHalf,
-    buf);
-  if (!inGroup1) {
-    updateClusterOuter(
-      llhalf,
-      16,
-      16,
-      0,
-      buf,
-      pos,
-      myCluster,
-      labelAcc);
-  }
-#endif
-}
-
-GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
-{
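-  // Atomically reserve the next free slot in this row's bucket; on
-  // overflow the cluster is dropped, an error is raised, and the counter
-  // is clamped back to the bucket capacity.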
-  uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
-  if (index < maxElemsPerBucket) {
-    buckets[maxElemsPerBucket * row + index] = cluster;
-  } else {
-    clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket);
-    CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
-  }
-  return index;
-}
+}
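
For reference, the per-row bucketing that sortIntoBuckets implements maps onto a standard lock-free pattern: atomically reserve a slot, write if in bounds, clamp on overflow. Below is a minimal host-side C++ sketch of that pattern; it is an illustration only, not the O2 API, and the names in it (insertIntoBucket, Cluster, maxPerRow) are made up for the example.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Cluster {
  float qTot; // illustrative payload
};

// Reserve a slot in the fixed-capacity bucket of 'row' and store the
// cluster there; on overflow, report and clamp the counter so it never
// drifts far past the capacity. Mirrors the structure of sortIntoBuckets.
uint32_t insertIntoBucket(const Cluster& c, uint32_t row, uint32_t maxPerRow,
                          std::vector<std::atomic<uint32_t>>& nInRow,
                          std::vector<Cluster>& buckets)
{
  uint32_t index = nInRow[row].fetch_add(1u); // atomic slot reservation
  if (index < maxPerRow) {
    buckets[maxPerRow * row + index] = c;
  } else {
    std::fprintf(stderr, "row %u overflowed (capacity %u)\n", row, maxPerRow);
    nInRow[row].store(maxPerRow); // clamp, like CAMath::AtomicExch above
  }
  return index;
}

int main()
{
  constexpr uint32_t nRows = 4, maxPerRow = 8;
  std::vector<std::atomic<uint32_t>> nInRow(nRows); // zero-initialized counters
  std::vector<Cluster> buckets(nRows * maxPerRow);
  insertIntoBucket(Cluster{42.f}, 2, maxPerRow, nInRow, buckets);
  std::printf("row 2 now holds %u cluster(s)\n", nInRow[2].load());
}

Clamping the counter keeps later reads of it meaningful even after an overflow, at the cost of losing the exact overflow count; the kernel makes the same trade with CAMath::AtomicExch.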