TPC: Add option to write reduced clusterNative, removing rejected clusters

davidrohr · davidrohr · commit ba7056bc990c · 2025-09-24T21:36:01.000+02:00
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.h b/GPU/GPUTracking/DataCompression/GPUTPCCompression.h
@@ -89,6 +89,7 @@ class GPUTPCCompression : public GPUProcessor
   void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
   template <class T>
   GPUd() static void truncateSignificantBits(T& val, uint32_t nBits, uint32_t max);
+  GPUd() bool rejectCluster(int32_t idx, GPUParam& param, const GPUTrackingInOutPointers& ioPtrs);
 
   int16_t mMemoryResOutputHost = -1;
   int16_t mMemoryResOutputGPU = -1;
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
@@ -183,6 +183,31 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::opera
   return mClsPtr[a].qTot < mClsPtr[b].qTot;
 }
 
+GPUd() bool GPUTPCCompression::rejectCluster(int32_t idx, GPUParam& GPUrestrict() param, const GPUTrackingInOutPointers& GPUrestrict() ioPtrs)
+{ // Returns true if the cluster at global index idx is to be skipped by the caller (status already set, or rejected by param.rec.tpc.rejectionStrategy), false if it is kept.
+  if (mClusterStatus[idx]) { // non-zero status entry: cluster is already flagged (presumably by the attached-cluster step - TODO confirm)
+    return true;
+  }
+  int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
+  bool unattached = attach == 0; // an attachment word of 0 is treated as "not attached to any track"
+  // Unattached clusters are rejected from RejectionStrategyB upwards; attached clusters are only examined from RejectionStrategyA upwards.
+  if (unattached) {
+    if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
+      return true;
+    }
+  } else if (param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
+    if (GPUTPCClusterRejection::GetIsRejected(attach)) { // rejection decided from the attachment word alone
+      return true;
+    }
+    int32_t id = attach & gputpcgmmergertypes::attachTrackMask; // masked attachment word is used as index into mergedTracks
+    auto& trk = ioPtrs.mergedTracks[id];
+    if (CAMath::Abs(trk.GetParam().GetQPt() * param.qptB5Scaler) > param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) { // scaled |q/pt| above the rejectQPtB5 threshold, or track flagged as merged looper
+      return true;
+    }
+  }
+  return false; // none of the rejection criteria matched
+}
+
 template <>
 GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
 {
@@ -208,33 +233,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
     const uint32_t nn = CAMath::nextMultipleOf<GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached)>(clusters->nClusters[iSector][iRow]);
     for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) {
       const int32_t idx = idOffset + i;
-      int32_t storeCluster = 0;
-      do {
-        if (i >= clusters->nClusters[iSector][iRow]) {
-          break;
-        }
-        if (compressor.mClusterStatus[idx]) {
-          break;
-        }
-        int32_t attach = ioPtrs.mergedTrackHitAttachment[idx];
-        bool unattached = attach == 0;
-
-        if (unattached) {
-          if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyB) {
-            break;
-          }
-        } else if (processors.param.rec.tpc.rejectionStrategy >= GPUSettings::RejectionStrategyA) {
-          if (GPUTPCClusterRejection::GetIsRejected(attach)) {
-            break;
-          }
-          int32_t id = attach & gputpcgmmergertypes::attachTrackMask;
-          auto& trk = ioPtrs.mergedTracks[id];
-          if (CAMath::Abs(trk.GetParam().GetQPt() * processors.param.qptB5Scaler) > processors.param.rec.tpc.rejectQPtB5 || trk.MergedLooper()) {
-            break;
-          }
-        }
-        storeCluster = 1;
-      } while (false);
+      int32_t storeCluster = i < clusters->nClusters[iSector][iRow] && !compressor.rejectCluster(idx, param, ioPtrs);
 
       GPUbarrier();
       int32_t myIndex = work_group_scan_inclusive_add(storeCluster);
diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h b/GPU/GPUTracking/DataTypes/GPUDataTypes.h
@@ -245,6 +245,7 @@ struct GPUTrackingInOutPointers {
   uint32_t nOutputClusRefsTPCO2 = 0;
   const o2::MCCompLabel* outputTracksTPCO2MC = nullptr;
   const o2::tpc::CompressedClustersFlat* tpcCompressedClusters = nullptr;
+  const o2::tpc::ClusterNativeAccess* clustersNativeReduced = nullptr;
 
   // TPC links
   int32_t* tpcLinkITS = nullptr;
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -361,6 +361,7 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, uint32_t, 51000, "", 0, "Maximum n
 AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
 AddOption(tpcApplyCFCutsAtDecoding, bool, false, "", 0, "Apply cluster cuts from clusterization during decoding of compressed clusters")
 AddOption(tpcApplyClusterFilterOnCPU, uint8_t, 0, "", 0, "Apply custom cluster filter of GPUTPCClusterFilter class, 0: off, 1: debug, 2: PbPb23")
+AddOption(tpcWriteClustersAfterRejection, bool, false, "", 0, "Apply TPC rejection strategy before writing clusters")
 AddOption(oclPlatformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select, -2 query all platforms (also incompatible))")
 AddOption(oclCompileFromSources, bool, false, "", 0, "Compile OpenCL binary from included source code instead of using included spirv code")
 AddOption(oclOverrideSourceBuildFlags, std::string, "", "", 0, "Override OCL build flags for compilation from source, put a space for empty options")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -273,6 +273,10 @@ bool GPUChainTracking::ValidateSettings()
     GPUError("Clusterizer and merger Sanity checks only supported when not running on GPU");
     return false;
   }
+  if (GetProcessingSettings().tpcWriteClustersAfterRejection && (mRec->IsGPU() || param().rec.tpc.compressionTypeMask || !(GetRecoSteps() & GPUDataTypes::RecoStep::TPCCompression))) {
+    GPUError("tpcWriteClustersAfterRejection requires compressionTypeMask = 0, no GPU usage, and compression enabled");
+    return false;
+  }
   if (GetProcessingSettings().doublePipeline) {
     if (!GetRecoStepsOutputs().isOnlySet(GPUDataTypes::InOutType::TPCMergedTracks, GPUDataTypes::InOutType::TPCCompressedClusters, GPUDataTypes::InOutType::TPCClusters)) {
       GPUError("Invalid outputs for double pipeline mode 0x%x", (uint32_t)GetRecoStepsOutputs());
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -305,6 +305,7 @@ class GPUChainTracking : public GPUChain
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
   bool NeedTPCClustersOnGPU();
+  void WriteReducedClusters();
   template <int32_t I>
   int32_t RunTRDTrackingInternal();
   uint32_t StreamForSector(uint32_t sector) const;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -772,7 +772,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       GPUFatal("Cannot use waitForFinalInput callback without delayed output");
     }
     if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
-      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
       tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
     } else {
       tmpNativeClusterBuffer = std::make_unique<ClusterNative[]>(mInputsHost->mNClusterNative);
@@ -1269,7 +1269,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     // TODO: write to buffer directly
     o2::dataformats::MCTruthContainer<o2::MCCompLabel> mcLabels;
     std::pair<ConstMCLabelContainer*, ConstMCLabelContainerView*> buffer;
-    if (mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) {
+    if (!GetProcessingSettings().tpcWriteClustersAfterRejection && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) {
       if (!mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator) {
         throw std::runtime_error("Cluster MC Label buffer missing");
       }
@@ -1293,7 +1293,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
   if (buildNativeHost && buildNativeGPU && GetProcessingSettings().delayedOutput) {
     mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClsTotal;
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
     tmpNativeClusters = mInputsHost->mPclusterNativeOutput;
     for (uint32_t i = outputQueueStart; i < mOutputQueue.size(); i++) {
       mOutputQueue[i].dst = (char*)tmpNativeClusters + (size_t)mOutputQueue[i].dst;
@@ -1308,7 +1308,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     if (GetProcessingSettings().tpcApplyClusterFilterOnCPU) {
       auto allocator = [this, &tmpNativeClusters](size_t size) {
         this->mInputsHost->mNClusterNative = size;
-        this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+        this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->GetProcessingSettings().tpcWriteClustersAfterRejection ? nullptr : this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
         return (tmpNativeClusters = this->mInputsHost->mPclusterNativeOutput);
       };
       RunTPCClusterFilter(tmpNativeAccess, allocator, false);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -23,6 +23,7 @@
 #include "GPUConstantMem.h" // TODO: Try to get rid of as many GPUConstantMem includes as possible!
 #include "GPUTPCCompressionKernels.h"
 #include "GPUTPCDecompressionKernels.h"
+#include "SimulationDataFormat/ConstMCTruthContainer.h"
 #include "utils/strtag.h"
 
 #include <numeric>
@@ -52,6 +53,9 @@ int32_t GPUChainTracking::RunTPCCompression()
   TransferMemoryResourcesToGPU(myStep, &Compressor, 0);
   runKernel<GPUMemClean16>(GetGridAutoStep(0, RecoStep::TPCCompression), CompressorShadow.mClusterStatus, Compressor.mMaxClusters * sizeof(CompressorShadow.mClusterStatus[0]));
   runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step0attached>(GetGridAuto(0));
+  if (GetProcessingSettings().tpcWriteClustersAfterRejection) {
+    WriteReducedClusters();
+  }
   runKernel<GPUTPCCompressionKernels, GPUTPCCompressionKernels::step1unattached>(GetGridAuto(0));
   TransferMemoryResourcesToHost(myStep, &Compressor, 0);
 #ifdef GPUCA_TPC_GEOMETRY_O2
@@ -434,3 +438,20 @@ int32_t GPUChainTracking::RunTPCDecompression()
   DoDebugDump(GPUChainTrackingDebugFlags::TPCDecompressedClusters, &GPUChainTracking::DumpClusters, *mDebugFile, mIOPtrs.clustersNative);
   return 0;
 }
+
+void GPUChainTracking::WriteReducedClusters()
+{ // Obtains the clusterNative and clusterLabels output buffers via their output-control allocators; throws std::runtime_error if either allocator is missing.
+  GPUOutputControl* clOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)];
+  if (!clOutput || !clOutput->allocator) {
+    throw std::runtime_error("No output allocator for clusterNative available");
+  }
+  auto* clBuffer = (ClusterNative*)clOutput->allocator(mIOPtrs.clustersNative->nClustersTotal * sizeof(ClusterNative)); // sized for the full nClustersTotal; mIOPtrs.clustersNative is dereferenced without a null check
+  // NOTE(review): clBuffer is not written to in this function and nothing here sets clustersNativeReduced - presumably completed elsewhere; confirm.
+  GPUOutputControl* labelOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)];
+  if (!labelOutput || !labelOutput->allocator) {
+    throw std::runtime_error("No output allocator for clusterNative labels available");
+  }
+  // The pointer returned by allocator(0) is reinterpreted as the container/view pair object - assumes the allocator hands back such an object; TODO confirm.
+  ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* labelContainer = reinterpret_cast<ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer*>(labelOutput->allocator(0));
+  std::pair<o2::dataformats::ConstMCLabelContainer*, o2::dataformats::ConstMCLabelContainerView*> labelBuffer = {&labelContainer->first, &labelContainer->second}; // NOTE(review): labelBuffer is not used after this point
+}
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
@@ -293,7 +293,9 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     mRec->PushNonPersistentMemory(qStr2Tag("TPCMERG2"));
     AllocateRegisteredMemory(Merger.MemoryResOutputO2Scratch());
     WriteToConstantMemory(RecoStep::TPCMerging, (char*)&processors()->tpcMerger - (char*)processors(), &MergerShadow, sizeof(MergerShadow), 0);
-    runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::prepare>(GetGridAuto(0, deviceType));
+    if (!GetProcessingSettings().tpcWriteClustersAfterRejection) {
+      runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::prepare>(GetGridAuto(0, deviceType));
+    }
     TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0, &mEvents->single);
     runKernel<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(GetGridAuto(0, deviceType));
     mRec->ReturnVolatileDeviceMemory();
diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx
@@ -889,6 +889,9 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc)
 
   // ------------------------------ Varios postprocessing steps ------------------------------
 
+  if (mConfig->configProcessing.tpcWriteClustersAfterRejection) {
+    ptrs.clustersNative = ptrs.clustersNativeReduced;
+  }
   bool createEmptyOutput = false;
   if (retVal != 0) {
     if (retVal == 3 && mConfig->configProcessing.ignoreNonFatalGPUErrors) {