@@ -40,7 +40,7 @@ int32_t GPUChainTracking::RunTPCCompression()
     RecordMarker(mEvents->single, 0);
   }
 
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
   }
   SetupGPUProcessor(&Compressor, true);
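Context for this hunk: gather mode 3 writes the compressed output into volatile device memory, which the chain reclaims in one call once the host copy has completed (see the ReturnVolatileDeviceMemory() call in a later hunk). Below is a minimal sketch of that watermark idiom on top of a linear allocator; all names here (StackAllocator, markVolatile, returnVolatile) are hypothetical illustrations, not the O2 API.

#include <cstddef>
#include <cstdint>

// Hypothetical linear allocator with a "volatile" watermark: everything
// allocated after markVolatile() is released by a single returnVolatile().
class StackAllocator {
  char* mPool;                          // backing device memory pool (allocation not shown)
  size_t mSize;
  size_t mOffset = 0;
  size_t mVolatileMark = SIZE_MAX;      // SIZE_MAX: no volatile region active

 public:
  StackAllocator(char* pool, size_t size) : mPool(pool), mSize(size) {}

  void* allocate(size_t n)
  {
    n = (n + 255) & ~size_t(255);       // keep GPU-friendly alignment
    if (mOffset + n > mSize) {
      return nullptr;                   // pool exhausted
    }
    void* p = mPool + mOffset;
    mOffset += n;
    return p;
  }

  void markVolatile() { mVolatileMark = mOffset; } // future allocations become volatile

  void returnVolatile()                 // roll the allocator back to the watermark
  {
    if (mVolatileMark != SIZE_MAX) {
      mOffset = mVolatileMark;
      mVolatileMark = SIZE_MAX;
    }
  }
};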
@@ -73,19 +73,19 @@ int32_t GPUChainTracking::RunTPCCompression()
   Compressor.mOutputFlat->set(outputSize, *Compressor.mOutput);
   char* hostFlatPtr = (char*)Compressor.mOutput->qTotU; // First array as allocated in GPUTPCCompression::SetPointersCompressedClusters
   size_t copySize = 0;
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     CompressorShadow.mOutputA = Compressor.mOutput;
     copySize = AllocateRegisteredMemory(Compressor.mMemoryResOutputGPU); // We overwrite Compressor.mOutput with the allocated output pointers on the GPU
   }
   const o2::tpc::CompressedClustersPtrs* P = nullptr;
   HighResTimer* gatherTimer = nullptr;
   int32_t outputStream = 0;
-  if (ProcessingSettings().doublePipeline) {
+  if (GetProcessingSettings().doublePipeline) {
     SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes.
     outputStream = OutputStream();
   }
-  if (ProcessingSettings().tpcCompressionGatherMode >= 2) {
-    if (ProcessingSettings().tpcCompressionGatherMode == 2) {
+  if (GetProcessingSettings().tpcCompressionGatherMode >= 2) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 2) {
       void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat);
       if (devicePtr != Compressor.mOutputFlat) {
         CompressedClustersPtrs& ptrs = *Compressor.mOutput; // We need to update the ptrs with the gpu-mapped version of the host address space
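The hunk is cut off just before the rebasing loop: for gather mode 2, the comment above says the host-resident output pointers must be rewritten to their GPU-mapped equivalents before the transfer. Here is a sketch of such a rebase, under the assumption that the struct holds nothing but pointers into one flat buffer; Ptrs and rebasePtrs are hypothetical stand-ins for CompressedClustersPtrs and the elided loop.

#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for CompressedClustersPtrs: pointers into one flat buffer.
struct Ptrs {
  uint16_t* qTotU;
  uint8_t* qMaxU;
  uint8_t* flagsU;
};

// Shift every pointer member by the distance between the host buffer and its
// GPU-mapped alias, so all offsets inside the buffer stay identical.
void rebasePtrs(Ptrs& ptrs, void* hostBase, void* deviceBase)
{
  const ptrdiff_t shift = static_cast<char*>(deviceBase) - static_cast<char*>(hostBase);
  char** raw = reinterpret_cast<char**>(&ptrs); // valid only while Ptrs holds pointers exclusively
  for (size_t i = 0; i < sizeof(Ptrs) / sizeof(char*); i++) {
    raw[i] += shift;
  }
}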
@@ -97,7 +97,7 @@ int32_t GPUChainTracking::RunTPCCompression()
     TransferMemoryResourcesToGPU(myStep, &Compressor, outputStream);
     constexpr uint32_t nBlocksDefault = 2;
     constexpr uint32_t nBlocksMulti = 1 + 2 * 200;
-    switch (ProcessingSettings().tpcCompressionGatherModeKernel) {
+    switch (GetProcessingSettings().tpcCompressionGatherModeKernel) {
       case 0:
         runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
         getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(RecoStep::TPCCompression, 0, outputSize, false);
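The switch above exists because the gather kernel flavor is a template parameter and must be fixed at compile time, so the runtime setting tpcCompressionGatherModeKernel has to be mapped case by case onto specializations. A reduced sketch of the pattern follows, with hypothetical names (Gather, runGatherKernel, dispatchGather); the real kernels are the GPUTPCCompressionGatherKernels specializations.

#include <cstdint>
#include <cstdio>

// Compile-time tags standing in for the GPUTPCCompressionGatherKernels variants.
enum class Gather { Unbuffered, Buffered, MultiBlock };

template <Gather K>
void runGatherKernel(uint32_t nBlocks)
{
  // launch the specialization for K with nBlocks blocks (launch elided)
  std::printf("variant %d with %u blocks\n", static_cast<int>(K), nBlocks);
}

int32_t dispatchGather(int32_t kernelMode, uint32_t nBlocksDefault, uint32_t nBlocksMulti)
{
  switch (kernelMode) { // runtime setting selects a compile-time specialization
    case 0:
      runGatherKernel<Gather::Unbuffered>(nBlocksDefault);
      break;
    case 1:
      runGatherKernel<Gather::Buffered>(nBlocksDefault);
      break;
    case 2:
      runGatherKernel<Gather::MultiBlock>(nBlocksMulti);
      break;
    default:
      return 1; // mirrors the GPUError + return 1 path above
  }
  return 0;
}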
@@ -120,10 +120,10 @@ int32_t GPUChainTracking::RunTPCCompression()
         getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(RecoStep::TPCCompression, 0, outputSize, false);
         break;
       default:
-        GPUError("Invalid compression kernel %d selected.", (int32_t)ProcessingSettings().tpcCompressionGatherModeKernel);
+        GPUError("Invalid compression kernel %d selected.", (int32_t)GetProcessingSettings().tpcCompressionGatherModeKernel);
         return 1;
     }
-    if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
       RecordMarker(mEvents->stream[outputStream], outputStream);
       char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
       if (GetProcessingSettings().doublePipeline) {
@@ -138,9 +138,9 @@ int32_t GPUChainTracking::RunTPCCompression()
     }
   } else {
     int8_t direction = 0;
-    if (ProcessingSettings().tpcCompressionGatherMode == 0) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 0) {
       P = &CompressorShadow.mPtrs;
-    } else if (ProcessingSettings().tpcCompressionGatherMode == 1) {
+    } else if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
       P = &Compressor.mPtrs;
       direction = -1;
       gatherTimer = &getTimer<GPUTPCCompressionKernels>("GPUTPCCompression_GatherOnCPU", 0);
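Modes 0 and 1 share the per-array GPUMemCpyAlways loop that follows and differ only in the source pointers and the direction flag: mode 0 copies from the device-side CompressorShadow.mPtrs, while mode 1 gathers from the host-side Compressor.mPtrs with direction = -1 under a CPU timer. The sketch below shows the convention this appears to imply — a plain host memcpy for -1, an async device copy otherwise. This is an assumption about GPUMemCpyAlways, whose real signature carries more parameters, and gpuMemcpyAsync is a placeholder.

#include <cstdint>
#include <cstring>

// Placeholder standing in for the framework's async device copy backend.
void gpuMemcpyAsync(void* dst, const void* src, size_t size, bool toGPU, int32_t stream)
{
  (void)toGPU;
  (void)stream;
  std::memcpy(dst, src, size); // a real backend would enqueue an async copy on the stream
}

// Sketch of the assumed direction convention: a non-negative direction selects
// an async host/device copy on the given stream, -1 a plain host-side memcpy
// (gather mode 1, where the clusters are already on the host).
void memCpyAlways(void* dst, const void* src, size_t size, int32_t stream, int8_t direction)
{
  if (direction < 0) {
    std::memcpy(dst, src, size);
  } else {
    gpuMemcpyAsync(dst, src, size, direction != 0, stream);
  }
}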
@@ -184,11 +184,11 @@ int32_t GPUChainTracking::RunTPCCompression()
     GPUMemCpyAlways(myStep, O->timeA, P->timeA, O->nTracks * sizeof(O->timeA[0]), outputStream, direction);
     GPUMemCpyAlways(myStep, O->padA, P->padA, O->nTracks * sizeof(O->padA[0]), outputStream, direction);
   }
-  if (ProcessingSettings().tpcCompressionGatherMode == 1) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
     gatherTimer->Stop();
   }
   mIOPtrs.tpcCompressedClusters = Compressor.mOutputFlat;
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     SynchronizeEventAndRelease(mEvents->stream[outputStream]);
     mRec->ReturnVolatileDeviceMemory();
   }
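For mode 3 the copy scheduled earlier reads from volatile device memory, so SynchronizeEventAndRelease() must wait on the marker recorded after the gather before ReturnVolatileDeviceMemory() hands that memory back for reuse. Below is a sketch of the same ordering written against the CUDA runtime API; using CUDA here is an assumption, since the framework abstracts the backend.

#include <cuda_runtime.h>

// Ordering sketch: enqueue the async copy, record an event behind it, and
// only release the (volatile) source buffer after the event has completed.
void copyThenRelease(void* hostDst, const void* deviceSrc, size_t size,
                     cudaStream_t stream, cudaEvent_t marker)
{
  cudaMemcpyAsync(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost, stream);
  cudaEventRecord(marker, stream);  // corresponds to RecordMarker(...) above

  // ... other host work may overlap with the copy here ...

  cudaEventSynchronize(marker);     // corresponds to SynchronizeEventAndRelease(...)
  // Only now may the volatile device memory backing deviceSrc be reclaimed,
  // as mRec->ReturnVolatileDeviceMemory() does in the chain.
}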
@@ -209,18 +209,52 @@ int32_t GPUChainTracking::RunTPCDecompression()
   if (GetProcessingSettings().tpcUseOldCPUDecoding) {
     const auto& threadContext = GetThreadContext();
     TPCClusterDecompressor decomp;
-    auto allocator = [this](size_t size) {
+    auto allocatorFinal = [this](size_t size) {
       this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
       this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
       return this->mInputsHost->mPclusterNativeOutput;
     };
-    auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
-    gatherTimer.Start();
-    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
+    std::unique_ptr<ClusterNative[]> tmpBuffer;
+    auto allocatorTmp = [&tmpBuffer](size_t size) {
+      return (tmpBuffer = std::make_unique<ClusterNative[]>(size)).get();
+    };
+    auto& decompressTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
+    auto allocatorUse = GetProcessingSettings().tpcApplyCFCutsAtDecoding ? std::function<ClusterNative*(size_t)>{allocatorTmp} : std::function<ClusterNative*(size_t)>{allocatorFinal};
+    decompressTimer.Start();
+    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocatorUse, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
       GPUError("Error decompressing clusters");
       return 1;
     }
-    gatherTimer.Stop();
+    if (GetProcessingSettings().tpcApplyCFCutsAtDecoding) {
+      ClusterNative* outputBuffer;
+      for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
+        uint32_t countTotal = 0;
+        for (uint32_t iSector = 0; iSector < GPUCA_NSLICES; iSector++) {
+          for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
+            uint32_t count = 0;
+            for (uint32_t k = 0; k < mClusterNativeAccess->nClusters[iSector][iRow]; k++) {
+              const ClusterNative& cl = mClusterNativeAccess->clusters[iSector][iRow][k];
+              bool keep = cl.qTot > param().rec.tpc.cfQTotCutoff && cl.qMax > param().rec.tpc.cfQMaxCutoff && (cl.sigmaPadPacked || !(cl.getFlags() & ClusterNative::flagSingle) || cl.qMax > param().rec.tpc.cfQMaxCutoffSinglePad) && (cl.sigmaTimePacked || !(cl.getFlags() & ClusterNative::flagSingle) || cl.qMax > param().rec.tpc.cfQMaxCutoffSingleTime);
+              if (iPhase && keep) {
+                outputBuffer[countTotal] = cl;
+              }
+              count += keep;
+              countTotal += keep;
+            }
+            if (iPhase) {
+              mClusterNativeAccess->nClusters[iSector][iRow] = count;
+            }
+          }
+        }
+        if (iPhase) {
+          mClusterNativeAccess->clustersLinear = outputBuffer;
+          mClusterNativeAccess->setOffsetPtrs();
+        } else {
+          outputBuffer = allocatorFinal(countTotal);
+        }
+      }
+    }
+    decompressTimer.Stop();
     mIOPtrs.clustersNative = mClusterNativeAccess.get();
     if (mRec->IsGPU()) {
       AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
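The new tpcApplyCFCutsAtDecoding path decompresses into a temporary buffer (allocatorTmp; the std::function wrapper is needed because the two lambdas have distinct closure types but must flow through one call site), then compacts in two phases: phase 0 runs the charge cuts only to count survivors so allocatorFinal can size the real output exactly, phase 1 reruns them to copy the survivors and patch the per-row counts and offset pointers. Note that the store must happen only for kept clusters and before the counters advance, as in the corrected hunk above. Below is a self-contained sketch of the same two-phase compaction on a flat array, with hypothetical names.

#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct Cluster {
  uint16_t qTot, qMax;
};

// Two-phase compaction: pass 0 counts survivors so the output can be sized
// exactly; pass 1 copies them. Mirrors the iPhase loop in the hunk above.
std::unique_ptr<Cluster[]> applyCuts(const std::vector<Cluster>& in,
                                     uint16_t qTotCut, uint16_t qMaxCut, uint32_t& nOut)
{
  std::unique_ptr<Cluster[]> out;
  for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
    uint32_t count = 0;
    for (const Cluster& cl : in) {
      const bool keep = cl.qTot > qTotCut && cl.qMax > qMaxCut;
      if (iPhase && keep) {
        out[count] = cl; // write before incrementing, and only if kept
      }
      count += keep;
    }
    if (!iPhase) {
      out = std::make_unique<Cluster[]>(count); // allocation sized by pass 0
    }
    nOut = count;
  }
  return out;
}

int main()
{
  std::vector<Cluster> in{{10, 5}, {3, 1}, {20, 8}};
  uint32_t n = 0;
  auto out = applyCuts(in, 4, 2, n);
  std::printf("%u of %zu clusters kept\n", n, in.size());
}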