NN clusterizer: Bug fixes for MC labels (#14677)

ChSonnabend · alibuild · web-flow · commit d0daea3091c6 · 2025-09-18T10:50:03.000+02:00
* Bug-fix for MC labels

* Switch on timers on CPU for the first four lanes

* Improved boundary checking for input filling and cluster publishing

* Please consider the following formatting changes

---------

Co-authored-by: ALICE Action Bot &lt;alibuild@cern.ch&gt;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -709,19 +709,21 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
       }
     });
+    const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
+    const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
     for (int32_t sector = 0; sector < NSECTORS; sector++) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
       GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
       int32_t lane = sector % numLanes;
       clustererNN.mDeviceId = deviceId;
       clustererNN.mISector = sector;
       clustererNN.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
-      nnApplications[lane].initClusterizer(nn_settings, clustererNN);
+      nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
       if (doGPU) {
         clustererNNShadow.mDeviceId = deviceId;
         clustererNNShadow.mISector = sector;
         clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
-        nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
+        nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
       }
       if (nn_settings.nnClusterizerVerbosity > 2) {
         LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
@@ -1051,7 +1053,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
             // NN evaluations
             if(clustererNNShadow.mNnClusterizerUseClassification) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
@@ -1065,13 +1067,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
               }
             }
             if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1085,9 +1087,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
               if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1101,7 +1103,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
                   }
                 }
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
               }
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -60,6 +60,10 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int32_t mISector = -1;
   int32_t mDeviceId = -1;
 
+  // charge array boundaries
+  int32_t maxFragmentLen = -1;
+  int32_t maxAllowedTimebin = -1; // == tpcMaxTimeBin
+
   // GPU optimizations
   uint32_t mNnClusterizerFullRowSize = 0;
   uint32_t mNnClusterizerFullPadSize = 0;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -21,6 +21,7 @@
 #include "GPUReconstruction.h"
 #include "GPUTPCGeometry.h"
 #include "DataFormatsTPC/Constants.h"
+#include "clusterFinderDefs.h"
 
 #ifdef GPUCA_HAS_ONNX
 #include <onnxruntime_cxx_api.h>
@@ -84,7 +85,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
   }
 }
 
-void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN)
+void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN, int32_t maxFragmentLen, int32_t maxAllowedTimebin)
 {
   clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
   clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
@@ -109,6 +110,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
   clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
   clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
+  clustererNN.maxFragmentLen = maxFragmentLen == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxFragmentLen;
+  clustererNN.maxAllowedTimebin = maxAllowedTimebin == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxAllowedTimebin;
   if (clustererNN.mNnSigmoidTrafoClassThreshold) {
     clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
   } else {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -48,7 +48,7 @@ class GPUTPCNNClusterizerHost
   GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); }
 
   void init(const GPUSettingsProcessingNNclusterizer&, bool = false);
-  void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
+  void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&, int32_t = -1, int32_t = -1);
   void createBoundary(GPUTPCNNClusterizer&);
   void createIndexLookup(GPUTPCNNClusterizer&);
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h