Updating to use explicit calls to kernels instead of if-statements

ChSonnabend · ChSonnabend · commit 57862a6bc547 · 2025-02-25T10:59:03.000+01:00
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -316,6 +316,7 @@ AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false)
 AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
 AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
 AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
+AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSupression kernel before the digits to the network are filled")
 AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
 AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -896,6 +896,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
           clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
           clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
+          clusterer.nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue;
           if (GetProcessingSettings().nnClusterizerVerbosity < 0){
             clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
           } else {
@@ -955,25 +956,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             clusterer.centralCharges.resize(iSize);
 
             if (evalDtype == 1) {
-              clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(GetProcessingSettings().nnClusterizerBoundaryFillValue));
+              clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(clusterer.nnClusterizerBoundaryFillValue));
             } else {
-              clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)GetProcessingSettings().nnClusterizerBoundaryFillValue));
+              clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue));
             }
 
             auto start0 = std::chrono::high_resolution_clock::now();
-            runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0, batchStart); // Filling the data
+            runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Filling the data
             auto stop0 = std::chrono::high_resolution_clock::now();
 
             auto start1 = std::chrono::high_resolution_clock::now();
             GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype);
-            runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 1, 0, batchStart); // Assigning class labels
+            if (clusterer.model_class.getNumOutputNodes()[0][1] > 1){
+              runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels
+            } else {
+              runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels
+            }
 
             if (!clusterer.nnClusterizerUseCFregression) {
               GPUTPCNNClusterizer::applyNetworkReg1(clusterer, evalDtype);
-              runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 2, 0, batchStart); // Running the NN for regression class 1
+              runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1
               if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) {
                 GPUTPCNNClusterizer::applyNetworkReg2(clusterer, evalDtype);
-                runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 4, 0, batchStart); // Running the NN for regression class 2
+                runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2
               }
             }
             auto stop1 = std::chrono::high_resolution_clock::now();
@@ -985,7 +990,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
           auto start1 = std::chrono::high_resolution_clock::now();
           if(clusterer.nnClusterizerUseCFregression) {
-            runKernel<GPUTPCNNClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, -1, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+            runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
           }
           auto stop1 = std::chrono::high_resolution_clock::now();
           time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -155,6 +155,7 @@ class GPUTPCClusterFinder : public GPUProcessor
   int nnClusterizerUseCFregression = 0;
   int nnClusterizerBatchedMode = 1;
   int nnClusterizerVerbosity = 0;
+  int nnClusterizerBoundaryFillValue = -1;
 
   // Memory allocation for neural network
   uint class2_elements = 0;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -27,45 +27,59 @@ using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
 template <>
-GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
 {
   uint glo_idx = get_global_id(0);
-  if (mode == -1) {
-    if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
-      return;
-    }
-    Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-    CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
-    tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
-    o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
-    GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
-  } else if (mode == 0){
-    GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart);
-  } else if (mode == 1) { // Class labels
-    if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
-      clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold);
-    } else {
-      auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]);
-      uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1]));
-      clusterer.outputDataClass[glo_idx + batchStart] = class_label;
-    }
-  } else if (mode == 2) { // Publishing for class 1 regression
-    if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
-      return;
-    } else {
-      GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart);
-    }
-  } else if (mode == 3) { // Refilling for class 2 regression -> Deprecated because it needs sequential accumulation
+  if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
     return;
-  } else if (mode == 4) { // Publishing for class 2 regression
-    if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
-      return;
-    } else {
-      GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart);
-    }
-  }  else {
+  }
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
+  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+  o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
+  GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart);
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  uint glo_idx = get_global_id(0);
+  clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold);
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  uint glo_idx = get_global_id(0);
+  auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]);
+  uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1]));
+  clusterer.outputDataClass[glo_idx + batchStart] = class_label;
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  uint glo_idx = get_global_id(0);
+  if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
+    return;
+  }
+  GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, onlyMC, batchStart);
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  uint glo_idx = get_global_id(0);
+  if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
     return;
   }
+  GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, onlyMC, batchStart);
 }
 
 
@@ -188,7 +202,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
 }
 
 // ---------------------------------
-GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
+GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
 {
   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
@@ -255,7 +269,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
 }
 
 // ---------------------------------
-GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
+GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
 {
   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -55,14 +55,22 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
     return GPUDataTypes::RecoStep::TPCClusterFinding;
   }
 
-  // Float16 inmplementation
-  template <int32_t iKernel = defaultKernel>
-  GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, int8_t = 0, uint = 0);
+  enum K : int32_t {
+    runCfClusterizer = 0,
+    fillInputNN = 1,
+    determineClass1Labels = 2,
+    determineClass2Labels = 3,
+    publishClass1Regression = 4,
+    publishClass2Regression = 5,
+  };
+
+  template <int32_t iKernel = defaultKernel, typename... Args>
+  GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, uint = 0, Args...);
 
   static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint);
 
-  static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint);
-  static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint);
+  static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint);
+  static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint);
 
   static void applyNetworkClass(processorType&, int8_t = 0, uint = 0);
 
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
@@ -110,7 +110,12 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder"                                "= TPCCLUS
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression"        "= TPCCLUSTERFINDER"                                    LB      single)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks"             "= TPCCLUSTERFINDER"                                    LB      single)
 o2_gpu_add_kernel("GPUTPCCFDeconvolution"                             "= TPCCLUSTERFINDER"                                    LB      single)
-o2_gpu_add_kernel("GPUTPCNNClusterizer"                               "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t mode int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer"             "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN"                  "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels"        "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels"        "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression"      "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression"      "= TPCCLUSTERFINDER"                                    LB      single int8_t dtype int8_t onlyMC uint batchStart)
 o2_gpu_add_kernel("GPUTPCCFClusterizer"                               "= TPCCLUSTERFINDER"                                    LB      single int8_t onlyMC)
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets"           "= TPCCLUSTERFINDER"                                    NO      single)
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten"                 "= TPCCLUSTERFINDER"                                    NO      single GPUTPCLinearLabels* out)