AliceO2Group · ChSonnabend · Mar 15, 2025 · Mar 15, 2025 · Mar 15, 2025 · Mar 15, 2025
@@ -84,7 +84,7 @@ class OrtModel
 
   // Environment settings
   bool mInitialized = false;
-  std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
+  std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
   int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
 
   std::string printShape(const std::vector<int64_t>&);

@@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
   if (!optionsMap["model-path"].empty()) {
     modelPath = optionsMap["model-path"];
     device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
-    dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
     deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
     allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
     intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
@@ -226,19 +225,18 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Fl
 template <class I, class O>
 void OrtModel::inference(I* input, size_t input_size, O* output)
 {
-  std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
+  std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
   Ort::Value inputTensor = Ort::Value(nullptr);
   if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
-    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   } else {
-    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   }
 
-  std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
-  size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]);
-  Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
+  std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
+  Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
 
-  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here
+  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here
 }
 
 template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*);

@@ -228,7 +228,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural
 AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
 AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
 AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
-AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
@@ -249,6 +250,16 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c
 AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
 AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
+// CCDB
+AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally")
+AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched")
+AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks")
+AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)")
+AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression")
+AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
+AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
+AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp")
+AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].")
 AddHelp("help", 'h')
 EndConfig()
 

@@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   }
 
 #ifdef GPUCA_HAS_ONNX
+  const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
+  GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only
   if (GetProcessingSettings().nn.applyNNclusterizer) {
-    uint32_t maxClusters = -1;
+    uint32_t maxClusters = 0;
+    nnApplication.init(nn_settings);
     for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
       maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters);
     }
     for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
-      const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
       clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
       clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
       clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
@@ -639,8 +641,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       } else {
         clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
       }
-      clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
-      GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+      clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos;
+      nnApplication.initClusterizer(nn_settings, clustererNN);
       AllocateRegisteredMemory(clustererNN.mMemoryId);
     }
   }
@@ -916,7 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 #ifdef GPUCA_HAS_ONNX
           GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
           const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
-          GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+          int withMC = (doGPU && propagateMCLabels);
 
           if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
             runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
@@ -929,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
 
             auto start0 = std::chrono::high_resolution_clock::now();
-            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
+            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data
 
             auto stop0 = std::chrono::high_resolution_clock::now();
             auto start1 = std::chrono::high_resolution_clock::now();
-            nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
+            nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType);
             if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
-              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
             } else {
-              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
             }
 
             if (!clustererNN.nnClusterizerUseCfRegression) {
-              nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
-              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
-              if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
-                nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
-                runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
+              nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType);
+              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1
+              if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
+                nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType);
+                runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2
               }
             }
             auto stop1 = std::chrono::high_resolution_clock::now();
@@ -955,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           auto start1 = std::chrono::high_resolution_clock::now();
           if (clustererNN.nnClusterizerUseCfRegression) {
-            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
           }
           auto stop1 = std::chrono::high_resolution_clock::now();
           time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;

@@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}
 
 void* GPUTPCNNClusterizer::setIOPointers(void* mem)
 {
-  if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) {
-    computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
-  } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) {
-    computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
-  }
-  computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
-  if (nnClusterizerModelClassNumOutputNodes > 0) {
-    computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
-  }
-  if (!nnClusterizerUseCfRegression) {
-    if (nnClusterizerModelReg1NumOutputNodes > 0) {
-      computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
+  if (nnClusterizerBatchedMode > 0) {
+    if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
+    } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
     }
-    if (nnClusterizerModelReg2NumOutputNodes > 0) {
-      computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
+    computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
+    computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
+    computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
+    if (nnClusterizerModelClassNumOutputNodes > 0) {
+      computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
     }
+    if (!nnClusterizerUseCfRegression) {
+      if (nnClusterizerModelReg1NumOutputNodes > 0) {
+        computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
+      }
+      if (nnClusterizerModelReg2NumOutputNodes > 0) {
+        computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
+      }
+    }
+  }
+  if (nnClusterizerTotalClusters > 0) {
+    computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
   }
   return mem;
 }

@@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int nnClusterizerSizeInputTime = 3;
   int nnClusterizerElementSize = -1;
   bool nnClusterizerAddIndexData = true;
-  float nnClassThreshold = 0.16;
+  float nnClassThreshold = 0.01;
   bool nnSigmoidTrafoClassThreshold = 1;
   int nnClusterizerUseCfRegression = 0;
   int nnClusterizerBatchedMode = 1;
@@ -54,11 +54,10 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int nnClusterizerModelClassNumOutputNodes = -1;
   int nnClusterizerModelReg1NumOutputNodes = -1;
   int nnClusterizerModelReg2NumOutputNodes = -1;
-  int nnClusterizerDtype = 0; // 0: float16, 1: float32
+  int nnInferenceInputDType = 0; // 0: float16, 1: float32
   int mISector = -1;
 
   // Memory allocation for neural network
-  uint class2_elements = 0;
   float* inputData32 = nullptr;
   OrtDataType::Float16_t* inputData16 = nullptr;
   float* outputDataClass = nullptr;