Skip to content

Commit a6bbedb

Browse files
committed
Please consider the following formatting changes
1 parent 08753dd commit a6bbedb

File tree

5 files changed

+69
-69
lines changed

5 files changed

+69
-69
lines changed

Common/ML/src/OrtInterface.cxx

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Floa
303303
return outputValuesVec;
304304
}
305305

306-
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
306+
template <> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
307307
float* OrtModel::inference(float* input, size_t input_size)
308308
{
309309
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
@@ -315,7 +315,7 @@ float* OrtModel::inference(float* input, size_t input_size)
315315
return outputValues;
316316
}
317317

318-
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
318+
template <> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
319319
float* OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size)
320320
{
321321
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
@@ -327,34 +327,30 @@ float* OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size)
327327
return outputValues;
328328
}
329329

330-
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
330+
template <> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
331331
void OrtModel::inference(float* input, size_t input_size, float* output)
332332
{
333333
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
334334
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
335-
335+
336336
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
337337
size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]);
338338
Ort::Value outputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
339-
340-
(pImplOrt->session)->Run(pImplOrt->runOptions,
341-
inputNamesChar.data(), &inputTensor, 1,
342-
outputNamesChar.data(), &outputTensor, 1);
339+
340+
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1);
343341
}
344342

345-
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
343+
template <> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
346344
void OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size, float* output)
347345
{
348346
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
349347
Ort::Value inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
350-
348+
351349
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
352350
size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]);
353351
Ort::Value outputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
354-
355-
(pImplOrt->session)->Run(pImplOrt->runOptions,
356-
inputNamesChar.data(), &inputTensor, 1,
357-
outputNamesChar.data(), &outputTensor, 1);
352+
353+
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1);
358354
}
359355

360356
template <>

GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -163,20 +163,20 @@ class GPUTPCClusterFinder : public GPUProcessor
163163
int nnClusterizerModelReg1NumOutputNodes = -1;
164164
int nnClusterizerModelReg2NumOutputNodes = -1;
165165
uint nnClusterizerCurrentSize = -1; // This variable determines the size of the memory pointers. It will be set at runtime.
166-
int nnClusterizerDtype = 0; // 0: float16, 1: float32
166+
int nnClusterizerDtype = 0; // 0: float16, 1: float32
167167

168168
// Memory allocation for neural network
169169
uint class2_elements = 0;
170-
float* inputData32=nullptr;
171-
OrtDataType::Float16_t* inputData16=nullptr;
172-
float* outputDataClass=nullptr;
173-
float* modelProbabilities=nullptr;
174-
float* outputDataReg1=nullptr;
175-
float* outputDataReg2=nullptr;
176-
177-
ChargePos* peakPositions=nullptr;
178-
bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
179-
float* centralCharges=nullptr;
170+
float* inputData32 = nullptr;
171+
OrtDataType::Float16_t* inputData16 = nullptr;
172+
float* outputDataClass = nullptr;
173+
float* modelProbabilities = nullptr;
174+
float* outputDataReg1 = nullptr;
175+
float* outputDataReg2 = nullptr;
176+
177+
ChargePos* peakPositions = nullptr;
178+
bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
179+
float* centralCharges = nullptr;
180180

181181
#ifndef GPUCA_GPUCODE
182182
void DumpDigits(std::ostream& out);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -146,9 +146,9 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
146146
for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) {
147147
if (!is_boundary) {
148148
ChargePos tmp_pos(row + r, pad + p, time + t);
149-
if (r == 0 && !clusterer.clusterFlags[2*glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
150-
clusterer.clusterFlags[2*glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]);
151-
clusterer.clusterFlags[2*glo_idx + 1] = clusterer.clusterFlags[2*glo_idx];
149+
if (r == 0 && !clusterer.clusterFlags[2 * glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
150+
clusterer.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]);
151+
clusterer.clusterFlags[2 * glo_idx + 1] = clusterer.clusterFlags[2 * glo_idx];
152152
}
153153
if (dtype == 0) {
154154
clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
@@ -218,12 +218,12 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
218218
}
219219

220220
pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4],
221-
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index],
222-
clusterer.outputDataReg1[model_output_index + 2],
223-
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1],
224-
clusterer.outputDataReg1[model_output_index + 3],
225-
clusterer.clusterFlags[2*glo_idx],
226-
clusterer.clusterFlags[2*glo_idx + 1]);
221+
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index],
222+
clusterer.outputDataReg1[model_output_index + 2],
223+
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1],
224+
clusterer.outputDataReg1[model_output_index + 3],
225+
clusterer.clusterFlags[2 * glo_idx],
226+
clusterer.clusterFlags[2 * glo_idx + 1]);
227227

228228
tpc::ClusterNative myCluster;
229229
bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param());
@@ -296,12 +296,12 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo
296296

297297
// Cluster 1
298298
pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 8],
299-
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index],
300-
clusterer.outputDataReg2[model_output_index + 4],
301-
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 2],
302-
clusterer.outputDataReg2[model_output_index + 6],
303-
clusterer.clusterFlags[2*glo_idx],
304-
clusterer.clusterFlags[2*glo_idx + 1]);
299+
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index],
300+
clusterer.outputDataReg2[model_output_index + 4],
301+
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 2],
302+
clusterer.outputDataReg2[model_output_index + 6],
303+
clusterer.clusterFlags[2 * glo_idx],
304+
clusterer.clusterFlags[2 * glo_idx + 1]);
305305

306306
tpc::ClusterNative myCluster;
307307
bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param());
@@ -331,12 +331,12 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo
331331

332332
// Cluster 2
333333
pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 9],
334-
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index + 1],
335-
clusterer.outputDataReg2[model_output_index + 5],
336-
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 3],
337-
clusterer.outputDataReg2[model_output_index + 7],
338-
clusterer.clusterFlags[2*glo_idx],
339-
clusterer.clusterFlags[2*glo_idx + 1]);
334+
static_cast<float>(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index + 1],
335+
clusterer.outputDataReg2[model_output_index + 5],
336+
static_cast<float>((clusterer.mPmemory->fragment).start) + static_cast<float>(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 3],
337+
clusterer.outputDataReg2[model_output_index + 7],
338+
clusterer.clusterFlags[2 * glo_idx],
339+
clusterer.clusterFlags[2 * glo_idx + 1]);
340340

341341
rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param());
342342
if (rejectCluster) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,22 @@
1616

1717
using namespace o2::gpu;
1818

19-
GPUTPCNNClusterizerInternals::GPUTPCNNClusterizerInternals(GPUSettingsProcessing settings, processorType& clusterer) {
19+
GPUTPCNNClusterizerInternals::GPUTPCNNClusterizerInternals(GPUSettingsProcessing settings, processorType& clusterer)
20+
{
2021
clusterer_internal = &clusterer;
2122
GPUSettingsProcessingNNclusterizer nn_settings = settings.nn;
2223
OrtOptions = {{"model-path", nn_settings.nnClassificationPath},
23-
{"device", nn_settings.nnInferenceDevice},
24-
{"device-id", std::to_string(nn_settings.nnInferenceDeviceId)},
25-
{"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)},
26-
{"dtype", nn_settings.nnInferenceDtype},
27-
{"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)},
28-
{"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)},
29-
{"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)},
30-
{"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath},
31-
{"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}};
24+
{"device", nn_settings.nnInferenceDevice},
25+
{"device-id", std::to_string(nn_settings.nnInferenceDeviceId)},
26+
{"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)},
27+
{"dtype", nn_settings.nnInferenceDtype},
28+
{"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)},
29+
{"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)},
30+
{"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)},
31+
{"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath},
32+
{"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}};
3233
sector = clusterer.mISector;
3334

34-
3535
model_class.init(OrtOptions);
3636
reg_model_paths = splitString(nn_settings.nnRegressionPath, ":");
3737

@@ -51,24 +51,26 @@ GPUTPCNNClusterizerInternals::GPUTPCNNClusterizerInternals(GPUSettingsProcessing
5151
}
5252
}
5353

54-
void* GPUTPCNNClusterizerInternals::setIOPointers(void* mem) {
55-
if (clusterer_internal->nnClusterizerDtype == 0){
56-
computePointerWithAlignment(mem, clusterer_internal->inputData16, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize);
57-
} else if (clusterer_internal->nnClusterizerDtype == 1){
58-
computePointerWithAlignment(mem, clusterer_internal->inputData32, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize);
54+
void* GPUTPCNNClusterizerInternals::setIOPointers(void* mem)
55+
{
56+
if (clusterer_internal->nnClusterizerDtype == 0) {
57+
computePointerWithAlignment(mem, clusterer_internal->inputData16, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize);
58+
} else if (clusterer_internal->nnClusterizerDtype == 1) {
59+
computePointerWithAlignment(mem, clusterer_internal->inputData32, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize);
5960
}
6061
computePointerWithAlignment(mem, clusterer_internal->outputDataClass, clusterer_internal->nnClusterizerCurrentSize);
6162
computePointerWithAlignment(mem, clusterer_internal->modelProbabilities, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelClassNumOutputNodes);
6263
computePointerWithAlignment(mem, clusterer_internal->outputDataReg1, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg1NumOutputNodes);
6364
computePointerWithAlignment(mem, clusterer_internal->outputDataReg2, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg2NumOutputNodes);
6465
computePointerWithAlignment(mem, clusterer_internal->peakPositions, clusterer_internal->nnClusterizerCurrentSize);
65-
computePointerWithAlignment(mem, clusterer_internal->clusterFlags, 2*clusterer_internal->nnClusterizerCurrentSize);
66+
computePointerWithAlignment(mem, clusterer_internal->clusterFlags, 2 * clusterer_internal->nnClusterizerCurrentSize);
6667
computePointerWithAlignment(mem, clusterer_internal->centralCharges, clusterer_internal->nnClusterizerCurrentSize);
6768

6869
return mem;
6970
}
7071

71-
void GPUTPCNNClusterizerInternals::RegisterMemoryAllocation() {
72+
void GPUTPCNNClusterizerInternals::RegisterMemoryAllocation()
73+
{
7274
AllocateAndInitializeLate();
7375
int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK;
7476
mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizerInternals::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(sector % mRec->GetProcessingSettings().nTPCClustererLanes)});

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ namespace o2::gpu
3030
class GPUTPCNNClusterizerInternals : public GPUProcessor
3131
{
3232
public:
33-
typedef GPUTPCClusterFinder processorType;
33+
typedef GPUTPCClusterFinder processorType;
3434
GPUTPCNNClusterizerInternals() = default;
3535
GPUTPCNNClusterizerInternals(GPUSettingsProcessing, processorType&);
3636
void* setIOPointers(void*);
@@ -42,20 +42,22 @@ class GPUTPCNNClusterizerInternals : public GPUProcessor
4242
std::unordered_map<std::string, std::string> OrtOptions;
4343
o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
4444
std::vector<std::string> reg_model_paths;
45+
4546
private:
46-
processorType* clusterer_internal;
47+
processorType* clusterer_internal;
4748
int sector = -1;
4849
int16_t mMemoryId = -1;
4950

5051
// Avoid including CommonUtils/StringUtils.h
51-
std::vector<std::string> splitString(const std::string& input, const std::string& delimiter) {
52+
std::vector<std::string> splitString(const std::string& input, const std::string& delimiter)
53+
{
5254
std::vector<std::string> tokens;
5355
std::size_t pos = 0;
5456
std::size_t found;
5557

5658
while ((found = input.find(delimiter, pos)) != std::string::npos) {
57-
tokens.push_back(input.substr(pos, found - pos));
58-
pos = found + delimiter.length();
59+
tokens.push_back(input.substr(pos, found - pos));
60+
pos = found + delimiter.length();
5961
}
6062
tokens.push_back(input.substr(pos));
6163

0 commit comments

Comments
 (0)