Limiting threads for ONNX evaluation

ChSonnabend · ChSonnabend · commit 83d02579b0f1 · 2025-04-03T21:49:53.000+02:00
diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt
@@ -9,13 +9,14 @@
 # granted to it by virtue of its status as an Intergovernmental Organization
 # or submit itself to any jurisdiction.
 
-# Pass ORT variables as a preprocessor definition
-add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD})
-add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD})
-add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD})
-add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD})
-
 o2_add_library(ML
                SOURCES src/OrtInterface.cxx
                TARGETVARNAME targetName
                PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime)
+
+# Pass the ORT build flags as preprocessor definitions, each normalized to 0 or 1 via $<BOOL:...>
+target_compile_definitions(${targetName} PRIVATE
+    ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>
+    ORT_CUDA_BUILD=$<BOOL:${ORT_CUDA_BUILD}>
+    ORT_MIGRAPHX_BUILD=$<BOOL:${ORT_MIGRAPHX_BUILD}>
+    ORT_TENSORRT_BUILD=$<BOOL:${ORT_TENSORRT_BUILD}>)
diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
@@ -64,11 +64,15 @@ class OrtModel
   std::vector<std::string> getOutputNames() const { return mOutputNames; }
   Ort::SessionOptions& getSessionOptions();
   Ort::MemoryInfo& getMemoryInfo();
+  int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } // Returns the currently stored intraOpNumThreads value
+  int32_t getInterOpNumThreads() const { return interOpNumThreads; } // Returns the currently stored interOpNumThreads value
 
   // Setters
   void setDeviceId(int32_t id) { deviceId = id; }
   void setIO();
   void setActiveThreads(int threads) { intraOpNumThreads = threads; }
+  void setIntraOpNumThreads(int threads) { if(deviceType == "CPU") { intraOpNumThreads = threads; } } // Stores `threads` only when deviceType is "CPU"; the call is silently ignored for any other device type
+  void setInterOpNumThreads(int threads) { if(deviceType == "CPU") { interOpNumThreads = threads; } } // Stores `threads` only when deviceType is "CPU"; the call is silently ignored for any other device type
 
   // Conversion
   template <class I, class O>
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -627,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     uint32_t maxClusters = 0;
     int32_t deviceId = -1;
     int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
+    int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000;
     for (uint32_t lane = 0; lane < NSECTORS; lane++) {
       maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters);
     }
@@ -635,16 +636,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (nnApplications[lane].modelsUsed[0]) {
         SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
         (nnApplications[lane].model_class).setDeviceId(deviceId);
+        if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
+          nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
+        }
         (nnApplications[lane].model_class).initEnvironment();
       }
       if (nnApplications[lane].modelsUsed[1]) {
         SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
         (nnApplications[lane].model_reg_1).setDeviceId(deviceId);
+        if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
+          nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
+        }
         (nnApplications[lane].model_reg_1).initEnvironment();
       }
       if (nnApplications[lane].modelsUsed[2]) {
         SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
         (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
+        if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
+          nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
+        }
         (nnApplications[lane].model_reg_2).initEnvironment();
       }
       if (nn_settings.nnClusterizerVerbosity < 3) {