
Commit 4b0825a
1 parent 37955fa

Adding volatile memory allocation and MockedOrtAllocator. Removing print statements and time measurements.

8 files changed: +203 −45 lines


Common/ML/include/ML/OrtInterface.h
Lines changed: 7 additions & 3 deletions

@@ -30,6 +30,7 @@ namespace Ort
 {
 struct SessionOptions;
 struct MemoryInfo;
+struct Env;
 } // namespace Ort

 namespace o2
@@ -55,6 +56,7 @@ class OrtModel
   // General purpose
   void initOptions(std::unordered_map<std::string, std::string> optionsMap);
   void initEnvironment();
+  void initSession();
   void memoryOnDevice(int32_t = 0);
   bool isInitialized() { return mInitialized; }
   void resetSession();
@@ -64,8 +66,9 @@ class OrtModel
   std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
   std::vector<std::string> getInputNames() const { return mInputNames; }
   std::vector<std::string> getOutputNames() const { return mOutputNames; }
-  Ort::SessionOptions& getSessionOptions();
-  Ort::MemoryInfo& getMemoryInfo();
+  Ort::SessionOptions* getSessionOptions();
+  Ort::MemoryInfo* getMemoryInfo();
+  Ort::Env* getEnv();
   int32_t getIntraOpNumThreads() const { return intraOpNumThreads; }
   int32_t getInterOpNumThreads() const { return interOpNumThreads; }

@@ -85,6 +88,7 @@ class OrtModel
       interOpNumThreads = threads;
     }
   }
+  void setEnv(Ort::Env*);

   // Conversion
   template <class I, class O>
@@ -103,7 +107,7 @@ class OrtModel
   template <class I, class O>
   void inference(I**, int64_t, O*);

-  void release();
+  void release(bool = false);

  private:
   // ORT variables -> need to be hidden as pImpl
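
The switch from reference getters to pointer getters, plus the new setEnv/getEnv pair, lets several OrtModel instances share one Ort::Env. A minimal usage sketch, assuming the class sits in an o2::ml namespace (only "namespace o2" is visible in the hunk); the same pattern appears, commented out, in GPUChainTrackingClusterizer.cxx below:

#include "ML/OrtInterface.h"

// Hypothetical helper, not part of this commit: share one Ort::Env between models.
void initModels(o2::ml::OrtModel& classifier, o2::ml::OrtModel& regressor)
{
  classifier.initEnvironment();          // creates the Ort::Env once
  regressor.setEnv(classifier.getEnv()); // second model reuses the same environment
  classifier.initSession();              // session creation is now a separate step,
  regressor.initSession();               // leaving room to register allocators in between
}

One caveat: setEnv wraps the raw pointer in a fresh std::shared_ptr (see OrtInterface.cxx below), so two models holding the same Ort::Env keep independent reference counts and a double delete is possible; an aliasing shared_ptr or a non-owning scheme would avoid that.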

Common/ML/src/OrtInterface.cxx
Lines changed: 32 additions & 8 deletions

@@ -99,9 +99,6 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM

 void OrtModel::initEnvironment()
 {
-  if (allocateDeviceMemory) {
-    memoryOnDevice(deviceId);
-  }
   pImplOrt->env = std::make_shared<Ort::Env>(
     OrtLoggingLevel(loggingLevel),
     (envName.empty() ? "ORT" : envName.c_str()),
@@ -123,6 +120,13 @@ void OrtModel::initEnvironment()
     },
     (void*)3);
   (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events
+}
+
+void OrtModel::initSession()
+{
+  if (allocateDeviceMemory) {
+    memoryOnDevice(deviceId);
+  }
   pImplOrt->session = std::make_shared<Ort::Session>(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions);
   pImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*pImplOrt->session);

@@ -138,6 +142,13 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
 #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
   if (deviceIndex >= 0) {
     (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1");
+    (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+    (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
+
+    // Arena memory shrinkage comes at performance cost
+    /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
+    // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
+
     std::string dev_mem_str = "";
     if (deviceType == "ROCM") {
       dev_mem_str = "Hip";
@@ -159,14 +170,19 @@ void OrtModel::resetSession()
 }

 // Getters
-Ort::SessionOptions& OrtModel::getSessionOptions()
+Ort::SessionOptions* OrtModel::getSessionOptions()
+{
+  return &pImplOrt->sessionOptions;
+}
+
+Ort::MemoryInfo* OrtModel::getMemoryInfo()
 {
-  return pImplOrt->sessionOptions;
+  return &pImplOrt->memoryInfo;
 }

-Ort::MemoryInfo& OrtModel::getMemoryInfo()
+Ort::Env* OrtModel::getEnv()
 {
-  return pImplOrt->memoryInfo;
+  return (pImplOrt->env).get();
 }

 template <class I, class O>
@@ -234,6 +250,11 @@ void OrtModel::setIO()
   }
 }

+void OrtModel::setEnv(Ort::Env* env)
+{
+  pImplOrt->env = std::shared_ptr<Ort::Env>(env);
+}
+
 // Inference
 template <class I, class O>
 std::vector<O> OrtModel::inference(std::vector<I>& input)
@@ -404,8 +425,11 @@ template std::vector<float> OrtModel::inference<float, float>(std::vector<std::v
 template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<std::vector<OrtDataType::Float16_t>>&);

 // Release session
-void OrtModel::release()
+void OrtModel::release(bool profilingEnabled)
 {
+  // if (profilingEnabled) {
+  //   pImplOrt->session->EndProfiling();
+  // }
   LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes";
 }
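
The MockedOrtAllocator named in the commit message is not part of this diff (per the comment above, the volatile allocation lives in GPUTPCNNClusterizerHost.cxx). As a rough sketch of the mechanism that "session.use_env_allocators" is meant to engage, modeled on the MockedOrtAllocator in onnxruntime's own test suite; all names and the registration helper here are illustrative, not the commit's code:

#include <onnxruntime_cxx_api.h>
#include <atomic>
#include <cstdlib>

// A plain OrtAllocator whose C function pointers forward to member functions,
// following the MockedOrtAllocator pattern from the onnxruntime tests.
struct MockedOrtAllocator : OrtAllocator {
  MockedOrtAllocator()
  {
    OrtAllocator::version = ORT_API_VERSION;
    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
    OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
    OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
    Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &memInfo));
  }
  ~MockedOrtAllocator() { Ort::GetApi().ReleaseMemoryInfo(memInfo); }

  void* Alloc(size_t size)
  {
    allocated += size;        // bookkeeping makes it visible whether ORT actually
    return std::malloc(size); // routes its allocations through the env allocator
  }
  void Free(void* p) { std::free(p); }
  const OrtMemoryInfo* Info() const { return memInfo; }

  std::atomic<size_t> allocated{0};
  OrtMemoryInfo* memInfo{nullptr};
};

// Once registered on the Ort::Env, sessions created with
// "session.use_env_allocators" = "1" can look this allocator up.
void registerEnvAllocator(Ort::Env& env)
{
  static MockedOrtAllocator allocator; // must outlive the env and its sessions
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator(env, &allocator));
}

Per the in-code comment, this path is "not working yet": ONNX Runtime still allocates its own memory at session-init time, which is presumably why the volatileOrtAllocator calls in GPUChainTrackingClusterizer.cxx below remain commented out.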

GPU/GPUTracking/Base/GPUReconstructionProcessing.h
Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ class GPUReconstructionProcessing : public GPUReconstruction
   void AddGPUEvents(T*& events);

   virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() override;
-  // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {}

   struct RecoStepTimerMeta {
     HighResTimer timerToGPU;

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Lines changed: 1 addition & 1 deletion

@@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options
   // api.GetCurrentGpuDeviceId(deviceId);
   OrtROCMProviderOptions rocm_options;
   rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
-  rocm_options.arena_extend_strategy = 0;
+  rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
   rocm_options.user_compute_stream = mInternals->Streams[stream];
   session_options.AppendExecutionProvider_ROCM(rocm_options);
 #endif // ORT_ROCM_BUILD
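
This commit touches only the ROCM branch of the file. For comparison, a hedged sketch of what the equivalent CUDA-side setup would look like (assumed, not shown in this diff), using the documented OrtCUDAProviderOptions fields:

#include <onnxruntime_cxx_api.h>

// Hypothetical CUDA counterpart of the ROCM setup above: same user-stream
// hand-off and the same arena strategy choice.
void appendCudaProvider(Ort::SessionOptions& sessionOptions, int32_t deviceId, void* userStream)
{
  OrtCUDAProviderOptions cudaOptions;
  cudaOptions.device_id = deviceId;
  cudaOptions.has_user_compute_stream = 1; // pass the reconstruction's stream instead of letting ORT create one
  cudaOptions.user_compute_stream = userStream;
  cudaOptions.arena_extend_strategy = 0;   // 0 = kNextPowerOfTwo, 1 = kSameAsRequested
  sessionOptions.AppendExecutionProvider_CUDA(cudaOptions);
}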

GPU/GPUTracking/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -336,6 +336,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                        O2::DetectorsRaw
                        O2::Steer
                        O2::ML
+                       PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime
                        PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS}
                        SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H})

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Lines changed: 55 additions & 30 deletions

@@ -42,6 +42,7 @@
 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"
 #include "GPUTPCNNClusterizerHost.h"
+// #include "ML/3rdparty/GPUORTFloat16.h"
 #endif

 using namespace o2::gpu;
@@ -630,31 +631,39 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
     nnApplications[lane].init(nn_settings);
     if (nnApplications[lane].modelsUsed[0]) {
-      SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_class).setDeviceId(deviceId);
       if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
       }
       (nnApplications[lane].model_class).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
+      (nnApplications[lane].model_class).initSession();
     }
     if (nnApplications[lane].modelsUsed[1]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_reg_1).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
       }
+      // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
       (nnApplications[lane].model_reg_1).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
+      (nnApplications[lane].model_reg_1).initSession();
     }
     if (nnApplications[lane].modelsUsed[2]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
       }
+      // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
       (nnApplications[lane].model_reg_2).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
+      (nnApplications[lane].model_reg_2).initSession();
     }
     if (nn_settings.nnClusterizerVerbosity < 3) {
-      LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId;
+      LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
     }
   });
   mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) {
@@ -957,9 +966,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)

   auto start0 = std::chrono::high_resolution_clock::now();
   runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data
-  auto stop0 = std::chrono::high_resolution_clock::now();
+  // auto stop0 = std::chrono::high_resolution_clock::now();

-  auto start1 = std::chrono::high_resolution_clock::now();
+  // auto start1 = std::chrono::high_resolution_clock::now();

   // NN evaluations
   if (clustererNNShadow.nnInferenceInputDType == 0) {
@@ -1006,7 +1015,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
   }

-  auto stopNNs = std::chrono::high_resolution_clock::now();
+  // auto stopNNs = std::chrono::high_resolution_clock::now();

   // Publishing kernels
   if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
@@ -1020,25 +1029,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
     }
   }
-  auto stop1 = std::chrono::high_resolution_clock::now();

-  time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
-  time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-  time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
-}
-if (clustererNNShadow.nnClusterizerUseCfRegression) {
-  auto start1 = std::chrono::high_resolution_clock::now();
-  runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
-  auto stop1 = std::chrono::high_resolution_clock::now();
-  time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-}
-if (clustererNNShadow.nnClusterizerVerbosity < 3) {
-  int acceptedClusters = 0;
-  for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
-    acceptedClusters += clustererNNShadow.outputDataClass[i];
-  }
-  LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+  // for(int i = 0; i < iSize; ++i) {
+  //   if(clustererNNShadow.outputDataClass[i + batchStart] > 1) {
+  //     LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data.";
+  //     // std::string input = "[";
+  //     // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){
+  //     //   input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", ";
+  //     // }
+  //     // input += "]";
+  //     // LOG(info) << "Input is: " << input;
+  //   }
+  // }
+
+  // auto stop1 = std::chrono::high_resolution_clock::now();
+
+  // time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
+  // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+  // time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
 }
+// if (clustererNNShadow.nnClusterizerUseCfRegression) {
+//   auto start1 = std::chrono::high_resolution_clock::now();
+//   runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+//   auto stop1 = std::chrono::high_resolution_clock::now();
+//   time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+// }
+// if (clustererNNShadow.nnClusterizerVerbosity < 3) {
+//   int acceptedClusters = 0;
+//   for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
+//     if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) {
+//       LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters;
+//     }
+//     acceptedClusters += clustererNNShadow.outputDataClass[i];
+//   }
+//   LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+// }
 #else
   GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
@@ -1139,12 +1164,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
   }
   for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
-    if (GetProcessingSettings().nn.applyNNclusterizer) {
-      GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
-      nnApplication.model_class.release();
-      nnApplication.model_reg_1.release();
-      nnApplication.model_reg_2.release();
-    }
+    // if (GetProcessingSettings().nn.applyNNclusterizer) {
+    //   GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+    //   nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    //   nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    //   nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    // }
     if (transferRunning[i]) {
       ReleaseEvent(mEvents->stream[i], doGPU);
     }
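
The lane loop above repeats the same five-step sequence for each of the three models. Condensed into a sketch (a hypothetical helper, not code from this commit; SetONNXGPUStream is passed in as a callable so the fragment stays self-contained):

#include <cstdint>

// Per-model init order introduced by this commit, extracted for readability.
template <typename Model, typename SetStreamFn>
void initModelOnLane(Model& model, uint32_t lane, int32_t& deviceId, int32_t maxThreads, SetStreamFn&& setStream)
{
  setStream(*model.getSessionOptions(), lane, &deviceId); // getters return pointers now, hence the dereference
  model.setDeviceId(deviceId);
  if (model.getIntraOpNumThreads() > maxThreads) {
    model.setIntraOpNumThreads(maxThreads);
  }
  model.initEnvironment(); // environment creation is now split from...
  model.initSession();     // ...session creation, the gap where a custom allocator could be registered
}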
