Skip to content

Commit 0c5140e

Browse files
NN clustering: VRAM memory leak fix + (u)int -> (u)int32_t (#14272)
* VRAM memory leak fix + (u)int -> (u)int32_t * Please consider the following formatting changes * Fixing my own debug messages * Making shared pointer for releasing * Bug-fix * Adding David's patch --------- Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 35e208b commit 0c5140e

File tree

7 files changed

+128
-101
lines changed

7 files changed

+128
-101
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,10 @@ class OrtModel
4545

4646
public:
4747
// Constructors & destructors
48-
OrtModel() = default;
49-
OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
50-
void init(std::unordered_map<std::string, std::string> optionsMap)
51-
{
52-
initOptions(optionsMap);
53-
initEnvironment();
54-
}
55-
virtual ~OrtModel() = default;
48+
OrtModel();
49+
OrtModel(std::unordered_map<std::string, std::string> optionsMap);
50+
void init(std::unordered_map<std::string, std::string> optionsMap);
51+
virtual ~OrtModel();
5652

5753
// General purpose
5854
void initOptions(std::unordered_map<std::string, std::string> optionsMap);
@@ -113,7 +109,7 @@ class OrtModel
113109
private:
114110
// ORT variables -> need to be hidden as pImpl
115111
struct OrtVariables;
116-
OrtVariables* mPImplOrt;
112+
std::unique_ptr<OrtVariables> mPImplOrt;
117113

118114
// Input & Output specifications of the loaded network
119115
std::vector<const char*> mInputNamesChar, mOutputNamesChar;

Common/ML/src/OrtInterface.cxx

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,20 @@ namespace o2
2727
namespace ml
2828
{
2929

30+
OrtModel::OrtModel() = default;
31+
OrtModel::OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
32+
OrtModel::~OrtModel() = default;
33+
void OrtModel::init(std::unordered_map<std::string, std::string> optionsMap)
34+
{
35+
initOptions(optionsMap);
36+
initEnvironment();
37+
}
38+
3039
struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file
3140
// ORT runtime objects
3241
Ort::RunOptions runOptions;
33-
std::shared_ptr<Ort::Env> env = nullptr;
34-
std::shared_ptr<Ort::Session> session = nullptr; ///< ONNX session
42+
std::unique_ptr<Ort::Env> env = nullptr;
43+
std::unique_ptr<Ort::Session> session = nullptr; ///< ONNX session
3544
Ort::SessionOptions sessionOptions;
3645
Ort::AllocatorWithDefaultOptions allocator;
3746
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
@@ -41,7 +50,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c
4150
// General purpose
4251
void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
4352
{
44-
mPImplOrt = new OrtVariables();
53+
mPImplOrt = std::make_unique<OrtVariables>();
4554

4655
// Load from options map
4756
if (!optionsMap.contains("model-path")) {
@@ -101,7 +110,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
101110

102111
void OrtModel::initEnvironment()
103112
{
104-
mPImplOrt->env = std::make_shared<Ort::Env>(
113+
mPImplOrt->env = std::make_unique<Ort::Env>(
105114
OrtLoggingLevel(mLoggingLevel),
106115
(mEnvName.empty() ? "ORT" : mEnvName.c_str()),
107116
// Integrate ORT logging into Fairlogger
@@ -129,7 +138,7 @@ void OrtModel::initSession()
129138
if (mAllocateDeviceMemory) {
130139
memoryOnDevice(mDeviceId);
131140
}
132-
mPImplOrt->session = std::make_shared<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
141+
mPImplOrt->session = std::make_unique<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
133142
mPImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*mPImplOrt->session);
134143

135144
setIO();
@@ -147,12 +156,12 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
147156
(mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
148157
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
149158
// Arena memory shrinkage comes at performance cost
150-
/// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
151-
// (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
159+
// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
160+
(mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
152161

153162
std::string dev_mem_str = "";
154163
if (mDeviceType == "ROCM") {
155-
dev_mem_str = "Hip";
164+
dev_mem_str = "HipPinned";
156165
}
157166
if (mDeviceType == "CUDA") {
158167
dev_mem_str = "Cuda";
@@ -166,7 +175,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
166175

167176
void OrtModel::resetSession()
168177
{
169-
mPImplOrt->session = std::make_shared<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
178+
mPImplOrt->session = std::make_unique<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
170179
}
171180

172181
// Getters
@@ -252,7 +261,7 @@ void OrtModel::setIO()
252261

253262
void OrtModel::setEnv(Ort::Env* env)
254263
{
255-
mPImplOrt->env = std::shared_ptr<Ort::Env>(env);
264+
mPImplOrt->env.reset(env);
256265
}
257266

258267
// Inference
@@ -308,6 +317,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output)
308317
(mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);
309318

310319
(mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding);
320+
// mPImplOrt->session->Run(
321+
// mPImplOrt->runOptions,
322+
// mInputNamesChar.data(),
323+
// &inputTensor,
324+
// mInputNamesChar.size(),
325+
// mOutputNamesChar.data(),
326+
// &outputTensor,
327+
// mOutputNamesChar.size());
311328
}
312329

313330
template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*);
@@ -427,10 +444,7 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Fl
427444
// Release session
428445
void OrtModel::release(bool profilingEnabled)
429446
{
430-
// if (profilingEnabled) {
431-
// mPImplOrt->session->EndProfiling();
432-
// }
433-
LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) << " bytes";
447+
mPImplOrt.reset();
434448
}
435449

436450
// private

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
659659
// But environment must be valid, so we init the model environment first and use it here afterwards.
660660
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
661661
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
662-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
662+
// if (lane == 0) {
663+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
664+
// }
663665
// recreateMemoryAllocator = true;
664666
(nnApplications[lane].mModelClass).initSession();
665667
}
@@ -671,7 +673,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
671673
}
672674
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
673675
(nnApplications[lane].mModelReg1).initEnvironment();
674-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
676+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
675677
(nnApplications[lane].mModelReg1).initSession();
676678
}
677679
if (nnApplications[lane].mModelsUsed[2]) {
@@ -680,8 +682,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
680682
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
681683
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
682684
}
685+
// (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
683686
(nnApplications[lane].mModelReg2).initEnvironment();
684-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
687+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
685688
(nnApplications[lane].mModelReg2).initSession();
686689
}
687690
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -707,8 +710,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
707710
if (doGPU) {
708711
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
709712
}
710-
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
711-
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
712713
}
713714
#endif
714715

@@ -976,6 +977,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
976977
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
977978
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
978979

980+
// // bool recreateMemoryAllocator = false;
981+
// if (lane == 0) {
982+
// (nnApplications[lane].mModelClass).initEnvironment();
983+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
984+
// }
985+
// // recreateMemoryAllocator = true;
986+
// (nnApplications[lane].mModelClass).initSession();
987+
// (nnApplications[lane].mModelReg1).initSession();
988+
979989
int withMC = (doGPU && propagateMCLabels);
980990

981991
if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1188,12 +1198,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11881198
}
11891199
}
11901200
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1191-
// if (GetProcessingSettings().nn.applyNNclusterizer) {
1192-
// GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1193-
// nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1194-
// nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1195-
// nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1196-
// }
1201+
if (GetProcessingSettings().nn.applyNNclusterizer) {
1202+
LOG(info) << "(ORT) Environment releasing...";
1203+
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1204+
nnApplication.mModelClass.release(true);
1205+
nnApplication.mModelReg1.release(true);
1206+
nnApplication.mModelReg2.release(true);
1207+
}
11971208
if (transferRunning[i]) {
11981209
ReleaseEvent(mEvents->stream[i], doGPU);
11991210
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
136136
std::atomic<size_t> memory_inuse{0};
137137
std::atomic<size_t> num_allocations{0};
138138
std::atomic<size_t> num_reserve_allocations{0};
139-
OrtMemoryInfo* memory_info;
140-
GPUReconstruction* rec;
139+
OrtMemoryInfo* mMemoryInfoInternal;
140+
GPUReconstruction* mRecInternal;
141141
};
142142

143143
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
147147
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
148148
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
149149
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
150-
rec = r;
151-
memory_info = info;
150+
mRecInternal = r;
151+
mMemoryInfoInternal = info;
152152
}
153153

154154
MockedOrtAllocator::~MockedOrtAllocator()
155155
{
156-
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
156+
// Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
157157
(void)0; // Suppress warning for empty destructor
158158
}
159159

160160
void* MockedOrtAllocator::Alloc(size_t size)
161161
{
162-
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
163-
return rec->AllocateVolatileDeviceMemory(size);
162+
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
163+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
164164
}
165165

166166
void* MockedOrtAllocator::Reserve(size_t size)
167167
{
168-
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
169-
return rec->AllocateVolatileDeviceMemory(size);
168+
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
169+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
170170
}
171171

172172
void MockedOrtAllocator::Free(void* p)
173173
{
174174
// LOG(info) << "(ORT) Freeing volatile memory " << p;
175-
rec->ReturnVolatileDeviceMemory();
176175
}
177176

178177
const OrtMemoryInfo* MockedOrtAllocator::Info() const
179178
{
180-
return memory_info;
179+
return mMemoryInfoInternal;
181180
}
182181

183182
size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
197196
}
198197
}
199198

200-
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
199+
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
201200
{
202201
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
203202
if (recreate) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
5353
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5454

5555
// ONNX
56-
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
56+
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
5757
MockedOrtAllocator* getMockedAllocator();
5858
const OrtMemoryInfo* getMockedMemoryInfo();
5959

0 commit comments

Comments
 (0)