Skip to content

Commit 4a7cd21

Browse files
committed
VRAM memory leak fix + (u)int -> (u)int32_t
1 parent 3eadf36 commit 4a7cd21

File tree

5 files changed

+111
-80
lines changed

5 files changed

+111
-80
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
658658
// But environment must be valid, so we init the model environment first and use it here afterwards.
659659
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
660660
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
661-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
661+
// if (lane == 0) {
662+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
663+
// }
662664
// recreateMemoryAllocator = true;
663665
(nnApplications[lane].mModelClass).initSession();
664666
}
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
670672
}
671673
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
672674
(nnApplications[lane].mModelReg1).initEnvironment();
673-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
675+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
674676
(nnApplications[lane].mModelReg1).initSession();
675677
}
676678
if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
679681
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
680682
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
681683
}
684+
// (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
682685
(nnApplications[lane].mModelReg2).initEnvironment();
683-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
686+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
684687
(nnApplications[lane].mModelReg2).initSession();
685688
}
686689
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706709
if (doGPU) {
707710
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
708711
}
709-
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
710-
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
711712
}
712713
#endif
713714

@@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
975976
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
976977
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
977978

979+
// // bool recreateMemoryAllocator = false;
980+
// if (lane == 0) {
981+
// (nnApplications[lane].mModelClass).initEnvironment();
982+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
983+
// }
984+
// // recreateMemoryAllocator = true;
985+
// (nnApplications[lane].mModelClass).initSession();
986+
// (nnApplications[lane].mModelReg1).initSession();
987+
978988
int withMC = (doGPU && propagateMCLabels);
979989

980990
if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11871197
}
11881198
}
11891199
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1190-
// if (GetProcessingSettings().nn.applyNNclusterizer) {
1191-
// GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1192-
// nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1193-
// nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1194-
// nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1195-
// }
1200+
if (GetProcessingSettings().nn.applyNNclusterizer) {
1201+
LOG(info) << "(ORT) Environment releasing...";
1202+
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1203+
nnApplication.mModelClass.release(true);
1204+
nnApplication.mModelReg1.release(true);
1205+
nnApplication.mModelReg2.release(true);
1206+
}
11961207
if (transferRunning[i]) {
11971208
ReleaseEvent(mEvents->stream[i], doGPU);
11981209
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
136136
std::atomic<size_t> memory_inuse{0};
137137
std::atomic<size_t> num_allocations{0};
138138
std::atomic<size_t> num_reserve_allocations{0};
139-
OrtMemoryInfo* memory_info;
140-
GPUReconstruction* rec;
139+
OrtMemoryInfo* mMemoryInfoInternal;
140+
GPUReconstruction* mRecInternal;
141141
};
142142

143143
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
147147
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
148148
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
149149
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
150-
rec = r;
151-
memory_info = info;
150+
mRecInternal = r;
151+
mMemoryInfoInternal = info;
152152
}
153153

154154
MockedOrtAllocator::~MockedOrtAllocator()
155155
{
156-
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
156+
// Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
157157
(void)0; // Suppress warning for empty destructor
158158
}
159159

160160
void* MockedOrtAllocator::Alloc(size_t size)
161161
{
162-
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
163-
return rec->AllocateVolatileDeviceMemory(size);
162+
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
163+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
164164
}
165165

166166
void* MockedOrtAllocator::Reserve(size_t size)
167167
{
168-
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
169-
return rec->AllocateVolatileDeviceMemory(size);
168+
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
169+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
170170
}
171171

172172
void MockedOrtAllocator::Free(void* p)
173173
{
174174
// LOG(info) << "(ORT) Freeing volatile memory " << p;
175-
rec->ReturnVolatileDeviceMemory();
176175
}
177176

178177
const OrtMemoryInfo* MockedOrtAllocator::Info() const
179178
{
180-
return memory_info;
179+
return mMemoryInfoInternal;
181180
}
182181

183182
size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
197196
}
198197
}
199198

200-
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
199+
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
201200
{
202201
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
203202
if (recreate) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
5353
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5454

5555
// ONNX
56-
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
56+
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
5757
MockedOrtAllocator* getMockedAllocator();
5858
const OrtMemoryInfo* getMockedMemoryInfo();
5959

0 commit comments

Comments (0)