Skip to content

Commit 938a1ed

Browse files
committed
Adjust for comments
1 parent a67b634 commit 938a1ed

File tree

10 files changed

+76
-78
lines changed

10 files changed

+76
-78
lines changed

Common/ML/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ o2_add_library(ML
1616

1717
# Pass ORT variables as a preprocessor definition
1818
target_compile_definitions(${targetName} PRIVATE
19-
ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>
20-
ORT_CUDA_BUILD=$<BOOL:${ORT_CUDA_BUILD}>
21-
ORT_MIGRAPHX_BUILD=$<BOOL:${ORT_MIGRAPHX_BUILD}>
22-
ORT_TENSORRT_BUILD=$<BOOL:${ORT_TENSORRT_BUILD}>)
19+
$<$<BOOL:${ORT_ROCM_BUILD}>:ORT_ROCM_BUILD>
20+
$<$<BOOL:${ORT_CUDA_BUILD}>:ORT_CUDA_BUILD>
21+
$<$<BOOL:${ORT_MIGRAPHX_BUILD}>:ORT_MIGRAPHX_BUILD>
22+
$<$<BOOL:${ORT_TENSORRT_BUILD}>:ORT_TENSORRT_BUILD>)

Common/ML/src/OrtInterface.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
144144
(pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1");
145145
(pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
146146
(pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
147-
147+
(pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
148148
// Arena memory shrinkage comes at performance cost
149149
/// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
150150
// (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
@@ -158,7 +158,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
158158
}
159159
pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault);
160160
if (loggingLevel < 2) {
161-
LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex;
161+
LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt;
162162
}
163163
}
164164
#endif

GPU/GPUTracking/Base/cuda/CMakeLists.txt

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,9 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
122122
${CMAKE_CURRENT_SOURCE_DIR}
123123
TARGETVARNAME targetName)
124124

125-
message("Compile definitions for ONNX runtime (CUDA):")
126-
message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}")
127-
message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}")
128-
message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}")
129-
message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}")
130-
131125
target_compile_definitions(${targetName} PRIVATE
132126
GPUCA_HAS_ONNX=1
133-
ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>
134127
ORT_CUDA_BUILD=$<BOOL:${ORT_CUDA_BUILD}>
135-
ORT_MIGRAPHX_BUILD=$<BOOL:${ORT_MIGRAPHX_BUILD}>
136128
ORT_TENSORRT_BUILD=$<BOOL:${ORT_TENSORRT_BUILD}>)
137129

138130
install(FILES ${HDRS} DESTINATION include/GPU)

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,7 @@ void GPUReconstructionCUDA::endGPUProfiling()
655655

656656
void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
657657
{
658-
#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1
658+
#ifdef ORT_CUDA_BUILD
659659
cudaGetDevice(deviceId);
660660
OrtCUDAProviderOptionsV2* cuda_options = nullptr;
661661
CreateCUDAProviderOptions(&cuda_options);
@@ -684,14 +684,15 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr)
684684

685685
void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId)
686686
{
687-
#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1
687+
#ifdef ORT_ROCM_BUILD
688688
// Create ROCm provider options
689689
cudaGetDevice(deviceId);
690690
// const auto& api = Ort::GetApi();
691691
// api.GetCurrentGpuDeviceId(deviceId);
692692
OrtROCMProviderOptions rocm_options;
693693
rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
694694
rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
695+
// rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit
695696
rocm_options.user_compute_stream = mInternals->Streams[stream];
696697
session_options.AppendExecutionProvider_ROCM(rocm_options);
697698
#endif // ORT_ROCM_BUILD

GPU/GPUTracking/Base/hip/CMakeLists.txt

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,18 +170,10 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
170170
${GPUCA_HIP_SOURCE_DIR}
171171
TARGETVARNAME targetName)
172172

173-
message("Compile definitions for ONNX runtime (HIP / ROCM):")
174-
message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}")
175-
message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}")
176-
message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}")
177-
message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}")
178-
179173
target_compile_definitions(${targetName} PRIVATE
180174
GPUCA_HAS_ONNX=1
181175
ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>
182-
ORT_CUDA_BUILD=$<BOOL:${ORT_CUDA_BUILD}>
183-
ORT_MIGRAPHX_BUILD=$<BOOL:${ORT_MIGRAPHX_BUILD}>
184-
ORT_TENSORRT_BUILD=$<BOOL:${ORT_TENSORRT_BUILD}>)
176+
ORT_MIGRAPHX_BUILD=$<BOOL:${ORT_MIGRAPHX_BUILD}>)
185177

186178
install(FILES ${HDRS} DESTINATION include/GPU)
187179

GPU/GPUTracking/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -349,8 +349,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
349349
${targetName}
350350
PRIVATE $<TARGET_PROPERTY:O2::Framework,INTERFACE_INCLUDE_DIRECTORIES>)
351351

352-
target_compile_definitions(${targetName} PRIVATE
353-
GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1)
352+
target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1)
354353

355354
o2_target_root_dictionary(${MODULE}
356355
HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL}

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
#ifdef GPUCA_HAS_ONNX
4343
#include "GPUTPCNNClusterizerKernels.h"
4444
#include "GPUTPCNNClusterizerHost.h"
45-
// #include "ML/3rdparty/GPUORTFloat16.h"
4645
#endif
4746

4847
using namespace o2::gpu;
@@ -628,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
628627
int32_t deviceId = -1;
629628
int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
630629
int32_t maxThreads = mRec->getNKernelHostThreads(true);
630+
// bool recreateMemoryAllocator = false;
631631
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
632632
nnApplications[lane].init(nn_settings);
633633
if (nnApplications[lane].modelsUsed[0]) {
@@ -637,7 +637,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
637637
nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
638638
}
639639
(nnApplications[lane].model_class).initEnvironment();
640-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
640+
// Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
641+
// But environment must be valid, so we init the model environment first and use it here afterwards.
642+
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
643+
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
644+
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
645+
// recreateMemoryAllocator = true;
641646
(nnApplications[lane].model_class).initSession();
642647
}
643648
if (nnApplications[lane].modelsUsed[1]) {
@@ -648,7 +653,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
648653
}
649654
// (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
650655
(nnApplications[lane].model_reg_1).initEnvironment();
651-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
656+
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator);
652657
(nnApplications[lane].model_reg_1).initSession();
653658
}
654659
if (nnApplications[lane].modelsUsed[2]) {
@@ -657,9 +662,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
657662
if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
658663
nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
659664
}
660-
// (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
661665
(nnApplications[lane].model_reg_2).initEnvironment();
662-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
666+
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
663667
(nnApplications[lane].model_reg_2).initSession();
664668
}
665669
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -685,6 +689,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
685689
if (doGPU) {
686690
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
687691
}
692+
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
693+
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
688694
}
689695
#endif
690696

@@ -966,8 +972,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
966972

967973
auto start0 = std::chrono::high_resolution_clock::now();
968974
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data
969-
// auto stop0 = std::chrono::high_resolution_clock::now();
970975

976+
// auto stop0 = std::chrono::high_resolution_clock::now();
971977
// auto start1 = std::chrono::high_resolution_clock::now();
972978

973979
// NN evaluations
@@ -1048,12 +1054,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10481054
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
10491055
// time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
10501056
}
1051-
// if (clustererNNShadow.nnClusterizerUseCfRegression) {
1052-
// auto start1 = std::chrono::high_resolution_clock::now();
1053-
// runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1054-
// auto stop1 = std::chrono::high_resolution_clock::now();
1055-
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1056-
// }
1057+
if (clustererNNShadow.nnClusterizerUseCfRegression) {
1058+
// auto start1 = std::chrono::high_resolution_clock::now();
1059+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1060+
// auto stop1 = std::chrono::high_resolution_clock::now();
1061+
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1062+
}
10571063
// if (clustererNNShadow.nnClusterizerVerbosity < 3) {
10581064
// int acceptedClusters = 0;
10591065
// for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
5454
{"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
5555
{"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
5656
{"profiling-output-path", settings.nnInferenceOrtProfilingPath},
57-
{"logging-level", std::to_string(settings.nnInferenceVerbosity)}};
57+
{"logging-level", std::to_string(settings.nnInferenceVerbosity)},
58+
{"onnx-environment-name", "c1"}};
5859

5960
model_class.initOptions(OrtOptions);
6061
modelsUsed[0] = true;
@@ -64,13 +65,16 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
6465
if (!settings.nnClusterizerUseCfRegression) {
6566
if (reg_model_paths.size() == 1) {
6667
OrtOptions["model-path"] = reg_model_paths[0];
68+
OrtOptions["onnx-environment-name"] = "r1";
6769
model_reg_1.initOptions(OrtOptions);
6870
modelsUsed[1] = true;
6971
} else {
7072
OrtOptions["model-path"] = reg_model_paths[0];
73+
OrtOptions["onnx-environment-name"] = "r1";
7174
model_reg_1.initOptions(OrtOptions);
7275
modelsUsed[1] = true;
7376
OrtOptions["model-path"] = reg_model_paths[1];
77+
OrtOptions["onnx-environment-name"] = "r2";
7478
model_reg_2.initOptions(OrtOptions);
7579
modelsUsed[2] = true;
7680
}
@@ -154,16 +158,19 @@ MockedOrtAllocator::~MockedOrtAllocator()
154158

155159
void* MockedOrtAllocator::Alloc(size_t size)
156160
{
161+
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
157162
return rec->AllocateVolatileDeviceMemory(size);
158163
}
159164

160165
void* MockedOrtAllocator::Reserve(size_t size)
161166
{
167+
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
162168
return rec->AllocateVolatileDeviceMemory(size);
163169
}
164170

165171
void MockedOrtAllocator::Free(void* p)
166172
{
173+
// LOG(info) << "(ORT) Freeing volatile memory " << p;
167174
rec->ReturnVolatileDeviceMemory();
168175
}
169176

@@ -188,21 +195,20 @@ void MockedOrtAllocator::LeakCheck()
188195
LOG(warning) << "memory leak!!!";
189196
}
190197

191-
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc)
198+
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
192199
{
193-
if (chooseMockedAlloc == 0) {
194-
mockedAlloc_class = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)memInfo);
195-
Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()));
196-
LOG(info) << "(ORT) Mocked ORT allocator for classification network registered";
197-
} else if (chooseMockedAlloc == 1) {
198-
mockedAlloc_reg_1 = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)memInfo);
199-
Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get()));
200-
LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered";
201-
} else if (chooseMockedAlloc == 2) {
202-
mockedAlloc_reg_2 = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)memInfo);
203-
Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get()));
204-
LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered";
205-
} else {
206-
LOG(fatal) << "Invalid choice for mocked allocator";
200+
mockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
201+
if (recreate) {
202+
Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo)));
207203
}
204+
Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get()));
205+
memInfo = (Ort::MemoryInfo*)mockedAlloc->Info();
206+
}
207+
208+
const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() {
209+
return mockedAlloc->Info();
210+
}
211+
212+
MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() {
213+
return mockedAlloc.get();
208214
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222

2323
using namespace o2::ml;
2424

25-
struct OrtAllocator;
26-
struct OrtMemoryInfo;
25+
class OrtMemoryInfo;
26+
class OrtAllocator;
2727
struct MockedOrtAllocator;
2828
namespace Ort
2929
{
@@ -53,15 +53,17 @@ class GPUTPCNNClusterizerHost
5353
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5454

5555
// ONNX
56-
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, int32_t = 0);
56+
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
57+
MockedOrtAllocator* getMockedAllocator();
58+
const OrtMemoryInfo* getMockedMemoryInfo();
5759

5860
std::unordered_map<std::string, std::string> OrtOptions;
5961
o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
6062
std::vector<bool> modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2
6163
int32_t deviceId = -1;
6264
std::vector<std::string> reg_model_paths;
6365

64-
std::shared_ptr<MockedOrtAllocator> mockedAlloc_class = nullptr, mockedAlloc_reg_1 = nullptr, mockedAlloc_reg_2 = nullptr;
66+
std::shared_ptr<MockedOrtAllocator> mockedAlloc = nullptr;
6567
}; // class GPUTPCNNClusterizerHost
6668

6769
} // namespace o2::gpu

0 commit comments

Comments
 (0)