Skip to content

Commit 0c5140e

Browse files
NN clustering: VRAM memory leak fix + (u)int -> (u)int32_t (#14272)
* VRAM memory leak fix + (u)int -> (u)int32_t * Please consider the following formatting changes * Fixing my own debug messages * Making shared pointer for releasing * Bug-fix * Adding David's patch --------- Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 35e208b commit 0c5140e

File tree

7 files changed

+128
-101
lines changed

7 files changed

+128
-101
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,10 @@ class OrtModel
4545

4646
public:
4747
// Constructors & destructors
48-
OrtModel() = default;
49-
OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
50-
void init(std::unordered_map<std::string, std::string> optionsMap)
51-
{
52-
initOptions(optionsMap);
53-
initEnvironment();
54-
}
55-
virtual ~OrtModel() = default;
48+
OrtModel();
49+
OrtModel(std::unordered_map<std::string, std::string> optionsMap);
50+
void init(std::unordered_map<std::string, std::string> optionsMap);
51+
virtual ~OrtModel();
5652

5753
// General purpose
5854
void initOptions(std::unordered_map<std::string, std::string> optionsMap);
@@ -113,7 +109,7 @@ class OrtModel
113109
private:
114110
// ORT variables -> need to be hidden as pImpl
115111
struct OrtVariables;
116-
OrtVariables* mPImplOrt;
112+
std::unique_ptr<OrtVariables> mPImplOrt;
117113

118114
// Input & Output specifications of the loaded network
119115
std::vector<const char*> mInputNamesChar, mOutputNamesChar;

Common/ML/src/OrtInterface.cxx

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,20 @@ namespace o2
2727
namespace ml
2828
{
2929

30+
OrtModel::OrtModel() = default;
31+
OrtModel::OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
32+
OrtModel::~OrtModel() = default;
33+
void OrtModel::init(std::unordered_map<std::string, std::string> optionsMap)
34+
{
35+
initOptions(optionsMap);
36+
initEnvironment();
37+
}
38+
3039
struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file
3140
// ORT runtime objects
3241
Ort::RunOptions runOptions;
33-
std::shared_ptr<Ort::Env> env = nullptr;
34-
std::shared_ptr<Ort::Session> session = nullptr; ///< ONNX session
42+
std::unique_ptr<Ort::Env> env = nullptr;
43+
std::unique_ptr<Ort::Session> session = nullptr; ///< ONNX session
3544
Ort::SessionOptions sessionOptions;
3645
Ort::AllocatorWithDefaultOptions allocator;
3746
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
@@ -41,7 +50,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c
4150
// General purpose
4251
void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
4352
{
44-
mPImplOrt = new OrtVariables();
53+
mPImplOrt = std::make_unique<OrtVariables>();
4554

4655
// Load from options map
4756
if (!optionsMap.contains("model-path")) {
@@ -101,7 +110,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
101110

102111
void OrtModel::initEnvironment()
103112
{
104-
mPImplOrt->env = std::make_shared<Ort::Env>(
113+
mPImplOrt->env = std::make_unique<Ort::Env>(
105114
OrtLoggingLevel(mLoggingLevel),
106115
(mEnvName.empty() ? "ORT" : mEnvName.c_str()),
107116
// Integrate ORT logging into Fairlogger
@@ -129,7 +138,7 @@ void OrtModel::initSession()
129138
if (mAllocateDeviceMemory) {
130139
memoryOnDevice(mDeviceId);
131140
}
132-
mPImplOrt->session = std::make_shared<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
141+
mPImplOrt->session = std::make_unique<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
133142
mPImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*mPImplOrt->session);
134143

135144
setIO();
@@ -147,12 +156,12 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
147156
(mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
148157
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
149158
// Arena memory shrinkage comes at performance cost
150-
/// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
151-
// (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
159+
// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
160+
(mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
152161

153162
std::string dev_mem_str = "";
154163
if (mDeviceType == "ROCM") {
155-
dev_mem_str = "Hip";
164+
dev_mem_str = "HipPinned";
156165
}
157166
if (mDeviceType == "CUDA") {
158167
dev_mem_str = "Cuda";
@@ -166,7 +175,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
166175

167176
void OrtModel::resetSession()
168177
{
169-
mPImplOrt->session = std::make_shared<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
178+
mPImplOrt->session = std::make_unique<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
170179
}
171180

172181
// Getters
@@ -252,7 +261,7 @@ void OrtModel::setIO()
252261

253262
void OrtModel::setEnv(Ort::Env* env)
254263
{
255-
mPImplOrt->env = std::shared_ptr<Ort::Env>(env);
264+
mPImplOrt->env.reset(env);
256265
}
257266

258267
// Inference
@@ -308,6 +317,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output)
308317
(mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);
309318

310319
(mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding);
320+
// mPImplOrt->session->Run(
321+
// mPImplOrt->runOptions,
322+
// mInputNamesChar.data(),
323+
// &inputTensor,
324+
// mInputNamesChar.size(),
325+
// mOutputNamesChar.data(),
326+
// &outputTensor,
327+
// mOutputNamesChar.size());
311328
}
312329

313330
template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*);
@@ -427,10 +444,7 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Fl
427444
// Release session
428445
void OrtModel::release(bool profilingEnabled)
429446
{
430-
// if (profilingEnabled) {
431-
// mPImplOrt->session->EndProfiling();
432-
// }
433-
LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) << " bytes";
447+
mPImplOrt.reset();
434448
}
435449

436450
// private

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
659659
// But environment must be valid, so we init the model environment first and use it here afterwards.
660660
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
661661
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
662-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
662+
// if (lane == 0) {
663+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
664+
// }
663665
// recreateMemoryAllocator = true;
664666
(nnApplications[lane].mModelClass).initSession();
665667
}
@@ -671,7 +673,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
671673
}
672674
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
673675
(nnApplications[lane].mModelReg1).initEnvironment();
674-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
676+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
675677
(nnApplications[lane].mModelReg1).initSession();
676678
}
677679
if (nnApplications[lane].mModelsUsed[2]) {
@@ -680,8 +682,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
680682
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
681683
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
682684
}
685+
// (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
683686
(nnApplications[lane].mModelReg2).initEnvironment();
684-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
687+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
685688
(nnApplications[lane].mModelReg2).initSession();
686689
}
687690
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -707,8 +710,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
707710
if (doGPU) {
708711
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
709712
}
710-
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
711-
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
712713
}
713714
#endif
714715

@@ -976,6 +977,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
976977
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
977978
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
978979

980+
// // bool recreateMemoryAllocator = false;
981+
// if (lane == 0) {
982+
// (nnApplications[lane].mModelClass).initEnvironment();
983+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
984+
// }
985+
// // recreateMemoryAllocator = true;
986+
// (nnApplications[lane].mModelClass).initSession();
987+
// (nnApplications[lane].mModelReg1).initSession();
988+
979989
int withMC = (doGPU && propagateMCLabels);
980990

981991
if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1188,12 +1198,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11881198
}
11891199
}
11901200
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1191-
// if (GetProcessingSettings().nn.applyNNclusterizer) {
1192-
// GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1193-
// nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1194-
// nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1195-
// nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1196-
// }
1201+
if (GetProcessingSettings().nn.applyNNclusterizer) {
1202+
LOG(info) << "(ORT) Environment releasing...";
1203+
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1204+
nnApplication.mModelClass.release(true);
1205+
nnApplication.mModelReg1.release(true);
1206+
nnApplication.mModelReg2.release(true);
1207+
}
11971208
if (transferRunning[i]) {
11981209
ReleaseEvent(mEvents->stream[i], doGPU);
11991210
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
136136
std::atomic<size_t> memory_inuse{0};
137137
std::atomic<size_t> num_allocations{0};
138138
std::atomic<size_t> num_reserve_allocations{0};
139-
OrtMemoryInfo* memory_info;
140-
GPUReconstruction* rec;
139+
OrtMemoryInfo* mMemoryInfoInternal;
140+
GPUReconstruction* mRecInternal;
141141
};
142142

143143
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
147147
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
148148
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
149149
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
150-
rec = r;
151-
memory_info = info;
150+
mRecInternal = r;
151+
mMemoryInfoInternal = info;
152152
}
153153

154154
MockedOrtAllocator::~MockedOrtAllocator()
155155
{
156-
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
156+
// Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
157157
(void)0; // Suppress warning for empty destructor
158158
}
159159

160160
void* MockedOrtAllocator::Alloc(size_t size)
161161
{
162-
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
163-
return rec->AllocateVolatileDeviceMemory(size);
162+
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
163+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
164164
}
165165

166166
void* MockedOrtAllocator::Reserve(size_t size)
167167
{
168-
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
169-
return rec->AllocateVolatileDeviceMemory(size);
168+
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
169+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
170170
}
171171

172172
void MockedOrtAllocator::Free(void* p)
173173
{
174174
// LOG(info) << "(ORT) Freeing volatile memory " << p;
175-
rec->ReturnVolatileDeviceMemory();
176175
}
177176

178177
const OrtMemoryInfo* MockedOrtAllocator::Info() const
179178
{
180-
return memory_info;
179+
return mMemoryInfoInternal;
181180
}
182181

183182
size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
197196
}
198197
}
199198

200-
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
199+
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
201200
{
202201
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
203202
if (recreate) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
5353
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5454

5555
// ONNX
56-
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
56+
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
5757
MockedOrtAllocator* getMockedAllocator();
5858
const OrtMemoryInfo* getMockedMemoryInfo();
5959

0 commit comments

Comments
 (0)