Skip to content

Commit 4a7cd21

Browse files
committed
VRAM memory leak fix + (u)int -> (u)int32_t
1 parent 3eadf36 commit 4a7cd21

File tree

5 files changed

+111
-80
lines changed

5 files changed

+111
-80
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
658658
// But environment must be valid, so we init the model environment first and use it here afterwards.
659659
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
660660
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
661-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
661+
// if (lane == 0) {
662+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
663+
// }
662664
// recreateMemoryAllocator = true;
663665
(nnApplications[lane].mModelClass).initSession();
664666
}
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
670672
}
671673
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
672674
(nnApplications[lane].mModelReg1).initEnvironment();
673-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
675+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
674676
(nnApplications[lane].mModelReg1).initSession();
675677
}
676678
if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
679681
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
680682
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
681683
}
684+
// (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
682685
(nnApplications[lane].mModelReg2).initEnvironment();
683-
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
686+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
684687
(nnApplications[lane].mModelReg2).initSession();
685688
}
686689
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706709
if (doGPU) {
707710
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
708711
}
709-
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
710-
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
711712
}
712713
#endif
713714

@@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
975976
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
976977
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
977978

979+
// // bool recreateMemoryAllocator = false;
980+
// if (lane == 0) {
981+
// (nnApplications[lane].mModelClass).initEnvironment();
982+
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
983+
// }
984+
// // recreateMemoryAllocator = true;
985+
// (nnApplications[lane].mModelClass).initSession();
986+
// (nnApplications[lane].mModelReg1).initSession();
987+
978988
int withMC = (doGPU && propagateMCLabels);
979989

980990
if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11871197
}
11881198
}
11891199
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
1190-
// if (GetProcessingSettings().nn.applyNNclusterizer) {
1191-
// GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1192-
// nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1193-
// nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1194-
// nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
1195-
// }
1200+
if (GetProcessingSettings().nn.applyNNclusterizer) {
1201+
LOG(info) << "(ORT) Environment releasing...";
1202+
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1203+
nnApplication.mModelClass.release(true);
1204+
nnApplication.mModelReg1.release(true);
1205+
nnApplication.mModelReg2.release(true);
1206+
}
11961207
if (transferRunning[i]) {
11971208
ReleaseEvent(mEvents->stream[i], doGPU);
11981209
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
136136
std::atomic<size_t> memory_inuse{0};
137137
std::atomic<size_t> num_allocations{0};
138138
std::atomic<size_t> num_reserve_allocations{0};
139-
OrtMemoryInfo* memory_info;
140-
GPUReconstruction* rec;
139+
OrtMemoryInfo* mMemoryInfoInternal;
140+
GPUReconstruction* mRecInternal;
141141
};
142142

143143
MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
147147
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
148148
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
149149
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
150-
rec = r;
151-
memory_info = info;
150+
mRecInternal = r;
151+
mMemoryInfoInternal = info;
152152
}
153153

154154
MockedOrtAllocator::~MockedOrtAllocator()
155155
{
156-
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
156+
// Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
157157
(void)0; // Suppress warning for empty destructor
158158
}
159159

160160
void* MockedOrtAllocator::Alloc(size_t size)
161161
{
162-
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
163-
return rec->AllocateVolatileDeviceMemory(size);
162+
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
163+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
164164
}
165165

166166
void* MockedOrtAllocator::Reserve(size_t size)
167167
{
168-
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
169-
return rec->AllocateVolatileDeviceMemory(size);
168+
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
169+
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
170170
}
171171

172172
void MockedOrtAllocator::Free(void* p)
173173
{
174174
// LOG(info) << "(ORT) Freeing volatile memory " << p;
175-
rec->ReturnVolatileDeviceMemory();
176175
}
177176

178177
const OrtMemoryInfo* MockedOrtAllocator::Info() const
179178
{
180-
return memory_info;
179+
return mMemoryInfoInternal;
181180
}
182181

183182
size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
197196
}
198197
}
199198

200-
void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
199+
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
201200
{
202201
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
203202
if (recreate) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
5353
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5454

5555
// ONNX
56-
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
56+
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
5757
MockedOrtAllocator* getMockedAllocator();
5858
const OrtMemoryInfo* getMockedMemoryInfo();
5959

0 commit comments

Comments (0)