
Commit 4b0825a
1 parent 37955fa

Adding volatile memory allocation and MockedOrtAllocator. Removing print statements and time measurements.

8 files changed: +203 −45 lines


Common/ML/include/ML/OrtInterface.h
Lines changed: 7 additions & 3 deletions

@@ -30,6 +30,7 @@ namespace Ort
 {
 struct SessionOptions;
 struct MemoryInfo;
+struct Env;
 } // namespace Ort

 namespace o2
@@ -55,6 +56,7 @@ class OrtModel
   // General purpose
   void initOptions(std::unordered_map<std::string, std::string> optionsMap);
   void initEnvironment();
+  void initSession();
   void memoryOnDevice(int32_t = 0);
   bool isInitialized() { return mInitialized; }
   void resetSession();
@@ -64,8 +66,9 @@ class OrtModel
   std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
   std::vector<std::string> getInputNames() const { return mInputNames; }
   std::vector<std::string> getOutputNames() const { return mOutputNames; }
-  Ort::SessionOptions& getSessionOptions();
-  Ort::MemoryInfo& getMemoryInfo();
+  Ort::SessionOptions* getSessionOptions();
+  Ort::MemoryInfo* getMemoryInfo();
+  Ort::Env* getEnv();
   int32_t getIntraOpNumThreads() const { return intraOpNumThreads; }
   int32_t getInterOpNumThreads() const { return interOpNumThreads; }

@@ -85,6 +88,7 @@ class OrtModel
       interOpNumThreads = threads;
     }
   }
+  void setEnv(Ort::Env*);

   // Conversion
   template <class I, class O>
@@ -103,7 +107,7 @@ class OrtModel
   template <class I, class O>
   void inference(I**, int64_t, O*);

-  void release();
+  void release(bool = false);

  private:
   // ORT variables -> need to be hidden as pImpl
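
The switch from reference getters to pointer getters, plus the new setEnv/getEnv pair, lets several OrtModel instances share one Ort::Env. A minimal usage sketch, assuming the class sits in an o2::ml namespace (only "namespace o2" is visible in the hunk); the same pattern appears, commented out, in GPUChainTrackingClusterizer.cxx below:

#include "ML/OrtInterface.h"

// Hypothetical helper, not part of this commit: share one Ort::Env between models.
void initModels(o2::ml::OrtModel& classifier, o2::ml::OrtModel& regressor)
{
  classifier.initEnvironment();          // creates the Ort::Env once
  regressor.setEnv(classifier.getEnv()); // second model reuses the same environment
  classifier.initSession();              // session creation is now a separate step,
  regressor.initSession();               // leaving room to register allocators in between
}

One caveat: setEnv wraps the raw pointer in a fresh std::shared_ptr (see OrtInterface.cxx below), so two models holding the same Ort::Env keep independent reference counts and a double delete is possible; an aliasing shared_ptr or a non-owning scheme would avoid that.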

Common/ML/src/OrtInterface.cxx
Lines changed: 32 additions & 8 deletions

@@ -99,9 +99,6 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM

 void OrtModel::initEnvironment()
 {
-  if (allocateDeviceMemory) {
-    memoryOnDevice(deviceId);
-  }
   pImplOrt->env = std::make_shared<Ort::Env>(
     OrtLoggingLevel(loggingLevel),
     (envName.empty() ? "ORT" : envName.c_str()),
@@ -123,6 +120,13 @@ void OrtModel::initEnvironment()
     },
     (void*)3);
   (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events
+}
+
+void OrtModel::initSession()
+{
+  if (allocateDeviceMemory) {
+    memoryOnDevice(deviceId);
+  }
   pImplOrt->session = std::make_shared<Ort::Session>(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions);
   pImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*pImplOrt->session);

@@ -138,6 +142,13 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
 #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
   if (deviceIndex >= 0) {
     (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1");
+    (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+    (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
+
+    // Arena memory shrinkage comes at performance cost
+    /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
+    // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
+
     std::string dev_mem_str = "";
     if (deviceType == "ROCM") {
       dev_mem_str = "Hip";
@@ -159,14 +170,19 @@ void OrtModel::resetSession()
 }

 // Getters
-Ort::SessionOptions& OrtModel::getSessionOptions()
+Ort::SessionOptions* OrtModel::getSessionOptions()
+{
+  return &pImplOrt->sessionOptions;
+}
+
+Ort::MemoryInfo* OrtModel::getMemoryInfo()
 {
-  return pImplOrt->sessionOptions;
+  return &pImplOrt->memoryInfo;
 }

-Ort::MemoryInfo& OrtModel::getMemoryInfo()
+Ort::Env* OrtModel::getEnv()
 {
-  return pImplOrt->memoryInfo;
+  return (pImplOrt->env).get();
 }

 template <class I, class O>
@@ -234,6 +250,11 @@ void OrtModel::setIO()
   }
 }

+void OrtModel::setEnv(Ort::Env* env)
+{
+  pImplOrt->env = std::shared_ptr<Ort::Env>(env);
+}
+
 // Inference
 template <class I, class O>
 std::vector<O> OrtModel::inference(std::vector<I>& input)
@@ -404,8 +425,11 @@ template std::vector<float> OrtModel::inference<float, float>(std::vector<std::v
 template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<std::vector<OrtDataType::Float16_t>>&);

 // Release session
-void OrtModel::release()
+void OrtModel::release(bool profilingEnabled)
 {
+  // if (profilingEnabled) {
+  //   pImplOrt->session->EndProfiling();
+  // }
   LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes";
 }
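
The MockedOrtAllocator named in the commit message is not part of this diff (per the comment above, the volatile allocation lives in GPUTPCNNClusterizerHost.cxx). As a rough sketch of the mechanism that "session.use_env_allocators" is meant to engage, modeled on the MockedOrtAllocator in onnxruntime's own test suite; all names and the registration helper here are illustrative, not the commit's code:

#include <onnxruntime_cxx_api.h>
#include <atomic>
#include <cstdlib>

// A plain OrtAllocator whose C function pointers forward to member functions,
// following the MockedOrtAllocator pattern from the onnxruntime tests.
struct MockedOrtAllocator : OrtAllocator {
  MockedOrtAllocator()
  {
    OrtAllocator::version = ORT_API_VERSION;
    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Alloc(size); };
    OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
    OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
    Ort::ThrowOnError(Ort::GetApi().CreateCpuMemoryInfo(OrtDeviceAllocator, OrtMemTypeDefault, &memInfo));
  }
  ~MockedOrtAllocator() { Ort::GetApi().ReleaseMemoryInfo(memInfo); }

  void* Alloc(size_t size)
  {
    allocated += size;        // bookkeeping makes it visible whether ORT actually
    return std::malloc(size); // routes its allocations through the env allocator
  }
  void Free(void* p) { std::free(p); }
  const OrtMemoryInfo* Info() const { return memInfo; }

  std::atomic<size_t> allocated{0};
  OrtMemoryInfo* memInfo{nullptr};
};

// Once registered on the Ort::Env, sessions created with
// "session.use_env_allocators" = "1" can look this allocator up.
void registerEnvAllocator(Ort::Env& env)
{
  static MockedOrtAllocator allocator; // must outlive the env and its sessions
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator(env, &allocator));
}

Per the in-code comment, this path is "not working yet": ONNX Runtime still allocates its own memory at session-init time, which is presumably why the volatileOrtAllocator calls in GPUChainTrackingClusterizer.cxx below remain commented out.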

GPU/GPUTracking/Base/GPUReconstructionProcessing.h
Lines changed: 0 additions & 1 deletion

@@ -95,7 +95,6 @@ class GPUReconstructionProcessing : public GPUReconstruction
   void AddGPUEvents(T*& events);

   virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() override;
-  // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {}

   struct RecoStepTimerMeta {
     HighResTimer timerToGPU;

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
Lines changed: 1 addition & 1 deletion

@@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options
   // api.GetCurrentGpuDeviceId(deviceId);
   OrtROCMProviderOptions rocm_options;
   rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
-  rocm_options.arena_extend_strategy = 0;
+  rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code
   rocm_options.user_compute_stream = mInternals->Streams[stream];
   session_options.AppendExecutionProvider_ROCM(rocm_options);
 #endif // ORT_ROCM_BUILD
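
This commit touches only the ROCM branch of the file. For comparison, a hedged sketch of what the equivalent CUDA-side setup would look like (assumed, not shown in this diff), using the documented OrtCUDAProviderOptions fields:

#include <onnxruntime_cxx_api.h>

// Hypothetical CUDA counterpart of the ROCM setup above: same user-stream
// hand-off and the same arena strategy choice.
void appendCudaProvider(Ort::SessionOptions& sessionOptions, int32_t deviceId, void* userStream)
{
  OrtCUDAProviderOptions cudaOptions;
  cudaOptions.device_id = deviceId;
  cudaOptions.has_user_compute_stream = 1; // pass the reconstruction's stream instead of letting ORT create one
  cudaOptions.user_compute_stream = userStream;
  cudaOptions.arena_extend_strategy = 0;   // 0 = kNextPowerOfTwo, 1 = kSameAsRequested
  sessionOptions.AppendExecutionProvider_CUDA(cudaOptions);
}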

GPU/GPUTracking/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -336,6 +336,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2")
                        O2::DetectorsRaw
                        O2::Steer
                        O2::ML
+                       PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime
                        PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS}
                        SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H})

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Lines changed: 55 additions & 30 deletions

@@ -42,6 +42,7 @@
 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"
 #include "GPUTPCNNClusterizerHost.h"
+// #include "ML/3rdparty/GPUORTFloat16.h"
 #endif

 using namespace o2::gpu;
@@ -630,31 +631,39 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
     nnApplications[lane].init(nn_settings);
     if (nnApplications[lane].modelsUsed[0]) {
-      SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_class).setDeviceId(deviceId);
       if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
       }
       (nnApplications[lane].model_class).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
+      (nnApplications[lane].model_class).initSession();
     }
     if (nnApplications[lane].modelsUsed[1]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_reg_1).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
       }
+      // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
       (nnApplications[lane].model_reg_1).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
+      (nnApplications[lane].model_reg_1).initSession();
     }
     if (nnApplications[lane].modelsUsed[2]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
       (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
       }
+      // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
       (nnApplications[lane].model_reg_2).initEnvironment();
+      // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
+      (nnApplications[lane].model_reg_2).initSession();
     }
     if (nn_settings.nnClusterizerVerbosity < 3) {
-      LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId;
+      LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
     }
   });
   mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) {
@@ -957,9 +966,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)

   auto start0 = std::chrono::high_resolution_clock::now();
   runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data
-  auto stop0 = std::chrono::high_resolution_clock::now();
+  // auto stop0 = std::chrono::high_resolution_clock::now();

-  auto start1 = std::chrono::high_resolution_clock::now();
+  // auto start1 = std::chrono::high_resolution_clock::now();

   // NN evaluations
   if (clustererNNShadow.nnInferenceInputDType == 0) {
@@ -1006,7 +1015,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
   }

-  auto stopNNs = std::chrono::high_resolution_clock::now();
+  // auto stopNNs = std::chrono::high_resolution_clock::now();

   // Publishing kernels
   if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
@@ -1020,25 +1029,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
     }
   }
-  auto stop1 = std::chrono::high_resolution_clock::now();

-  time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
-  time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-  time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
-}
-if (clustererNNShadow.nnClusterizerUseCfRegression) {
-  auto start1 = std::chrono::high_resolution_clock::now();
-  runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
-  auto stop1 = std::chrono::high_resolution_clock::now();
-  time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-}
-if (clustererNNShadow.nnClusterizerVerbosity < 3) {
-  int acceptedClusters = 0;
-  for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
-    acceptedClusters += clustererNNShadow.outputDataClass[i];
-  }
-  LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+  // for(int i = 0; i < iSize; ++i) {
+  //   if(clustererNNShadow.outputDataClass[i + batchStart] > 1) {
+  //     LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data.";
+  //     // std::string input = "[";
+  //     // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){
+  //     //   input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", ";
+  //     // }
+  //     // input += "]";
+  //     // LOG(info) << "Input is: " << input;
+  //   }
+  // }
+
+  // auto stop1 = std::chrono::high_resolution_clock::now();
+
+  // time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
+  // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+  // time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
 }
+// if (clustererNNShadow.nnClusterizerUseCfRegression) {
+//   auto start1 = std::chrono::high_resolution_clock::now();
+//   runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+//   auto stop1 = std::chrono::high_resolution_clock::now();
+//   time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+// }
+// if (clustererNNShadow.nnClusterizerVerbosity < 3) {
+//   int acceptedClusters = 0;
+//   for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
+//     if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) {
+//       LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters;
+//     }
+//     acceptedClusters += clustererNNShadow.outputDataClass[i];
+//   }
+//   LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+// }
 #else
   GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
@@ -1139,12 +1164,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
   }
   for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
-    if (GetProcessingSettings().nn.applyNNclusterizer) {
-      GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
-      nnApplication.model_class.release();
-      nnApplication.model_reg_1.release();
-      nnApplication.model_reg_2.release();
-    }
+    // if (GetProcessingSettings().nn.applyNNclusterizer) {
+    //   GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+    //   nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    //   nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    //   nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
+    // }
     if (transferRunning[i]) {
       ReleaseEvent(mEvents->stream[i], doGPU);
     }
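
The lane loop above repeats the same five-step sequence for each of the three models. Condensed into a sketch (a hypothetical helper, not code from this commit; SetONNXGPUStream is passed in as a callable so the fragment stays self-contained):

#include <cstdint>

// Per-model init order introduced by this commit, extracted for readability.
template <typename Model, typename SetStreamFn>
void initModelOnLane(Model& model, uint32_t lane, int32_t& deviceId, int32_t maxThreads, SetStreamFn&& setStream)
{
  setStream(*model.getSessionOptions(), lane, &deviceId); // getters return pointers now, hence the dereference
  model.setDeviceId(deviceId);
  if (model.getIntraOpNumThreads() > maxThreads) {
    model.setIntraOpNumThreads(maxThreads);
  }
  model.initEnvironment(); // environment creation is now split from...
  model.initSession();     // ...session creation, the gap where a custom allocator could be registered
}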
