14 changes: 5 additions & 9 deletions Common/ML/include/ML/OrtInterface.h
@@ -45,14 +45,10 @@ class OrtModel

public:
// Constructors & destructors
OrtModel() = default;
OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
void init(std::unordered_map<std::string, std::string> optionsMap)
{
initOptions(optionsMap);
initEnvironment();
}
virtual ~OrtModel() = default;
OrtModel();
OrtModel(std::unordered_map<std::string, std::string> optionsMap);
void init(std::unordered_map<std::string, std::string> optionsMap);
virtual ~OrtModel();

// General purpose
void initOptions(std::unordered_map<std::string, std::string> optionsMap);
@@ -113,7 +109,7 @@ class OrtModel
private:
// ORT variables -> need to be hidden as pImpl
struct OrtVariables;
OrtVariables* mPImplOrt;
std::unique_ptr<OrtVariables> mPImplOrt;

// Input & Output specifications of the loaded network
std::vector<const char*> mInputNamesChar, mOutputNamesChar;
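The header change above is more than cleanup: with a std::unique_ptr pImpl, every special member that destroys the still-incomplete OrtVariables must be defined where the type is complete. That is why the constructors and destructor are now only declared in the header and defined out of line in OrtInterface.cxx. A minimal sketch of the pattern, with illustrative names (Widget/Impl are not part of this PR):

// widget.h -- pImpl with std::unique_ptr (illustrative sketch)
#include <memory>

class Widget
{
 public:
  Widget();   // defined in the .cxx, where Impl is complete
  ~Widget();  // must not be `= default` here: ~unique_ptr<Impl> needs the full type

 private:
  struct Impl;                  // forward declaration only
  std::unique_ptr<Impl> mPImpl; // owning pointer to the hidden state
};

// widget.cxx
struct Widget::Impl {
  int state = 0;
};
Widget::Widget() : mPImpl(std::make_unique<Impl>()) {}
Widget::~Widget() = default; // Impl is complete here, so the defaulted dtor compiles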
42 changes: 28 additions & 14 deletions Common/ML/src/OrtInterface.cxx
@@ -27,11 +27,20 @@ namespace o2
namespace ml
{

OrtModel::OrtModel() = default;
OrtModel::OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
OrtModel::~OrtModel() = default;
void OrtModel::init(std::unordered_map<std::string, std::string> optionsMap)
{
initOptions(optionsMap);
initEnvironment();
}

struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file
// ORT runtime objects
Ort::RunOptions runOptions;
std::shared_ptr<Ort::Env> env = nullptr;
std::shared_ptr<Ort::Session> session = nullptr; ///< ONNX session
std::unique_ptr<Ort::Env> env = nullptr;
std::unique_ptr<Ort::Session> session = nullptr; ///< ONNX session
Ort::SessionOptions sessionOptions;
Ort::AllocatorWithDefaultOptions allocator;
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
@@ -41,7 +50,7 @@ struct OrtModel::OrtVariables {
// General purpose
void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
{
mPImplOrt = new OrtVariables();
mPImplOrt = std::make_unique<OrtVariables>();

// Load from options map
if (!optionsMap.contains("model-path")) {
@@ -101,7 +110,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)

void OrtModel::initEnvironment()
{
mPImplOrt->env = std::make_shared<Ort::Env>(
mPImplOrt->env = std::make_unique<Ort::Env>(
OrtLoggingLevel(mLoggingLevel),
(mEnvName.empty() ? "ORT" : mEnvName.c_str()),
// Integrate ORT logging into Fairlogger
Expand Down Expand Up @@ -129,7 +138,7 @@ void OrtModel::initSession()
if (mAllocateDeviceMemory) {
memoryOnDevice(mDeviceId);
}
mPImplOrt->session = std::make_shared<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
mPImplOrt->session = std::make_unique<Ort::Session>(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions);
mPImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*mPImplOrt->session);

setIO();
@@ -147,12 +156,12 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
(mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time
// Arena memory shrinkage comes at performance cost
/// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
// (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27
// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0;
(mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27

std::string dev_mem_str = "";
if (mDeviceType == "ROCM") {
dev_mem_str = "Hip";
dev_mem_str = "HipPinned";
}
if (mDeviceType == "CUDA") {
dev_mem_str = "Cuda";
@@ -166,7 +175,7 @@

void OrtModel::resetSession()
{
mPImplOrt->session = std::make_shared<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
mPImplOrt->session = std::make_unique<Ort::Session>(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions);
}

// Getters
@@ -252,7 +261,7 @@ void OrtModel::setIO()

void OrtModel::setEnv(Ort::Env* env)
{
mPImplOrt->env = std::shared_ptr<Ort::Env>(env);
mPImplOrt->env.reset(env);
}

// Inference
@@ -308,6 +317,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output)
(mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);

(mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding);
// mPImplOrt->session->Run(
// mPImplOrt->runOptions,
// mInputNamesChar.data(),
// &inputTensor,
// mInputNamesChar.size(),
// mOutputNamesChar.data(),
// &outputTensor,
// mOutputNamesChar.size());
}

template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*);
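The explicit instantiations pin down the supported input/output type pairs. A hypothetical call site for the pointer-based overload; the model path and tensor sizes are assumptions (only the "model-path" option name is taken from this PR):

#include "ML/OrtInterface.h"
#include <vector>

void runExample()
{
  o2::ml::OrtModel model({{"model-path", "net.onnx"}}); // path assumed
  model.initSession();
  std::vector<OrtDataType::Float16_t> in(1024), out(16); // sizes assumed
  model.inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(
    in.data(), static_cast<int64_t>(in.size()), out.data());
}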
@@ -427,10 +444,7 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<...>
// Release session
void OrtModel::release(bool profilingEnabled)
{
// if (profilingEnabled) {
// mPImplOrt->session->EndProfiling();
// }
LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) << " bytes";
mPImplOrt.reset();
}

// private
33 changes: 22 additions & 11 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
// But environment must be valid, so we init the model environment first and use it here afterwards.
// Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
// TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
// if (lane == 0) {
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
// }
// recreateMemoryAllocator = true;
(nnApplications[lane].mModelClass).initSession();
}
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
// (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
(nnApplications[lane].mModelReg1).initEnvironment();
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
(nnApplications[lane].mModelReg1).initSession();
}
if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
}
// (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
(nnApplications[lane].mModelReg2).initEnvironment();
// nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
(nnApplications[lane].mModelReg2).initSession();
}
if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
if (doGPU) {
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
}
LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
}
#endif

@@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];

// // bool recreateMemoryAllocator = false;
// if (lane == 0) {
// (nnApplications[lane].mModelClass).initEnvironment();
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
// }
// // recreateMemoryAllocator = true;
// (nnApplications[lane].mModelClass).initSession();
// (nnApplications[lane].mModelReg1).initSession();

int withMC = (doGPU && propagateMCLabels);

if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
}
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
// if (GetProcessingSettings().nn.applyNNclusterizer) {
// GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
// nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
// nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
// nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
// }
if (GetProcessingSettings().nn.applyNNclusterizer) {
LOG(info) << "(ORT) Environment releasing...";
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
nnApplication.mModelClass.release(true);
nnApplication.mModelReg1.release(true);
nnApplication.mModelReg2.release(true);
}
if (transferRunning[i]) {
ReleaseEvent(mEvents->stream[i], doGPU);
}
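The commented-out blocks in this file all circle the same intended wiring once the custom allocator works: environment first, then the allocator bound to that environment, then the session. Condensed into one sketch; the allocator call stays disabled as in the PR, and it assumes mModelClass is an o2::ml::OrtModel:

// Intended per-lane init order (recreate-flag semantics assumed):
void initLaneModels(GPUTPCNNClusterizerHost& host, o2::ml::OrtModel& model, GPUReconstruction* rec)
{
  model.initEnvironment(); // a valid env must exist before allocator and session
  // host.directOrtAllocator(model.getEnv(), model.getMemoryInfo(), rec, /*recreate=*/false);
  model.initSession();     // session is built from the env (and, once enabled, the allocator)
}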
23 changes: 11 additions & 12 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
std::atomic<size_t> memory_inuse{0};
std::atomic<size_t> num_allocations{0};
std::atomic<size_t> num_reserve_allocations{0};
OrtMemoryInfo* memory_info;
GPUReconstruction* rec;
OrtMemoryInfo* mMemoryInfoInternal;
GPUReconstruction* mRecInternal;
};

MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
rec = r;
memory_info = info;
mRecInternal = r;
mMemoryInfoInternal = info;
}

MockedOrtAllocator::~MockedOrtAllocator()
{
// Ort::GetApi().ReleaseMemoryInfo(memory_info);
// Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
(void)0; // Suppress warning for empty destructor
}

void* MockedOrtAllocator::Alloc(size_t size)
{
// LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
return rec->AllocateVolatileDeviceMemory(size);
LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void* MockedOrtAllocator::Reserve(size_t size)
{
// LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
return rec->AllocateVolatileDeviceMemory(size);
LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
}

void MockedOrtAllocator::Free(void* p)
{
// LOG(info) << "(ORT) Freeing volatile memory " << p;
rec->ReturnVolatileDeviceMemory();
}

const OrtMemoryInfo* MockedOrtAllocator::Info() const
{
return memory_info;
return mMemoryInfoInternal;
}

size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
}
}

void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
{
mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
if (recreate) {
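For context on how the mocked allocator takes effect at all: it must be registered with the ORT environment, and each session must opt in through the session.use_env_allocators config entry already set in OrtInterface.cxx. A hedged sketch using the ORT C API; error handling is trimmed and the exact integration point in O2 is an assumption:

#include <onnxruntime_cxx_api.h>

// Note: in real code, env and the allocator must outlive every session using them.
void attachMockedAllocator(GPUReconstruction* rec, OrtMemoryInfo* memInfo)
{
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ORT");
  MockedOrtAllocator mocked(rec, memInfo);
  Ort::ThrowOnError(Ort::GetApi().RegisterAllocator(env, &mocked));

  Ort::SessionOptions opts;
  opts.AddConfigEntry("session.use_env_allocators", "1"); // opt in per session
  // Sessions created from (env, opts) now route Alloc/Reserve through the mock.
}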
GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);

// ONNX
void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
MockedOrtAllocator* getMockedAllocator();
const OrtMemoryInfo* getMockedMemoryInfo();
