Skip to content

Commit 497d53f

Browse files
GPU stream implementation for ONNX runtime (#14117)
* Initial set of bug-fixes and cosmetic changes * Please consider the following formatting changes * Adjusting eval sizes. Makes code neater and avoids some calculations * Adding separate functions. Now the host process only needs one instance and one initialization * First version of CCDB implementation * Working CCDB API calls (tested with test-ccdb) * Improve fetching, but have to pass settings by value, not const ref * Using const ref and moving CCDB calls to host initialization * Simplifications and renaming * Please consider the following formatting changes * First version of GPU stream implementation. Still needs testing. * Fixes * Please consider the following formatting changes * Adding the lane variable. This PR will in any case conflict with #14069 * Compiles on EPNs. Need to add shadow processors next. But for this, I will merge #14069 to have the changes in GPUChainTrackingClusterizer. * Adding shadow instance. Not sure if this correctly allocates GPU memory using AllocateRegisteredMemory * This runs, but will eventually fill up the VRAM. Need to include a mem clean * Found the stream allocation issue. Now starting optimizations * Improve readability and adapt for some comments * Fixing memory assignment issue. Reconstruction runs through with FP32 networks * Major reworkings to add FP16 support * Bug-fixes * Improved data filling speeds by factor 3 * Limiting threads for ONNX evaluation * Bug-fix for correct thread assignment and input data filling * Minor changes * Adding I** inference, potentially needed for CNN + FC inference * CCDB fetching of NNs ported to GPUWorkflowSpec * Adjusting CPU threads and ORT compile definitions * About 10x speed-up due to explicit io binding * Changes for synchronization and consistency. No performance loss. * Please consider the following formatting changes * Fixing warnings (errors due to size_t) * Fixing linker issues * Adding volatile memory allocation and MockedOrtAllocator. 
Removing print statements and time measurements * Please consider the following formatting changes * Circumvent "unused result" warning and build failure * Adjust for comments * Please consider the following formatting changes * Fixing build flags --------- Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 8ffe167 commit 497d53f

27 files changed

+1301
-439
lines changed

Common/ML/CMakeLists.txt

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,14 @@
99
# granted to it by virtue of its status as an Intergovernmental Organization
1010
# or submit itself to any jurisdiction.
1111

12-
# Pass ORT variables as a preprocessor definition
13-
if(ORT_ROCM_BUILD)
14-
add_compile_definitions(ORT_ROCM_BUILD=1)
15-
endif()
16-
if(ORT_CUDA_BUILD)
17-
add_compile_definitions(ORT_CUDA_BUILD=1)
18-
endif()
19-
if(ORT_MIGRAPHX_BUILD)
20-
add_compile_definitions(ORT_MIGRAPHX_BUILD=1)
21-
endif()
22-
if(ORT_TENSORRT_BUILD)
23-
add_compile_definitions(ORT_TENSORRT_BUILD=1)
24-
endif()
25-
2612
o2_add_library(ML
2713
SOURCES src/OrtInterface.cxx
2814
TARGETVARNAME targetName
2915
PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime)
16+
17+
# Pass ORT variables as a preprocessor definition
18+
target_compile_definitions(${targetName} PRIVATE
19+
$<$<BOOL:${ORT_ROCM_BUILD}>:ORT_ROCM_BUILD>
20+
$<$<BOOL:${ORT_CUDA_BUILD}>:ORT_CUDA_BUILD>
21+
$<$<BOOL:${ORT_MIGRAPHX_BUILD}>:ORT_MIGRAPHX_BUILD>
22+
$<$<BOOL:${ORT_TENSORRT_BUILD}>:ORT_TENSORRT_BUILD>)

Common/ML/include/ML/3rdparty/GPUORTFloat16.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
882882
} // namespace OrtDataType
883883

884884
} // namespace o2
885-
#endif
885+
#endif

Common/ML/include/ML/OrtInterface.h

Lines changed: 63 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@
2626
// O2 includes
2727
#include "Framework/Logger.h"
2828

29+
namespace Ort
30+
{
31+
struct SessionOptions;
32+
struct MemoryInfo;
33+
struct Env;
34+
} // namespace Ort
35+
2936
namespace o2
3037
{
3138

@@ -36,14 +43,52 @@ class OrtModel
3643
{
3744

3845
public:
39-
// Constructor
46+
// Constructors & destructors
4047
OrtModel() = default;
41-
OrtModel(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
42-
void init(std::unordered_map<std::string, std::string> optionsMap) { reset(optionsMap); }
43-
void reset(std::unordered_map<std::string, std::string>);
48+
OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
49+
void init(std::unordered_map<std::string, std::string> optionsMap)
50+
{
51+
initOptions(optionsMap);
52+
initEnvironment();
53+
}
54+
virtual ~OrtModel() = default;
55+
56+
// General purpose
57+
void initOptions(std::unordered_map<std::string, std::string> optionsMap);
58+
void initEnvironment();
59+
void initSession();
60+
void memoryOnDevice(int32_t = 0);
4461
bool isInitialized() { return mInitialized; }
62+
void resetSession();
4563

46-
virtual ~OrtModel() = default;
64+
// Getters
65+
std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
66+
std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
67+
std::vector<std::string> getInputNames() const { return mInputNames; }
68+
std::vector<std::string> getOutputNames() const { return mOutputNames; }
69+
Ort::SessionOptions* getSessionOptions();
70+
Ort::MemoryInfo* getMemoryInfo();
71+
Ort::Env* getEnv();
72+
int32_t getIntraOpNumThreads() const { return intraOpNumThreads; }
73+
int32_t getInterOpNumThreads() const { return interOpNumThreads; }
74+
75+
// Setters
76+
void setDeviceId(int32_t id) { deviceId = id; }
77+
void setIO();
78+
void setActiveThreads(int threads) { intraOpNumThreads = threads; }
79+
void setIntraOpNumThreads(int threads)
80+
{
81+
if (deviceType == "CPU") {
82+
intraOpNumThreads = threads;
83+
}
84+
}
85+
void setInterOpNumThreads(int threads)
86+
{
87+
if (deviceType == "CPU") {
88+
interOpNumThreads = threads;
89+
}
90+
}
91+
void setEnv(Ort::Env*);
4792

4893
// Conversion
4994
template <class I, class O>
@@ -53,41 +98,36 @@ class OrtModel
5398
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
5499
std::vector<O> inference(std::vector<I>&);
55100

56-
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
101+
template <class I, class O>
57102
std::vector<O> inference(std::vector<std::vector<I>>&);
58103

59-
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
60-
void inference(I*, size_t, O*);
61-
62-
// template<class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
63-
// std::vector<O> inference(std::vector<I>&);
64-
65-
// Reset session
66-
void resetSession();
104+
template <class I, class O>
105+
void inference(I*, int64_t, O*);
67106

68-
std::vector<std::vector<int64_t>> getNumInputNodes() const { return mInputShapes; }
69-
std::vector<std::vector<int64_t>> getNumOutputNodes() const { return mOutputShapes; }
70-
std::vector<std::string> getInputNames() const { return mInputNames; }
71-
std::vector<std::string> getOutputNames() const { return mOutputNames; }
107+
template <class I, class O>
108+
void inference(I**, int64_t, O*);
72109

73-
void setActiveThreads(int threads) { intraOpNumThreads = threads; }
110+
void release(bool = false);
74111

75112
private:
76-
// ORT variables -> need to be hidden as Pimpl
113+
// ORT variables -> need to be hidden as pImpl
77114
struct OrtVariables;
78115
OrtVariables* pImplOrt;
79116

80117
// Input & Output specifications of the loaded network
81118
std::vector<const char*> inputNamesChar, outputNamesChar;
82119
std::vector<std::string> mInputNames, mOutputNames;
83-
std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes;
120+
std::vector<std::vector<int64_t>> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes
121+
std::vector<int64_t> inputSizePerNode, outputSizePerNode; // Output shapes
122+
int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs
84123

85124
// Environment settings
86125
bool mInitialized = false;
87-
std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
88-
int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
126+
std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
127+
int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
89128

90129
std::string printShape(const std::vector<int64_t>&);
130+
std::string printShape(const std::vector<std::vector<int64_t>>&, std::vector<std::string>&);
91131
};
92132

93133
} // namespace ml

0 commit comments

Comments
 (0)