
Commit 3174e39

Merge branch 'gpu_clusterizer_bug_fixes' into onnx_gpu_implementation
2 parents 70320c3 + 0ed7d25 commit 3174e39

File tree: 9 files changed, +302 -194 lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 4 additions & 4 deletions
@@ -58,13 +58,13 @@ class OrtModel
 
   // Inferencing
   template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
-  std::vector<O> inference(std::vector<I>&);
+  std::vector<O> inference(std::vector<I>&, int32_t = -1);
 
   template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
-  std::vector<O> inference(std::vector<std::vector<I>>&);
+  std::vector<O> inference(std::vector<std::vector<I>>&, int32_t = -1);
 
   template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
-  void inference(I*, size_t, O*);
+  void inference(I*, size_t, O*, int32_t = -1);
 
   // template<class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
   // std::vector<O> inference(std::vector<I>&);
@@ -92,7 +92,7 @@ class OrtModel
   // Environment settings
   bool mInitialized = false;
   std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
-  int intraOpNumThreads = 1, interOpNumThreads = 1, streamId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
+  int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
 
   std::string printShape(const std::vector<int64_t>&);
 };
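The new trailing int32_t argument defaults to -1, so existing call sites compile unchanged; a non-negative value overrides the configured device id for that call (see the OrtInterface.cxx changes below). A minimal caller-side sketch, not part of this commit — the option keys match those parsed in OrtInterface.cxx, while the model path, option values, and sizes are placeholder examples:

  // Sketch only: values and sizes are examples, not from this commit.
  OrtModel model;
  model.reset({{"model-path", "network_class.onnx"},
               {"device", "ROCM"},
               {"device-id", "0"},
               {"allocate-device-memory", "1"}});

  std::vector<float> input(1024 * 7, 0.f);                    // 1024 rows, assuming an input width of 7
  auto outDefault = model.inference<float, float>(input);     // uses the configured device id (argument defaults to -1)
  auto outOnDev1 = model.inference<float, float>(input, 1);   // per-call override: device id 1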

Common/ML/src/OrtInterface.cxx

Lines changed: 68 additions & 35 deletions
@@ -58,7 +58,7 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
   if (!optionsMap["model-path"].empty()) {
     modelPath = optionsMap["model-path"];
     device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
-    streamId = (optionsMap.contains("stream-id") ? std::stoi(optionsMap["stream-id"]) : 0);
+    deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
     allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
     intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
     interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0);
@@ -68,40 +68,26 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
 
   // #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1
   //   if (device == "ROCM") {
-  //     // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId));
-  //     SetONNXGPUStream(pImplOrt->sessionOptions, streamId);
+  //     // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId));
+  //     SetONNXGPUStream(pImplOrt->sessionOptions, deviceId);
   //     LOG(info) << "(ORT) ROCM execution provider set";
   //   }
   // #endif
   // #if defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1
   //   if (device == "MIGRAPHX") {
-  //     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId));
+  //     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId));
   //     LOG(info) << "(ORT) MIGraphX execution provider set";
   //   }
   // #endif
   // #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1
   //   if (device == "CUDA") {
-  //     // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId));
-  //     SetONNXGPUStream(pImplOrt->sessionOptions, streamId);
+  //     // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId));
+  //     SetONNXGPUStream(pImplOrt->sessionOptions, deviceId);
   //     LOG(info) << "(ORT) CUDA execution provider set";
   //     dev_mem_str = "Cuda";
   //   }
   // #endif
 
-#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
-  if (allocateDeviceMemory) {
-    std::string dev_mem_str = "";
-    if (device == "ROCM") {
-      dev_mem_str = "Hip";
-    }
-    if (device == "CUDA") {
-      dev_mem_str = "Cuda";
-    }
-    pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault);
-    LOG(info) << "(ORT) Memory info set to on-device memory";
-  }
-#endif
-
   if (device == "CPU") {
     (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads);
     (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads);
@@ -213,8 +199,24 @@ std::string OrtModel::printShape(const std::vector<int64_t>& v)
 }
 
 template <class I, class O>
-std::vector<O> OrtModel::inference(std::vector<I>& input)
+std::vector<O> OrtModel::inference(std::vector<I>& input, int32_t deviceIndex)
 {
+#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
+  if (allocateDeviceMemory) {
+    if (deviceIndex >= 0) {
+      deviceId = deviceIndex;
+    }
+    std::string dev_mem_str = "";
+    if (device == "ROCM") {
+      dev_mem_str = "Hip";
+    }
+    if (device == "CUDA") {
+      dev_mem_str = "Cuda";
+    }
+    pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
+    LOG(info) << "(ORT) Memory info set to on-device memory";
+  }
+#endif
   std::vector<int64_t> inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
   std::vector<Ort::Value> inputTensor;
   if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
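The block above that rebuilds pImplOrt->memoryInfo (and applies the deviceIndex override) is repeated verbatim in the two other inference overloads further down. A possible follow-up, not part of this commit, would be a private helper; the name setDeviceMemoryInfo below is hypothetical, the body is the same logic as the duplicated block:

  // Hypothetical helper, not part of this commit: centralizes the duplicated device-memory setup.
  void OrtModel::setDeviceMemoryInfo(int32_t deviceIndex)
  {
  #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
    if (allocateDeviceMemory) {
      if (deviceIndex >= 0) {
        deviceId = deviceIndex; // per-call override of the configured device id
      }
      std::string dev_mem_str = "";
      if (device == "ROCM") {
        dev_mem_str = "Hip";
      }
      if (device == "CUDA") {
        dev_mem_str = "Cuda";
      }
      pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
      LOG(info) << "(ORT) Memory info set to on-device memory";
    }
  #endif
  }

Each overload would then start with setDeviceMemoryInfo(deviceIndex); instead of carrying its own copy.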
@@ -230,37 +232,68 @@ std::vector<O> OrtModel::inference(std::vector<I>& input)
   return outputValuesVec;
 }
 
-template std::vector<float> OrtModel::inference<float, float>(std::vector<float>&);
+template std::vector<float> OrtModel::inference<float, float>(std::vector<float>&, int32_t);
 
-template std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>&);
+template std::vector<float> OrtModel::inference<OrtDataType::Float16_t, float>(std::vector<OrtDataType::Float16_t>&, int32_t);
 
-template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>&);
+template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<OrtDataType::Float16_t>&, int32_t);
 
 template <class I, class O>
-void OrtModel::inference(I* input, size_t input_size, O* output)
+void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceIndex)
 {
-  std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
+#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
+  if (allocateDeviceMemory) {
+    if (deviceIndex >= 0) {
+      deviceId = deviceIndex;
+    }
+    std::string dev_mem_str = "";
+    if (device == "ROCM") {
+      dev_mem_str = "Hip";
+    }
+    if (device == "CUDA") {
+      dev_mem_str = "Cuda";
+    }
+    pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
+    LOG(info) << "(ORT) Memory info set to on-device memory";
+  }
+#endif
+  std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
   Ort::Value inputTensor = Ort::Value(nullptr);
   if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
-    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   } else {
-    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   }
 
-  std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
-  size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]);
-  Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
+  std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
+  Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
 
-  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here
+  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here
 }
 
-template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*);
+template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*, int32_t);
 
-template void OrtModel::inference<float, float>(float*, size_t, float*);
+template void OrtModel::inference<float, float>(float*, size_t, float*, int32_t);
 
 template <class I, class O>
-std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input)
+std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input, int32_t deviceIndex)
 {
+#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
+  if (allocateDeviceMemory) {
+    if (deviceIndex >= 0) {
+      deviceId = deviceIndex;
+    }
+    std::string dev_mem_str = "";
+    if (device == "ROCM") {
+      dev_mem_str = "Hip";
+    }
+    if (device == "CUDA") {
+      dev_mem_str = "Cuda";
+    }
+    pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault);
+    LOG(info) << "(ORT) Memory info set to on-device memory";
+  }
+#endif
   std::vector<Ort::Value> inputTensor;
   for (auto i : input) {
     std::vector<int64_t> inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
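Note that the pointer-based overload changes the meaning of input_size: the tensors are now created with input_size * mInputShapes[0][1] (respectively mOutputShapes[0][1]) elements and input_size itself becomes the first shape dimension, so callers pass the number of rows rather than the total number of elements. A short sketch of the new convention — the widths and row count are placeholder examples, not from this commit:

  // Sketch only: input_size now counts rows, not elements.
  const size_t nRows = 512;     // e.g. number of cluster candidates
  const size_t inWidth = 7;     // assumed value of mInputShapes[0][1]
  const size_t outWidth = 1;    // assumed value of mOutputShapes[0][1]
  std::vector<float> in(nRows * inWidth), out(nRows * outWidth);
  model.inference<float, float>(in.data(), nRows, out.data());     // configured device
  model.inference<float, float>(in.data(), nRows, out.data(), 1);  // per-call override: device id 1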

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 12 additions & 1 deletion
@@ -229,7 +229,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural
 AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
 AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
 AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
-AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
@@ -250,6 +251,16 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c
 AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
 AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
+// CCDB
+AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally")
+AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched")
+AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks")
+AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)")
+AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression")
+AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
+AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
+AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp")
+AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].")
 AddHelp("help", 'h')
 EndConfig()
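For context, a hypothetical sketch (not part of this commit) of how these clusterizer settings could be forwarded to OrtModel::reset(), using the option keys parsed in OrtInterface.cxx above; settings stands for whatever instance of this options struct the clusterizer holds, and model is a placeholder OrtModel:

  // Hypothetical glue code: field names follow the AddOption entries above,
  // option keys follow OrtInterface.cxx; "settings" and "model" are placeholders.
  std::unordered_map<std::string, std::string> options{
    {"model-path", settings.nnClassificationPath},
    {"device", settings.nnInferenceDevice},
    {"device-id", std::to_string(settings.nnInferenceDeviceId)},
    {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
    {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
    {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}};
  model.reset(options);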