Commit fb08f18

About 10x speed-up due to explicit io binding

1 parent a985798 commit fb08f18
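
For context on the technique named in the commit message: ONNX Runtime's explicit IO binding pre-binds caller-owned input and output buffers to a session, so Run() reads and writes them in place instead of resolving tensor names and allocating fresh outputs on every call. A minimal stand-alone sketch of the pattern (the model path, shapes, and the tensor names "input"/"output" are illustrative assumptions, not taken from this repository):

#include <onnxruntime_cxx_api.h>
#include <array>
#include <vector>

int main()
{
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "iobinding-demo");
  Ort::SessionOptions opts;
  Ort::Session session(env, "model.onnx", opts); // hypothetical 1x4 -> 1x2 model

  std::vector<float> in(4, 1.f), out(2, 0.f);
  std::array<int64_t, 2> inShape{1, 4}, outShape{1, 2};
  Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);

  // Wrap the caller-owned buffers (the templated CreateTensor takes an element count)
  Ort::Value inT = Ort::Value::CreateTensor<float>(memInfo, in.data(), in.size(), inShape.data(), inShape.size());
  Ort::Value outT = Ort::Value::CreateTensor<float>(memInfo, out.data(), out.size(), outShape.data(), outShape.size());

  // Bind once; Run() then writes the result directly into `out`
  Ort::IoBinding binding(session);
  binding.BindInput("input", inT);
  binding.BindOutput("output", outT);
  session.Run(Ort::RunOptions{nullptr}, binding);
}

Because both tensors wrap memory the caller already owns, repeated inference calls reuse the same buffers and skip the per-call output allocation and copy, which is where a speed-up of this kind comes from.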

4 files changed: +50 additions, -23 deletions

Common/ML/include/ML/OrtInterface.h

Lines changed: 2 additions & 0 deletions

@@ -91,6 +91,8 @@ class OrtModel
   template <class I, class O>
   void inference(I**, size_t, O*);
 
+  void release();
+
  private:
   // ORT variables -> need to be hidden as pImpl
   struct OrtVariables;

Common/ML/src/OrtInterface.cxx

Lines changed: 17 additions & 6 deletions

@@ -33,6 +33,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c
   Ort::SessionOptions sessionOptions;
   Ort::AllocatorWithDefaultOptions allocator;
   Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
+  std::unique_ptr<Ort::IoBinding> ioBinding = nullptr;
 };
 
 // General purpose

@@ -122,7 +123,8 @@ void OrtModel::initEnvironment()
   },
   (void*)3);
   (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events
-  pImplOrt->session = std::make_shared<Ort::Session>(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions);
+  pImplOrt->session = std::make_shared<Ort::Session>(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions);
+  pImplOrt->ioBinding = std::make_unique<Ort::IoBinding>(*pImplOrt->session);
 
   setIO();

@@ -135,6 +137,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex)
 {
 #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1)
   if (deviceIndex >= 0) {
+    (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1");
     std::string dev_mem_str = "";
     if (deviceType == "ROCM") {
       dev_mem_str = "Hip";

@@ -268,20 +271,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output)
   std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
   Ort::Value inputTensor = Ort::Value(nullptr);
   if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
-    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size());
   } else {
-    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size());
   }
+  (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor);
 
   std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
   Ort::Value outputTensor = Ort::Value(nullptr);
   if constexpr (std::is_same_v<O, OrtDataType::Float16_t>) {
-    outputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
+    outputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size());
   } else {
-    outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
+    outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size());
   }
+  (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);
 
-  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size());
+  (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding);
 }
 
 template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*);

@@ -398,6 +403,12 @@ std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& inputs)
 template std::vector<float> OrtModel::inference<float, float>(std::vector<std::vector<float>>&);
 template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<std::vector<OrtDataType::Float16_t>>&);
 
+// Release session
+void OrtModel::release()
+{
+  LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes";
+}
+
 // private
 std::string OrtModel::printShape(const std::vector<int64_t>& v)
 {
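
One more IoBinding capability, relevant to the device-memory path this file sets up and sketched here under the same caveat (the "Cuda" location and tensor names are illustrative assumptions): BindOutput() also accepts a MemoryInfo instead of a pre-created tensor, which asks ORT to allocate the output on that device so results never round-trip through the host.

#include <onnxruntime_cxx_api.h>
#include <vector>

// Sketch: leave the output on the device instead of copying it back
std::vector<Ort::Value> runDeviceBound(Ort::Session& session, Ort::Value& inputTensor)
{
  Ort::IoBinding binding(session);
  binding.BindInput("input", inputTensor); // placeholder tensor name
  Ort::MemoryInfo devInfo("Cuda", OrtDeviceAllocator, 0, OrtMemTypeDefault);
  binding.BindOutput("output", devInfo);   // ORT allocates the output here
  session.Run(Ort::RunOptions{nullptr}, binding);
  return binding.GetOutputValues();        // device-resident Ort::Values
}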

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 2 additions & 2 deletions

@@ -673,6 +673,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option
   // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size());
 
   // this implicitly sets "has_user_compute_stream"
+  cuda_options.has_user_compute_stream = 1;
   UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]);
   session_options.AppendExecutionProvider_CUDA_V2(cuda_options);

@@ -698,10 +699,9 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options
   // api.GetCurrentGpuDeviceId(deviceId);
   OrtROCMProviderOptions rocm_options;
   rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream
+  rocm_options.arena_extend_strategy = 0;
   rocm_options.user_compute_stream = mInternals->Streams[stream];
   session_options.AppendExecutionProvider_ROCM(rocm_options);
-  // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId);
-  // api.ReleaseROCMProviderOptions(rocm_options);
 }
 
 #endif // GPUCA_HAS_ONNX
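
Both hunks rely on ONNX Runtime's user-compute-stream mechanism: the execution provider enqueues its work on a stream the caller already owns, so inference stays ordered with the caller's kernels and copies without extra synchronization. A hedged sketch using the older V1 OrtCUDAProviderOptions struct, whose fields are public (the commit itself goes through the V2 API via UpdateCUDAProviderOptionsWithValue):

#include <onnxruntime_cxx_api.h>
#include <cuda_runtime.h>

void attachStream(Ort::SessionOptions& sessionOptions, cudaStream_t stream)
{
  OrtCUDAProviderOptions cudaOptions{};
  cudaOptions.device_id = 0;
  cudaOptions.has_user_compute_stream = 1;  // tell ORT not to create its own stream
  cudaOptions.user_compute_stream = stream; // void* handle; ORT enqueues inference here
  sessionOptions.AppendExecutionProvider_CUDA(cudaOptions);
}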

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 29 additions & 15 deletions

@@ -630,23 +630,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
     nnApplications[lane].init(nn_settings);
     if (nnApplications[lane].modelsUsed[0]) {
-      SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId);
       (nnApplications[lane].model_class).setDeviceId(deviceId);
       if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
       }
       (nnApplications[lane].model_class).initEnvironment();
     }
     if (nnApplications[lane].modelsUsed[1]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId);
       (nnApplications[lane].model_reg_1).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
       }
       (nnApplications[lane].model_reg_1).initEnvironment();
     }
     if (nnApplications[lane].modelsUsed[2]) {
-      SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
+      SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId);
       (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
       if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
         nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);

@@ -950,7 +950,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
     }
 
-    float time_clusterizer = 0, time_fill = 0;
+    float time_clusterizer = 0, time_fill = 0, time_networks = 0;
     for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) {
       uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode;
       size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

@@ -961,6 +961,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
       auto start1 = std::chrono::high_resolution_clock::now();
 
+      // NN evaluations
       if (clustererNNShadow.nnInferenceInputDType == 0) {
         if (clustererNNShadow.nnInferenceOutputDType == 0) {
           (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16);

@@ -974,14 +975,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32);
         }
       }
-
-      if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
-      } else {
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
-      }
       if (!clustererNNShadow.nnClusterizerUseCfRegression) {
-        // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType);
        if (clustererNNShadow.nnInferenceInputDType == 0) {
           if (clustererNNShadow.nnInferenceOutputDType == 0) {
             (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16);

@@ -995,9 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32);
           }
         }
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1
         if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
-          // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType);
           if (clustererNNShadow.nnInferenceInputDType == 0) {
             if (clustererNNShadow.nnInferenceOutputDType == 0) {
               (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16);

@@ -1011,11 +1003,26 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
               (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32);
             }
           }
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2
+        }
+      }
+
+      auto stopNNs = std::chrono::high_resolution_clock::now();
+
+      // Publishing kernels
+      if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
+      } else {
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
+      }
+      if (!clustererNNShadow.nnClusterizerUseCfRegression) {
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results
+        if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
         }
       }
       auto stop1 = std::chrono::high_resolution_clock::now();
 
+      time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
       time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
       time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
     }

@@ -1030,8 +1037,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
       acceptedClusters += clustererNNShadow.outputDataClass[i];
     }
-    LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+    LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
   }
+  TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane);
 #else
   GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif

@@ -1132,6 +1140,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
   }
   for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
+    if (GetProcessingSettings().nn.applyNNclusterizer) {
+      GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+      nnApplication.model_class.release();
+      nnApplication.model_reg_1.release();
+      nnApplication.model_reg_2.release();
+    }
     if (transferRunning[i]) {
       ReleaseEvent(mEvents->stream[i], doGPU);
     }
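
The stream indices introduced in the first hunk of this file follow a simple round-robin layout; a sketch of the arithmetic (the function name is illustrative): the clusterizer on each lane keeps stream `lane`, while the lane's classification and two regression networks get the disjoint streams lane + numLanes, lane + 2*numLanes, and lane + 3*numLanes, so no two sessions on a lane serialize on the same stream.

#include <cstdint>

// modelIdx: 1 = model_class, 2 = model_reg_1, 3 = model_reg_2
inline int32_t nnStreamIndex(int32_t lane, int32_t numLanes, int32_t modelIdx)
{
  return lane + modelIdx * numLanes; // distinct per (lane, model) and never equal to stream `lane`
}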
