Skip to content

Commit b1c88f0

Browse files
committed
Changes for synchronization and consistency. No performance loss.
1 parent fb08f18 commit b1c88f0

File tree

2 files changed

+7
-8
lines changed

2 files changed

+7
-8
lines changed

Common/ML/src/OrtInterface.cxx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,18 +271,18 @@ void OrtModel::inference(I* input, size_t input_size, O* output)
271271
std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
272272
Ort::Value inputTensor = Ort::Value(nullptr);
273273
if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
274-
inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size());
274+
inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
275275
} else {
276-
inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size());
276+
inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
277277
}
278278
(pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor);
279279

280280
std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
281281
Ort::Value outputTensor = Ort::Value(nullptr);
282282
if constexpr (std::is_same_v<O, OrtDataType::Float16_t>) {
283-
outputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size());
283+
outputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
284284
} else {
285-
outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size());
285+
outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
286286
}
287287
(pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor);
288288

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -630,23 +630,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
630630
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
631631
nnApplications[lane].init(nn_settings);
632632
if (nnApplications[lane].modelsUsed[0]) {
633-
SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId);
633+
SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
634634
(nnApplications[lane].model_class).setDeviceId(deviceId);
635635
if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
636636
nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
637637
}
638638
(nnApplications[lane].model_class).initEnvironment();
639639
}
640640
if (nnApplications[lane].modelsUsed[1]) {
641-
SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId);
641+
SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
642642
(nnApplications[lane].model_reg_1).setDeviceId(deviceId);
643643
if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
644644
nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
645645
}
646646
(nnApplications[lane].model_reg_1).initEnvironment();
647647
}
648648
if (nnApplications[lane].modelsUsed[2]) {
649-
SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId);
649+
SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
650650
(nnApplications[lane].model_reg_2).setDeviceId(deviceId);
651651
if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
652652
nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
@@ -1039,7 +1039,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10391039
}
10401040
LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
10411041
}
1042-
TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane);
10431042
#else
10441043
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
10451044
#endif

0 commit comments

Comments (0)