Skip to content

Commit 08753dd

Browse files
committed
Modify the approach to avoid std:: types. Still needs to be tested, and proper memory allocation remains to be implemented.
1 parent e8af1c2 commit 08753dd

File tree

9 files changed

+341
-190
lines changed

9 files changed

+341
-190
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ class OrtModel
5353
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
5454
std::vector<O> inference(std::vector<I>&);
5555

56+
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
57+
O* inference(I*, size_t);
58+
59+
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
60+
void inference(I*, size_t, O*);
61+
5662
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
5763
std::vector<O> inference(std::vector<std::vector<I>>&);
5864

Common/ML/src/OrtInterface.cxx

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,29 @@ std::vector<O> OrtModel::inference(std::vector<I>& input)
198198
return outputValuesVec;
199199
}
200200

201+
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
202+
O* OrtModel::inference(I* input, size_t input_size)
203+
{
204+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
205+
std::vector<Ort::Value> inputTensor;
206+
inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, reinterpret_cast<O*>(input), input_size, inputShape.data(), inputShape.size()));
207+
// input.clear();
208+
auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size());
209+
O* outputValues = reinterpret_cast<O*>(outputTensors[0].template GetTensorMutableData<O>());
210+
return outputValues;
211+
}
212+
213+
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
214+
void OrtModel::inference(I* input, size_t input_size, O* output)
215+
{
216+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
217+
std::vector<Ort::Value> inputTensor;
218+
inputTensor.emplace_back(Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, reinterpret_cast<O*>(input), input_size, inputShape.data(), inputShape.size()));
219+
// input.clear();
220+
auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size());
221+
output = reinterpret_cast<O*>(outputTensors[0].template GetTensorMutableData<O>());
222+
}
223+
201224
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
202225
std::vector<O> OrtModel::inference(std::vector<std::vector<I>>& input)
203226
{
@@ -280,6 +303,60 @@ std::vector<OrtDataType::Float16_t> OrtModel::inference<float, OrtDataType::Floa
280303
return outputValuesVec;
281304
}
282305

306+
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
307+
float* OrtModel::inference(float* input, size_t input_size)
308+
{
309+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
310+
std::vector<Ort::Value> inputTensor;
311+
inputTensor.emplace_back(Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, reinterpret_cast<float*>(input), input_size, inputShape.data(), inputShape.size()));
312+
// input.clear();
313+
auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size());
314+
float* outputValues = reinterpret_cast<float*>(outputTensors[0].template GetTensorMutableData<float>());
315+
return outputValues;
316+
}
317+
318+
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
319+
float* OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size)
320+
{
321+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
322+
std::vector<Ort::Value> inputTensor;
323+
inputTensor.emplace_back(Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size()));
324+
// input.clear();
325+
auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size());
326+
float* outputValues = reinterpret_cast<float*>(outputTensors[0].template GetTensorMutableData<float>());
327+
return outputValues;
328+
}
329+
330+
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
331+
void OrtModel::inference(float* input, size_t input_size, float* output)
332+
{
333+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
334+
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
335+
336+
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
337+
size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]);
338+
Ort::Value outputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
339+
340+
(pImplOrt->session)->Run(pImplOrt->runOptions,
341+
inputNamesChar.data(), &inputTensor, 1,
342+
outputNamesChar.data(), &outputTensor, 1);
343+
}
344+
345+
template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
346+
void OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size, float* output)
347+
{
348+
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
349+
Ort::Value inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
350+
351+
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
352+
size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]);
353+
Ort::Value outputTensor = Ort::Value::CreateTensor<float>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
354+
355+
(pImplOrt->session)->Run(pImplOrt->runOptions,
356+
inputNamesChar.data(), &inputTensor, 1,
357+
outputNamesChar.data(), &outputTensor, 1);
358+
}
359+
283360
template <>
284361
std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(std::vector<std::vector<OrtDataType::Float16_t>>& input)
285362
{

GPU/GPUTracking/Base/GPUMemoryResource.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct GPUMemoryReuse {
2828
};
2929
enum Group : uint16_t {
3030
ClustererScratch,
31+
NNClusterer,
3132
ClustererZS,
3233
TrackerScratch,
3334
TrackerDataLinks,

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 29 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
/// \file GPUChainTrackingClusterizer.cxx
1313
/// \author David Rohr
1414

15-
#include <CommonUtils/StringUtils.h>
16-
1715
#include "GPUChainTracking.h"
1816
#include "GPUChainTrackingDefs.h"
1917
#include "GPULogging.h"
@@ -882,104 +880,59 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
882880
#ifdef GPUCA_HAS_ONNX
883881
// Settings for the clusterizer
884882
GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn;
885-
GPUTPCNNClusterizerInternals nnSettingsInternal;
886-
clusterer.nnInternals = &nnSettingsInternal;
887-
(clusterer.nnInternals)->nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
888-
(clusterer.nnInternals)->nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
889-
(clusterer.nnInternals)->nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
890-
(clusterer.nnInternals)->nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
891-
(clusterer.nnInternals)->nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
892-
(clusterer.nnInternals)->nnClusterizerElementSize = ((2 * (clusterer.nnInternals)->nnClusterizerSizeInputRow + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputPad + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputTime + 1)) + ((clusterer.nnInternals)->nnClusterizerAddIndexData ? 3 : 0);
893-
(clusterer.nnInternals)->nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
894-
(clusterer.nnInternals)->nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
883+
clusterer.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
884+
clusterer.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
885+
clusterer.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
886+
clusterer.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
887+
clusterer.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
888+
clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
889+
clusterer.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
890+
clusterer.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
895891
if (nn_settings.nnClusterizerVerbosity < 0) {
896-
(clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
892+
clusterer.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
897893
} else {
898-
(clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
894+
clusterer.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
899895
}
900896

901897
// Settings for the NN evaluation
902-
(clusterer.nnInternals)->nnClassThreshold = nn_settings.nnClassThreshold;
903-
(clusterer.nnInternals)->nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
904-
905-
// Settings for the neural network evaluation
906-
(clusterer.nnInternals)->OrtOptions = {
907-
{"model-path", nn_settings.nnClassificationPath},
908-
{"device", nn_settings.nnInferenceDevice},
909-
{"device-id", std::to_string(nn_settings.nnInferenceDeviceId)},
910-
{"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)},
911-
{"dtype", nn_settings.nnInferenceDtype},
912-
{"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)},
913-
{"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)},
914-
{"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)},
915-
{"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath},
916-
{"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}};
917-
(clusterer.nnInternals)->model_class.init((clusterer.nnInternals)->OrtOptions);
918-
std::vector<std::string> reg_model_paths = o2::utils::Str::tokenize(nn_settings.nnRegressionPath, ':');
919-
920-
if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) {
921-
if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) {
922-
(clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0];
923-
(clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions);
924-
(clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes = (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1];
925-
} else {
926-
(clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0];
927-
(clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions);
928-
(clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes = (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1];
929-
(clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[1];
930-
(clusterer.nnInternals)->model_reg_2.init((clusterer.nnInternals)->OrtOptions);
931-
(clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes = (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1];
932-
}
933-
}
898+
clusterer.nnClassThreshold = nn_settings.nnClassThreshold;
899+
clusterer.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
900+
901+
GPUTPCNNClusterizerInternals nnApplication(GetProcessingSettings(), clusterer);
934902

935-
if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
903+
if (clusterer.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
936904
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
937905
DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
938906
}
939907

940-
if ((clusterer.nnInternals)->nnSigmoidTrafoClassThreshold) {
908+
if (clusterer.nnSigmoidTrafoClassThreshold) {
941909
// Inverse sigmoid transformation
942-
(clusterer.nnInternals)->nnClassThreshold = (float)std::log((clusterer.nnInternals)->nnClassThreshold / (1.f - (clusterer.nnInternals)->nnClassThreshold));
910+
clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
943911
}
944912

945913
float time_clusterizer = 0, time_fill = 0;
946-
int evalDtype = (clusterer.nnInternals)->OrtOptions["dtype"].find("32") != std::string::npos;
947-
(clusterer.nnInternals)->outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1);
914+
int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
948915

949-
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) {
950-
uint batchStart = batch * (clusterer.nnInternals)->nnClusterizerBatchedMode;
951-
uint iSize = CAMath::Min((uint)(clusterer.nnInternals)->nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
952-
953-
(clusterer.nnInternals)->clusterFlags.clear();
954-
(clusterer.nnInternals)->peakPositions.clear();
955-
(clusterer.nnInternals)->centralCharges.clear();
956-
957-
(clusterer.nnInternals)->clusterFlags.resize(iSize, {0, 0});
958-
(clusterer.nnInternals)->peakPositions.resize(iSize);
959-
(clusterer.nnInternals)->centralCharges.resize(iSize);
960-
961-
if (evalDtype == 1) {
962-
(clusterer.nnInternals)->inputData32.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (float)((clusterer.nnInternals)->nnClusterizerBoundaryFillValue));
963-
} else {
964-
(clusterer.nnInternals)->inputData16.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (OrtDataType::Float16_t)((float)(clusterer.nnInternals)->nnClusterizerBoundaryFillValue));
965-
}
916+
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) {
917+
uint batchStart = batch * clusterer.nnClusterizerBatchedMode;
918+
uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
966919

967920
auto start0 = std::chrono::high_resolution_clock::now();
968921
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data
969922
auto stop0 = std::chrono::high_resolution_clock::now();
970923
auto start1 = std::chrono::high_resolution_clock::now();
971-
GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype);
972-
if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1) {
924+
nnApplication.inferenceNetworkClass(clusterer, evalDtype, batchStart);
925+
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
973926
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels
974927
} else {
975928
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels
976929
}
977930

978-
if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) {
979-
GPUTPCNNClusterizer::inferenceNetworkReg1(clusterer, evalDtype);
931+
if (!clusterer.nnClusterizerUseCfRegression) {
932+
nnApplication.inferenceNetworkReg1(clusterer, evalDtype, batchStart);
980933
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 1
981-
if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) {
982-
GPUTPCNNClusterizer::inferenceNetworkReg2(clusterer, evalDtype);
934+
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
935+
nnApplication.inferenceNetworkReg2(clusterer, evalDtype, batchStart);
983936
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 2
984937
}
985938
}
@@ -990,13 +943,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
990943
}
991944

992945
auto start1 = std::chrono::high_resolution_clock::now();
993-
if ((clusterer.nnInternals)->nnClusterizerUseCfRegression) {
946+
if (clusterer.nnClusterizerUseCfRegression) {
994947
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
995948
}
996949
auto stop1 = std::chrono::high_resolution_clock::now();
997950
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
998951

999-
if ((clusterer.nnInternals)->nnClusterizerVerbosity < 3) {
952+
if (clusterer.nnClusterizerVerbosity < 3) {
1000953
LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
1001954
}
1002955
#else

0 commit comments

Comments
 (0)