Skip to content

Commit 1dcb1da

Browse files
committed
Major reworkings to add FP16 support
1 parent 2801c2e commit 1dcb1da

File tree

8 files changed

+177
-78
lines changed

8 files changed

+177
-78
lines changed

Common/ML/include/ML/3rdparty/GPUORTFloat16.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
882882
} // namespace OrtDataType
883883

884884
} // namespace o2
885-
#endif
885+
#endif

Common/ML/src/OrtInterface.cxx

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,13 +244,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output)
244244
}
245245

246246
std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
247-
Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
247+
Ort::Value outputTensor = Ort::Value(nullptr);
248+
if constexpr (std::is_same_v<O, OrtDataType::Float16_t>) {
249+
Ort::Value outputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
250+
} else {
251+
Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());
252+
}
248253

249254
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size());
250255
}
251256

257+
template void OrtModel::inference<OrtDataType::Float16_t, OrtDataType::Float16_t>(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*);
258+
252259
template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*);
253260

261+
template void OrtModel::inference<float, OrtDataType::Float16_t>(float*, size_t, OrtDataType::Float16_t*);
262+
254263
template void OrtModel::inference<float, float>(float*, size_t, float*);
255264

256265
template <class I, class O>

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -943,9 +943,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
943943
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
944944
GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
945945

946-
LOG(info) << "clustererNNShadow.inputData32: " << clustererNNShadow.inputData32;
947-
LOG(info) << "clustererShadow.mPclusterInRow: " << clustererShadow.mPclusterInRow;
948-
949946
int withMC = (doGPU && propagateMCLabels);
950947

951948
if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -963,19 +960,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
963960

964961
auto stop0 = std::chrono::high_resolution_clock::now();
965962
auto start1 = std::chrono::high_resolution_clock::now();
966-
nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType);
963+
964+
// nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType);
965+
if (clustererNNShadow.nnInferenceInputDType == 0) {
966+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
967+
(nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16);
968+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
969+
(nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32);
970+
}
971+
} else if (clustererNNShadow.nnInferenceInputDType == 1) {
972+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
973+
(nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16);
974+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
975+
(nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32);
976+
}
977+
}
967978
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
968-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
979+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
969980
} else {
970-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
981+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
971982
}
972-
973983
if (!clustererNNShadow.nnClusterizerUseCfRegression) {
974-
nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType);
975-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1
984+
// nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType);
985+
if (clustererNNShadow.nnInferenceInputDType == 0) {
986+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
987+
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16);
988+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
989+
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32);
990+
}
991+
} else if (clustererNNShadow.nnInferenceInputDType == 1) {
992+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
993+
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16);
994+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
995+
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32);
996+
}
997+
}
998+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1
976999
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
977-
nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType);
978-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2
1000+
// nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType);
1001+
if (clustererNNShadow.nnInferenceInputDType == 0) {
1002+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
1003+
(nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16);
1004+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
1005+
(nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32);
1006+
}
1007+
} else if (clustererNNShadow.nnInferenceInputDType == 1) {
1008+
if (clustererNNShadow.nnInferenceOutputDType == 0) {
1009+
(nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16);
1010+
} else if (clustererNNShadow.nnInferenceOutputDType == 1) {
1011+
(nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32);
1012+
}
1013+
}
1014+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2
9791015
}
9801016
}
9811017
auto stop1 = std::chrono::high_resolution_clock::now();

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
2626
{
2727
if (nnClusterizerBatchedMode > 0) {
2828
if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) {
29-
computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
29+
computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize);
3030
} else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) {
31-
computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
31+
computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize);
3232
}
3333
computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
34-
if (nnClusterizerModelClassNumOutputNodes > 0) {
35-
computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
36-
}
37-
if (!nnClusterizerUseCfRegression) {
38-
if (nnClusterizerModelReg1NumOutputNodes > 0) {
39-
computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
34+
35+
if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) {
36+
if (nnClusterizerModelClassNumOutputNodes > 0) {
37+
computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
4038
}
41-
if (nnClusterizerModelReg2NumOutputNodes > 0) {
42-
computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
39+
if (!nnClusterizerUseCfRegression) {
40+
if (nnClusterizerModelReg1NumOutputNodes > 0) {
41+
computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
42+
}
43+
if (nnClusterizerModelReg2NumOutputNodes > 0) {
44+
computePointerWithAlignment(mem, outputDataReg2_16, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
45+
}
46+
}
47+
} else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) {
48+
if (nnClusterizerModelClassNumOutputNodes > 0) {
49+
computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
50+
}
51+
if (!nnClusterizerUseCfRegression) {
52+
if (nnClusterizerModelReg1NumOutputNodes > 0) {
53+
computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
54+
}
55+
if (nnClusterizerModelReg2NumOutputNodes > 0) {
56+
computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
57+
}
4358
}
4459
}
4560
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,20 +56,27 @@ class GPUTPCNNClusterizer : public GPUProcessor
5656
int nnClusterizerModelReg1NumOutputNodes = -1;
5757
int nnClusterizerModelReg2NumOutputNodes = -1;
5858
int nnInferenceInputDType = 0; // 0: float16, 1: float32
59+
int nnInferenceOutputDType = 0; // 0: float16, 1: float32
5960
int mISector = -1;
6061
int deviceId = -1;
6162

6263
// Memory allocation for neural network
63-
float* inputData32 = nullptr;
64-
OrtDataType::Float16_t* inputData16 = nullptr;
65-
float* outputDataClass = nullptr;
66-
float* modelProbabilities = nullptr;
67-
float* outputDataReg1 = nullptr;
68-
float* outputDataReg2 = nullptr;
6964

70-
ChargePos* peakPositions = nullptr;
71-
bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
72-
float* centralCharges = nullptr;
65+
bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
66+
int* outputDataClass = nullptr;
67+
68+
// FP32
69+
float* inputData_32 = nullptr;
70+
float* modelProbabilities_32 = nullptr;
71+
float* outputDataReg1_32 = nullptr;
72+
float* outputDataReg2_32 = nullptr;
73+
74+
// FP16
75+
OrtDataType::Float16_t* inputData_16 = nullptr;
76+
OrtDataType::Float16_t* modelProbabilities_16 = nullptr;
77+
OrtDataType::Float16_t* outputDataReg1_16 = nullptr;
78+
OrtDataType::Float16_t* outputDataReg2_16 = nullptr;
79+
7380
int16_t mMemoryId = -1;
7481
}; // class GPUTPCNNClusterizer
7582

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
149149
clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity;
150150
}
151151
clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos;
152+
clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos;
152153
clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1];
153154
if (!settings.nnClusterizerUseCfRegression) {
154155
if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) {
@@ -159,12 +160,3 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
159160
}
160161
}
161162
}
162-
163-
void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype)
164-
{
165-
if (dtype == 0) {
166-
model.inference<OrtDataType::Float16_t, float>(clustererNN.inputData16, size, output);
167-
} else {
168-
model.inference<float, float>(clustererNN.inputData32, size, output);
169-
}
170-
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,6 @@ class GPUTPCNNClusterizerHost
4343
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
4444
void loadFromCCDB(std::map<std::string, std::string>);
4545

46-
void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t);
47-
4846
std::unordered_map<std::string, std::string> OrtOptions;
4947
o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
5048
std::vector<bool> modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2

0 commit comments

Comments
 (0)