Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Common/ML/include/ML/OrtInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ class OrtModel

// Environment settings
bool mInitialized = false;
std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;

std::string printShape(const std::vector<int64_t>&);
Expand Down
14 changes: 6 additions & 8 deletions Common/ML/src/OrtInterface.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
if (!optionsMap["model-path"].empty()) {
modelPath = optionsMap["model-path"];
device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
Expand Down Expand Up @@ -226,19 +225,18 @@ template std::vector<OrtDataType::Float16_t> OrtModel::inference<OrtDataType::Fl
// Creates one input and one output Ort tensor from the caller-provided `input` / `output`
// pointers (each with a two-entry shape vector) and calls Run() on the session with an
// input count of 1.
//
// NOTE(review): this span is a rendered diff hunk, not compilable code. Several statements
// appear as two consecutive variants - presumably the removed line followed by its
// replacement. Confirm against the real OrtInterface.cxx before relying on either variant.
template <class I, class O>
void OrtModel::inference(I* input, size_t input_size, O* output)
{
// Variant 1: the first shape entry is input_size divided by the second model-input dimension.
std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
// Variant 2: the first shape entry is input_size itself.
// NOTE(review): input_size is size_t; placing it unconverted inside a braced
// std::vector<int64_t> initializer is a narrowing conversion - consider an explicit cast.
std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
// Constructed from nullptr first; the branch below assigns the actual tensor.
Ort::Value inputTensor = Ort::Value(nullptr);
// For OrtDataType::Float16_t the pointer is reinterpreted as Ort::Float16_t*; any other I is passed unchanged.
// In both branches variant 1 passes input_size as the count argument, variant 2 passes input_size * mInputShapes[0][1].
if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
inputTensor = Ort::Value::CreateTensor<Ort::Float16_t>(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
} else {
inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
inputTensor = Ort::Value::CreateTensor<I>(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
}

// Variant 1: first output shape entry is copied from inputShape[0]; the count is
// input_size * mOutputShapes[0][1] / mInputShapes[0][1].
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]);
Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
// Variant 2: first output shape entry is input_size; the count is input_size * mOutputShapes[0][1].
// NOTE(review): same size_t -> int64_t narrowing concern as for inputShape above.
std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());

// A single input tensor and a single output tensor are handed to Run(), while the output
// count argument is outputNamesChar.size().
// NOTE(review): confirm this is safe when the model declares more than one output name.
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here
}

template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*);
Expand Down
13 changes: 12 additions & 1 deletion GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural
AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16
AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
Expand All @@ -249,6 +250,16 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c
AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
// CCDB
AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally")
AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched")
AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks")
AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatenation of modes, e.g. c1:r1 (classification class 1, regression class 1)")
AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression")
AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN")
AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp")
AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].")
AddHelp("help", 'h')
EndConfig()

Expand Down
32 changes: 17 additions & 15 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}

#ifdef GPUCA_HAS_ONNX
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; technically ONNX ->Run() is thread-safe at inference time since it is read-only
if (GetProcessingSettings().nn.applyNNclusterizer) {
uint32_t maxClusters = -1;
uint32_t maxClusters = 0;
nnApplication.init(nn_settings);
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters);
}
for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
Expand All @@ -639,8 +641,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
} else {
clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
}
clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos;
nnApplication.initClusterizer(nn_settings, clustererNN);
AllocateRegisteredMemory(clustererNN.mMemoryId);
}
}
Expand Down Expand Up @@ -916,7 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
#ifdef GPUCA_HAS_ONNX
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
int withMC = (doGPU && propagateMCLabels);

if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
Expand All @@ -929,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

auto start0 = std::chrono::high_resolution_clock::now();
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data

auto stop0 = std::chrono::high_resolution_clock::now();
auto start1 = std::chrono::high_resolution_clock::now();
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType);
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
} else {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
}

if (!clustererNN.nnClusterizerUseCfRegression) {
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType);
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType);
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2
}
}
auto stop1 = std::chrono::high_resolution_clock::now();
Expand All @@ -955,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
}
auto start1 = std::chrono::high_resolution_clock::now();
if (clustererNN.nnClusterizerUseCfRegression) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
}
auto stop1 = std::chrono::high_resolution_clock::now();
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
Expand Down
38 changes: 21 additions & 17 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}

// Passes `mem` together with each neural-network buffer member and an element count to
// computePointerWithAlignment, then returns `mem`.
// Presumably each call assigns the member pointer and advances `mem` past it - the helper
// is not visible here, TODO confirm.
//
// NOTE(review): this span is a rendered diff hunk, not compilable code. It interleaves two
// variants of the body (presumably removed lines followed by their replacements), so the
// braces do not balance as shown. Confirm against the real GPUTPCNNClusterizer.cxx.
void* GPUTPCNNClusterizer::setIOPointers(void* mem)
{
// --- Variant keyed on nnClusterizerDtype ---
// Value 0 selects inputData16, value 1 selects inputData32; neither is laid out when
// nnClusterizerElementSize <= 0 or the value is something else.
if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) {
computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
} else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) {
computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
}
// These buffers are requested unconditionally in this variant.
computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
if (nnClusterizerModelClassNumOutputNodes > 0) {
computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
}
if (!nnClusterizerUseCfRegression) {
if (nnClusterizerModelReg1NumOutputNodes > 0) {
computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
// --- Variant keyed on nnInferenceInputDType ---
// All per-batch buffers are requested only when nnClusterizerBatchedMode > 0.
if (nnClusterizerBatchedMode > 0) {
if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) {
computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
} else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) {
computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
}
// NOTE(review): the next two lines presumably belong to the nnClusterizerDtype variant
// and are only placed here by the diff rendering - confirm.
if (nnClusterizerModelReg2NumOutputNodes > 0) {
computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
if (nnClusterizerModelClassNumOutputNodes > 0) {
computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
}
// Regression output buffers are requested only when CF regression is not used.
if (!nnClusterizerUseCfRegression) {
if (nnClusterizerModelReg1NumOutputNodes > 0) {
computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
}
if (nnClusterizerModelReg2NumOutputNodes > 0) {
computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
}
}
}
// outputDataClass is sized by nnClusterizerTotalClusters rather than the batch size and
// is requested only when that count is positive.
if (nnClusterizerTotalClusters > 0) {
computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
}
return mem;
}
Expand Down
5 changes: 2 additions & 3 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
int nnClusterizerSizeInputTime = 3;
int nnClusterizerElementSize = -1;
bool nnClusterizerAddIndexData = true;
float nnClassThreshold = 0.16;
float nnClassThreshold = 0.01;
bool nnSigmoidTrafoClassThreshold = 1;
int nnClusterizerUseCfRegression = 0;
int nnClusterizerBatchedMode = 1;
Expand All @@ -54,11 +54,10 @@ class GPUTPCNNClusterizer : public GPUProcessor
int nnClusterizerModelClassNumOutputNodes = -1;
int nnClusterizerModelReg1NumOutputNodes = -1;
int nnClusterizerModelReg2NumOutputNodes = -1;
int nnClusterizerDtype = 0; // 0: float16, 1: float32
int nnInferenceInputDType = 0; // 0: float16, 1: float32
int mISector = -1;

// Memory allocation for neural network
uint class2_elements = 0;
float* inputData32 = nullptr;
OrtDataType::Float16_t* inputData16 = nullptr;
float* outputDataClass = nullptr;
Expand Down
Loading