Skip to content

Commit 566ddb7

Browse files
committed
Simplifications and renaming
1 parent 81c646b commit 566ddb7

File tree

8 files changed

+27
-26
lines changed

8 files changed

+27
-26
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -84,7 +84,7 @@ class OrtModel
8484

8585
// Environment settings
8686
bool mInitialized = false;
87-
std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
87+
std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
8888
int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;
8989

9090
std::string printShape(const std::vector<int64_t>&);

Common/ML/src/OrtInterface.cxx

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
4848
if (!optionsMap["model-path"].empty()) {
4949
modelPath = optionsMap["model-path"];
5050
device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
51-
dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
5251
deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
5352
allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
5453
intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 0 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -228,7 +228,6 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural
228228
AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
229229
AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
230230
AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
231-
AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
232231
AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16
233232
AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
234233
AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 10 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -641,7 +641,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
641641
} else {
642642
clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
643643
}
644-
clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
644+
clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos;
645645
nnApplication.initClusterizer(nn_settings, clustererNN);
646646
AllocateRegisteredMemory(clustererNN.mMemoryId);
647647
}
@@ -931,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
931931
size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
932932

933933
auto start0 = std::chrono::high_resolution_clock::now();
934-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data
934+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data
935935

936936
auto stop0 = std::chrono::high_resolution_clock::now();
937937
auto start1 = std::chrono::high_resolution_clock::now();
938-
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
938+
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType);
939939
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
940-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels
940+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
941941
} else {
942-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels
942+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
943943
}
944944

945945
if (!clustererNN.nnClusterizerUseCfRegression) {
946-
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
947-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1
946+
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType);
947+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1
948948
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
949-
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
950-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2
949+
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType);
950+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2
951951
}
952952
}
953953
auto stop1 = std::chrono::high_resolution_clock::now();
@@ -957,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
957957
}
958958
auto start1 = std::chrono::high_resolution_clock::now();
959959
if (clustererNN.nnClusterizerUseCfRegression) {
960-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
960+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
961961
}
962962
auto stop1 = std::chrono::high_resolution_clock::now();
963963
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -25,9 +25,9 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}
2525
void* GPUTPCNNClusterizer::setIOPointers(void* mem)
2626
{
2727
if (nnClusterizerBatchedMode > 0) {
28-
if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) {
28+
if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) {
2929
computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
30-
} else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) {
30+
} else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) {
3131
computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
3232
}
3333
computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -54,7 +54,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
5454
int nnClusterizerModelClassNumOutputNodes = -1;
5555
int nnClusterizerModelReg1NumOutputNodes = -1;
5656
int nnClusterizerModelReg2NumOutputNodes = -1;
57-
int nnClusterizerDtype = 0; // 0: float16, 1: float32
57+
int nnInferenceInputDType = 0; // 0: float16, 1: float32
5858
int mISector = -1;
5959

6060
// Memory allocation for neural network

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 1 addition & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -105,7 +105,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
105105
{"device", settings.nnInferenceDevice},
106106
{"device-id", std::to_string(settings.nnInferenceDeviceId)},
107107
{"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
108-
{"dtype", settings.nnInferenceDtype},
109108
{"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
110109
{"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
111110
{"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
@@ -134,7 +133,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
134133
{
135134
clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1];
136135
if (!settings.nnClusterizerUseCfRegression) {
137-
if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) {
136+
if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) {
138137
clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];
139138
} else {
140139
clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 12 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -125,20 +125,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
125125
template <>
126126
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
127127
{
128-
auto& clusterer = processors.tpcNNClusterer[sector];
128+
auto& clustererNN = processors.tpcNNClusterer[sector];
129129
uint glo_idx = get_global_id(0);
130-
uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes;
130+
uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes;
131131
float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
132132
uint class_label = 0;
133-
for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) {
133+
for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) {
134134
if (pIdx == elem_iterator) {
135-
current_max_prob = clusterer.modelProbabilities[pIdx];
135+
current_max_prob = clustererNN.modelProbabilities[pIdx];
136136
} else {
137-
class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label);
137+
class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label);
138138
}
139139
}
140-
// uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
141-
clusterer.outputDataClass[glo_idx + batchStart] = class_label;
140+
// uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
141+
clustererNN.outputDataClass[glo_idx + batchStart] = class_label;
142+
if (class_label > 1) {
143+
clustererNN.clusterFlags[2 * glo_idx] = 1;
144+
clustererNN.clusterFlags[2 * glo_idx + 1] = 1;
145+
}
142146
}
143147

144148
template <>
@@ -157,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
157161

158162
// LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
159163

160-
if (clustererNN.outputDataClass[full_glo_idx] == 1) {
164+
if (clustererNN.outputDataClass[full_glo_idx] == 1 || (clustererNN.nnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.outputDataClass[full_glo_idx] >= 1)) {
161165

162166
ClusterAccumulator pc;
163167

0 commit comments

Comments (0)