Skip to content

Commit 57862a6

Browse files
committed
Updating to use explicit calls to kernels instead of if-statements
1 parent 984857e commit 57862a6

File tree

6 files changed

+83
-49
lines changed

6 files changed

+83
-49
lines changed

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,7 @@ AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false)
316316
AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
317317
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
318318
AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
319+
AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSupression kernel before the digits to the network are filled")
319320
AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
320321
AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
321322
AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
896896
clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData;
897897
clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
898898
clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode;
899+
clusterer.nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue;
899900
if (GetProcessingSettings().nnClusterizerVerbosity < 0){
900901
clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity;
901902
} else {
@@ -955,25 +956,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
955956
clusterer.centralCharges.resize(iSize);
956957

957958
if (evalDtype == 1) {
958-
clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(GetProcessingSettings().nnClusterizerBoundaryFillValue));
959+
clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(clusterer.nnClusterizerBoundaryFillValue));
959960
} else {
960-
clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)GetProcessingSettings().nnClusterizerBoundaryFillValue));
961+
clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue));
961962
}
962963

963964
auto start0 = std::chrono::high_resolution_clock::now();
964-
runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0, batchStart); // Filling the data
965+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Filling the data
965966
auto stop0 = std::chrono::high_resolution_clock::now();
966967

967968
auto start1 = std::chrono::high_resolution_clock::now();
968969
GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype);
969-
runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 1, 0, batchStart); // Assigning class labels
970+
if (clusterer.model_class.getNumOutputNodes()[0][1] > 1){
971+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels
972+
} else {
973+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels
974+
}
970975

971976
if (!clusterer.nnClusterizerUseCFregression) {
972977
GPUTPCNNClusterizer::applyNetworkReg1(clusterer, evalDtype);
973-
runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 2, 0, batchStart); // Running the NN for regression class 1
978+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1
974979
if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) {
975980
GPUTPCNNClusterizer::applyNetworkReg2(clusterer, evalDtype);
976-
runKernel<GPUTPCNNClusterizer>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 4, 0, batchStart); // Running the NN for regression class 2
981+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2
977982
}
978983
}
979984
auto stop1 = std::chrono::high_resolution_clock::now();
@@ -985,7 +990,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
985990

986991
auto start1 = std::chrono::high_resolution_clock::now();
987992
if(clusterer.nnClusterizerUseCFregression) {
988-
runKernel<GPUTPCNNClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, -1, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
993+
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
989994
}
990995
auto stop1 = std::chrono::high_resolution_clock::now();
991996
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;

GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ class GPUTPCClusterFinder : public GPUProcessor
155155
int nnClusterizerUseCFregression = 0;
156156
int nnClusterizerBatchedMode = 1;
157157
int nnClusterizerVerbosity = 0;
158+
int nnClusterizerBoundaryFillValue = -1;
158159

159160
// Memory allocation for neural network
160161
uint class2_elements = 0;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 50 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,45 +27,59 @@ using namespace o2::gpu;
2727
using namespace o2::gpu::tpccf;
2828

2929
template <>
30-
GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
30+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
3131
{
3232
uint glo_idx = get_global_id(0);
33-
if (mode == -1) {
34-
if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
35-
return;
36-
}
37-
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
38-
CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
39-
tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
40-
o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
41-
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
42-
} else if (mode == 0){
43-
GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart);
44-
} else if (mode == 1) { // Class labels
45-
if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) {
46-
clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold);
47-
} else {
48-
auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]);
49-
uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1]));
50-
clusterer.outputDataClass[glo_idx + batchStart] = class_label;
51-
}
52-
} else if (mode == 2) { // Publishing for class 1 regression
53-
if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
54-
return;
55-
} else {
56-
GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart);
57-
}
58-
} else if (mode == 3) { // Refilling for class 2 regression -> Deprecated because it needs sequential accumulation
33+
if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
5934
return;
60-
} else if (mode == 4) { // Publishing for class 2 regression
61-
if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
62-
return;
63-
} else {
64-
GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart);
65-
}
66-
} else {
35+
}
36+
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
37+
CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
38+
tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
39+
o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
40+
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
41+
}
42+
43+
template <>
44+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
45+
{
46+
GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart);
47+
}
48+
49+
template <>
50+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
51+
{
52+
uint glo_idx = get_global_id(0);
53+
clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold);
54+
}
55+
56+
template <>
57+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
58+
{
59+
uint glo_idx = get_global_id(0);
60+
auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]);
61+
uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1]));
62+
clusterer.outputDataClass[glo_idx + batchStart] = class_label;
63+
}
64+
65+
template <>
66+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::publishClass1Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
67+
{
68+
uint glo_idx = get_global_id(0);
69+
if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
70+
return;
71+
}
72+
GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, onlyMC, batchStart);
73+
}
74+
75+
template <>
76+
GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::publishClass2Regression>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
77+
{
78+
uint glo_idx = get_global_id(0);
79+
if (glo_idx >= clusterer.mPmemory->counters.nClusters) {
6780
return;
6881
}
82+
GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, onlyMC, batchStart);
6983
}
7084

7185

@@ -188,7 +202,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
188202
}
189203

190204
// ---------------------------------
191-
GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
205+
GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
192206
{
193207
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
194208
CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
@@ -255,7 +269,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
255269
}
256270

257271
// ---------------------------------
258-
GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart)
272+
GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
259273
{
260274
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
261275
CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,22 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate
5555
return GPUDataTypes::RecoStep::TPCClusterFinding;
5656
}
5757

58-
// Float16 inmplementation
59-
template <int32_t iKernel = defaultKernel>
60-
GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, int8_t = 0, uint = 0);
58+
enum K : int32_t {
59+
runCfClusterizer = 0,
60+
fillInputNN = 1,
61+
determineClass1Labels = 2,
62+
determineClass2Labels = 3,
63+
publishClass1Regression = 4,
64+
publishClass2Regression = 5,
65+
};
66+
67+
template <int32_t iKernel = defaultKernel, typename... Args>
68+
GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, uint = 0, Args...);
6169

6270
static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint);
6371

64-
static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint);
65-
static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint);
72+
static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint);
73+
static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint);
6674

6775
static void applyNetworkClass(processorType&, int8_t = 0, uint = 0);
6876

GPU/GPUTracking/kernels.cmake

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS
110110
o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single)
111111
o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single)
112112
o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single)
113-
o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t mode int8_t onlyMC uint batchStart)
113+
o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
114+
o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
115+
o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
116+
o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
117+
o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
118+
o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart)
114119
o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC)
115120
o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single)
116121
o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out)

0 commit comments

Comments
 (0)