Skip to content

Commit 69f6737

Browse files
NN clusterizer: Bug-fixes and adding deterministic mode (#14530)
* Adding first version of kernel timers * Removing GPU_CONFIG_KEY from dpl-workflow.sh to set my own values * Bug fixes * undoing changes in dpl-workflow.sh * Further fixes and beautifications * Please consider the following formatting changes * Removing unused timers * Moving Stop() of classification timer * Adding force method to fill input like it is done on GPU * Removing unnecessary static asserts * Adding deterministic mode (unfortunately that did not make it deterministic on GPU -> general problem with ONNX) * Please consider the following formatting changes * Adjusting for comment * Adding deterministic mode * Please consider the following formatting changes --------- Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 6f47846 commit 69f6737

File tree

9 files changed

+65
-39
lines changed

9 files changed

+65
-39
lines changed

Common/ML/include/ML/OrtInterface.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ class OrtModel
116116
int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs
117117

118118
// Environment settings
119-
bool mInitialized = false;
119+
bool mInitialized = false, mDeterministicMode = false;
120120
std::string mModelPath, mEnvName = "", mDeviceType = "CPU", mThreadAffinity = ""; // device options should be cpu, rocm, migraphx, cuda
121121
int32_t mIntraOpNumThreads = 1, mInterOpNumThreads = 1, mDeviceId = -1, mEnableProfiling = 0, mLoggingLevel = 0, mAllocateDeviceMemory = 0, mEnableOptimizations = 0;
122122

Common/ML/src/OrtInterface.cxx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
6868
mEnableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0);
6969
mEnableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0);
7070
mEnvName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference");
71+
mDeterministicMode = (optionsMap.contains("deterministic-compute") ? std::stoi(optionsMap["deterministic-compute"]) : 0);
7172

7273
if (mDeviceType == "CPU") {
7374
(mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads);
@@ -99,6 +100,10 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
99100
(mPImplOrt->sessionOptions).DisableProfiling();
100101
}
101102

103+
if (mDeterministicMode > 0) {
104+
(mPImplOrt->sessionOptions).AddConfigEntry("session_options.use_deterministic_compute", "1");
105+
}
106+
102107
(mPImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(mEnableOptimizations));
103108
(mPImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(mLoggingLevel));
104109

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Spe
256256
AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
257257
AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
258258
AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
259+
AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime where possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
259260
AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
260261
AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
261262
AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
@@ -275,6 +276,8 @@ AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters wil
275276
AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
276277
AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
277278
AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatenation of modes, e.g. c1:r1 (classification class 1, regression class 1)")
279+
AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification")
280+
AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, "Forces to use the fillInputNNGPU function")
278281
// CCDB
279282
AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally")
280283
AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched")

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -641,13 +641,30 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
641641
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
642642
GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
643643

644+
// Maximum of 4 lanes supported
645+
HighResTimer* nnTimers[12];
646+
if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().debugLevel >= 1) {
647+
nnTimers[0] = &getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
648+
nnTimers[1] = &getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
649+
nnTimers[2] = &getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
650+
nnTimers[3] = &getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
651+
nnTimers[4] = &getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
652+
nnTimers[5] = &getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
653+
nnTimers[6] = &getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
654+
nnTimers[7] = &getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
655+
nnTimers[8] = &getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
656+
nnTimers[9] = &getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
657+
nnTimers[10] = &getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
658+
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
659+
}
660+
644661
if (GetProcessingSettings().nn.applyNNclusterizer) {
645662
int32_t deviceId = -1;
646663
int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
647664
int32_t maxThreads = mRec->getNKernelHostThreads(true);
648665
// bool recreateMemoryAllocator = false;
649666
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
650-
nnApplications[lane].init(nn_settings);
667+
nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
651668
if (nnApplications[lane].mModelsUsed[0]) {
652669
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
653670
(nnApplications[lane].mModelClass).setDeviceId(deviceId);
@@ -993,9 +1010,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
9931010
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
9941011

9951012
// Filling the data
996-
if (mRec->IsGPU()) {
1013+
if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
9971014
// Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
998-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
1015+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
9991016
} else {
10001017
// Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
10011018
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
@@ -1006,20 +1023,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10061023
}
10071024

10081025
// NN evaluations
1009-
if (clustererNNShadow.mNnInferenceInputDType == 0) {
1010-
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1011-
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
1012-
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1013-
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
1014-
}
1015-
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1016-
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1017-
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
1018-
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1019-
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
1026+
if(clustererNNShadow.mNnClusterizerUseClassification) {
1027+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }
1028+
if (clustererNNShadow.mNnInferenceInputDType == 0) {
1029+
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1030+
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
1031+
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1032+
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
1033+
}
1034+
} else if (clustererNNShadow.mNnInferenceInputDType == 1) {
1035+
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
1036+
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
1037+
} else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
1038+
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
1039+
}
10201040
}
1041+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
10211042
}
10221043
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1044+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
10231045
if (clustererNNShadow.mNnInferenceInputDType == 0) {
10241046
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10251047
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1033,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10331055
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
10341056
}
10351057
}
1058+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }
10361059
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1060+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }
10371061
if (clustererNNShadow.mNnInferenceInputDType == 0) {
10381062
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10391063
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1047,6 +1071,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10471071
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
10481072
}
10491073
}
1074+
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
10501075
}
10511076
}
10521077

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
4343
int32_t mNnClusterizerChargeArraySize = -1;
4444
int32_t mNnClusterizerElementSize = -1;
4545
int8_t mNnClusterizerAddIndexData = 1;
46+
int8_t mNnClusterizerUseClassification = 1;
4647
float mNnClassThreshold = 0.01;
4748
int8_t mNnSigmoidTrafoClassThreshold = 1;
4849
int8_t mNnClusterizerSetDeconvolutionFlags = 1;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
using namespace o2::gpu;
3030

31-
void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
31+
void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
3232
{
3333
std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
3434
std::vector<std::string> reg_model_paths_local;
@@ -54,6 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
5454
{"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
5555
{"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
5656
{"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
57+
{"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
5758
{"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
5859
{"profiling-output-path", settings.nnInferenceOrtProfilingPath},
5960
{"logging-level", std::to_string(settings.nnInferenceVerbosity)},
@@ -106,6 +107,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
106107
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
107108
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
108109
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
110+
clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
109111
clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
110112
if (clustererNN.mNnSigmoidTrafoClassThreshold) {
111113
clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,9 @@ class GPUTPCNNClusterizerHost
4545
{
4646
public:
4747
GPUTPCNNClusterizerHost() = default;
48-
GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); }
48+
GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); }
4949

50-
void init(const GPUSettingsProcessingNNclusterizer&);
50+
void init(const GPUSettingsProcessingNNclusterizer&, bool = false);
5151
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
5252
void createBoundary(GPUTPCNNClusterizer&);
5353
void createIndexLookup(GPUTPCNNClusterizer&);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -117,18 +117,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
117117
}
118118

119119
if (clustererNN.mNnClusterizerAddIndexData) {
120-
float sector_norm = sector / 36.f;
121-
float row_norm = row / 152.f;
122-
float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
123-
124120
if (dtype == 0) {
125-
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
126-
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
127-
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
121+
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
122+
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
123+
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
128124
} else {
129-
clustererNN.mInputData_32[write_idx] = sector_norm;
130-
clustererNN.mInputData_32[write_idx + 1] = row_norm;
131-
clustererNN.mInputData_32[write_idx + 2] = pad_norm;
125+
clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
126+
clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
127+
clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
132128
}
133129
}
134130

@@ -178,8 +174,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
178174
uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
179175

180176
float index_values[3] = {
181-
sector / 36.f,
182-
row / 152.f,
177+
static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
178+
static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
183179
static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
184180

185181
if (dtype == 0) {
@@ -335,11 +331,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
335331
return;
336332
}
337333

338-
tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
334+
tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
339335

340336
// LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
341337

342-
if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes != -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
338+
if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerUseClassification <= 0)) {
343339

344340
ClusterAccumulator pc;
345341

@@ -451,7 +447,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
451447

452448
uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
453449

454-
if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
450+
if ((clustererNN.mOutputDataClass[full_glo_idx] > 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) {
455451

456452
ClusterAccumulator pc;
457453

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
3838
{
3939
public:
4040
// Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
41-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
42-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
43-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
44-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
45-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
46-
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
4741
static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);
4842
struct GPUSharedMemory {
4943
// Regular cluster finder

0 commit comments

Comments
 (0)