
Commit cc7210c

NN clusterizer: Fixing memory access faults (#14657)
* Adding verbosity and fixing off-by-one error
* Removing unnecessary include, using GPUCommonLogger to fix CI build
* GetGrid spawns more threads than actual number -> most probably explains out-of-bounds accesses and memory faults (see the bounds-guard sketch below)
* Fixing smem usage from CFClusterizer and adding rejection flag -> no out-of-bounds in QC anymore
* Adjusting kernels for GPU-safe rejection
* Please consider the following formatting changes
* Casting to avoid CI build failures
* Changing formatter to not use std::
* Remove usage of std::
* Adding back the runParallelOuterLoop
* Declaring CfChargePos as struct, not class

---------

Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 1f95ef5 commit cc7210c
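One bullet in the commit message above notes that GetGrid spawns more threads than there are elements to process, which most probably explains the out-of-bounds accesses. The usual remedy is an explicit index guard in the kernel body; the following is a generic sketch in the repository's kernel style with made-up names, not code from this commit:

// Hypothetical kernel body: when the launch grid is rounded up to a multiple of
// the block size, surplus threads with idx >= nElements must do nothing instead
// of touching memory they do not own.
GPUd() void processElementsSketch(float* data, uint32_t nElements)
{
  uint32_t idx = get_global_id(0); // may exceed nElements - 1 on a rounded-up grid
  if (idx >= nElements) {
    return; // surplus threads exit before any load or store
  }
  data[idx] *= 2.0f; // illustrative per-element work
}

The alternative taken in several places in this commit is to size the grid to the exact number of work items (see the GetGrid(iSize, lane) calls below), which avoids the surplus threads altogether.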

10 files changed, +399 -249 lines


Common/ML/src/OrtInterface.cxx

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM

 // Load from options map
 if (!optionsMap.contains("model-path")) {
-LOG(fatal) << "(ORT) Model path cannot be empty!";
+LOG(fatal) << "(ORT) Model path must be contained in options map!";
 }

 if (!optionsMap["model-path"].empty()) {
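A minimal call-site sketch for the stricter requirement, assuming a caller that fills the options map before invoking OrtModel::initOptions; the helper name and the surrounding setup are illustrative, not taken from this file:

#include <string>
#include <unordered_map>

// Hypothetical helper: the "model-path" key must be present in the options map,
// otherwise initOptions() now aborts with the fatal message above; an empty value
// is handled separately by the if (!optionsMap["model-path"].empty()) branch.
std::unordered_map<std::string, std::string> makeOrtOptions(const std::string& modelPath)
{
  std::unordered_map<std::string, std::string> options;
  options["model-path"] = modelPath; // required key
  return options;
}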

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 2 additions & 2 deletions
@@ -259,15 +259,15 @@ AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables gr
 AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime were possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
 AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
 AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
-AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnInferenceVerbosity, int, 2, "", 0, "0: All debugs; 1: Warnings + major debugs; 2: Warnings; >=3: No messages")
 AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
 AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
 AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
 AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
 AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
-AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
+AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed. Higher number = higher verbosity")
 AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
 AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
 AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")
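Note that this change inverts the nnInferenceVerbosity scale: lower values now mean more output. A small sketch of gating helpers derived directly from the new option description; the helper names are assumptions and not part of the repository:

// Hypothetical gating helpers for the new nnInferenceVerbosity semantics
// (0: all debugs; 1: warnings + major debugs; 2: warnings; >=3: no messages).
inline bool ortWarningsEnabled(int nnInferenceVerbosity) { return nnInferenceVerbosity <= 2; }
inline bool ortMajorDebugsEnabled(int nnInferenceVerbosity) { return nnInferenceVerbosity <= 1; }
inline bool ortAllDebugsEnabled(int nnInferenceVerbosity) { return nnInferenceVerbosity <= 0; }

nnClusterizerVerbosity keeps the opposite convention (higher number = more output), which is why the chain code below checks nnClusterizerVerbosity > 2 and > 3 for increasingly detailed messages.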

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 41 additions & 3 deletions
@@ -43,7 +43,6 @@
 #include "DataFormatsTPC/Digit.h"
 #include "DataFormatsTPC/Constants.h"
 #include "TPCBase/RDHUtils.h"
-#include "GPULogging.h"

 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"

@@ -706,7 +705,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
 (nnApplications[lane].mModelReg2).initSession();
 }
-if (nn_settings.nnClusterizerVerbosity < 3) {
+if (nn_settings.nnClusterizerVerbosity > 0) {
 LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
 }
 });

@@ -724,12 +723,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
 nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
 }
+if (nn_settings.nnClusterizerVerbosity > 2) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 AllocateRegisteredMemory(clustererNN.mMemoryId);
+if (nn_settings.nnClusterizerVerbosity > 2) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 // nnApplications[lane].createBoundary(clustererNNShadow);
 // nnApplications[lane].createIndexLookup(clustererNNShadow);
 }
 if (doGPU) {
+if (nn_settings.nnClusterizerVerbosity > 2) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
+}
 WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
+if (nn_settings.nnClusterizerVerbosity > 2) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
+}
 }
 }
 #endif

@@ -1010,7 +1021,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 }

 // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
+if (nn_settings.nnClusterizerVerbosity > 2) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
 size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

@@ -1022,9 +1039,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
 }
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}

 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 }

 // NN evaluations

@@ -1044,6 +1067,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 }
 }
 if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 }
 if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
 if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }

@@ -1078,9 +1104,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 }
 if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
 }
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 }

 // Publishing kernels for class labels and regression results
+// In case classification should not be used, this kernel should still be executed to fill the mOutputDataClass array with default values
 if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
 } else {

@@ -1092,6 +1122,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
 }
 }
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 }

 if (clustererNNShadow.mNnClusterizerUseCfRegression) {

@@ -1100,6 +1133,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 }
 DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+if (nn_settings.nnClusterizerVerbosity > 3) {
+  LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
+}
 }
 #else
 GPUFatal("Project not compiled with neural network clusterization. Aborting.");

@@ -1203,7 +1239,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
 #ifdef GPUCA_HAS_ONNX
 if (GetProcessingSettings().nn.applyNNclusterizer) {
-LOG(info) << "(ORT) Environment releasing...";
+if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
+  LOG(info) << "(ORT) Environment releasing...";
+}
 GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
 nnApplication.mModelClass.release(true);
 nnApplication.mModelReg1.release(true);

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx

Lines changed: 1 addition & 1 deletion
@@ -35,5 +35,5 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,

 tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;

-GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
+GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, true);
 }

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate
 template <int32_t iKernel = defaultKernel>
 GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t);

-static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D<PackedCharge>&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);
+static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const CfArray2D<PackedCharge>&, const CfChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*, int8_t);

 static GPUd() void buildCluster(const GPUSettingsRec&, const CfArray2D<PackedCharge>&, CfChargePos, CfChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);
GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc

Lines changed: 5 additions & 1 deletion
@@ -27,7 +27,8 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
 uint32_t maxClusterPerRow,
 uint32_t* clusterInRow,
 tpc::ClusterNative* clusterByRow,
-uint32_t* clusterPosInRow)
+uint32_t* clusterPosInRow,
+int8_t isAccepted)
 {
 uint32_t idx = get_global_id(0);

@@ -62,6 +63,9 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t
 tpc::ClusterNative myCluster;
 pc.finalize(pos, charge, fragment.start);
 bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), chargeMap);
+if (!isAccepted) {
+  rejectCluster = true;
+}

 if (rejectCluster) {
 if (clusterPosInRow) {
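The heuristic entry point in GPUTPCCFClusterizer.cxx above simply passes true, so its behaviour is unchanged; the purpose of the extra parameter appears to be the "GPU safe rejection" named in the commit message: a thread whose cluster was rejected (for example by the NN classification) can still run the whole kernel, including any collective shared-memory steps, and only skip publishing. A hedged sketch of what such a call site could look like; the classification array access and the threshold are assumptions, not taken from the files shown here:

// Hypothetical NN-side call (illustrative only): every thread calls
// computeClustersImpl, and rejection is expressed through the trailing flag
// rather than by an early return, so collective steps still see all threads.
int8_t isAccepted = (clustererNN.mOutputDataClass[idx] > 0); // assumed per-cluster classification result
GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0),
                                         clusterer, clusterer.mPmemory->fragment, smem, chargeMap,
                                         clusterer.mPfilteredPeakPositions, clusterer.Param().rec,
                                         CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters,
                                         clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow,
                                         clusterOut, clusterer.mPclusterPosInRow, isAccepted);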
