Skip to content

Commit d87f8fa

Browse files
committed
GetGrid spawns more threads than the actual number -> most probably explains the out-of-bounds accesses and memory faults
1 parent 5368d52 commit d87f8fa

File tree

6 files changed

+119
-53
lines changed

6 files changed

+119
-53
lines changed

Common/ML/src/OrtInterface.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsM
5454

5555
// Load from options map
5656
if (!optionsMap.contains("model-path")) {
57-
LOG(fatal) << "(ORT) Model path cannot be empty!";
57+
LOG(fatal) << "(ORT) Model path must be contained in options map!";
5858
}
5959

6060
if (!optionsMap["model-path"].empty()) {

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,15 +259,15 @@ AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables gr
259259
AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime were possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
260260
AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
261261
AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
262-
AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
262+
AddOption(nnInferenceVerbosity, int, 2, "", 0, "0: All debugs; 1: Warnings + major debugs; 2: Warnings; >=3: No messages")
263263
AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
264264
AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
265265
AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
266266
AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
267267
AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
268268
AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
269269
AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
270-
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
270+
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed. Higher number = higher verbosity")
271271
AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
272272
AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
273273
AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
708708
if (nn_settings.nnClusterizerVerbosity > 0) {
709709
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
710710
}
711-
};
711+
}
712712
for (int32_t sector = 0; sector < NSECTORS; sector++) {
713713
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
714714
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
@@ -1029,7 +1029,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10291029
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
10301030
}
10311031
uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
1032-
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart - 1));
1032+
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
10331033

10341034
// Filling the data
10351035
if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
@@ -1039,9 +1039,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10391039
// Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
10401040
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
10411041
}
1042-
if (doGPU) { // This is to make sure that the network does not start the evaluation before all data is filled
1043-
SynchronizeStream(lane);
1044-
}
10451042
if (nn_settings.nnClusterizerVerbosity > 3) {
10461043
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
10471044
}
@@ -1240,8 +1237,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
12401237
}
12411238
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
12421239
#ifdef GPUCA_HAS_ONNX
1243-
if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
1244-
LOG(info) << "(ORT) Environment releasing...";
1240+
if (GetProcessingSettings().nn.applyNNclusterizer) {
1241+
if (GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
1242+
LOG(info) << "(ORT) Environment releasing...";
1243+
}
12451244
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
12461245
nnApplication.mModelClass.release(true);
12471246
nnApplication.mModelReg1.release(true);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -69,35 +69,67 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
6969
if (mNnClusterizerVerbosity > 2) {
7070
if (mNnClusterizerVerbosity > 3) {
7171
auto fmt = [](size_t bytes) {
72-
std::ostringstream os;
73-
double mb = bytes / (1024.0 * 1024.0);
74-
os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
75-
return os.str();
72+
std::ostringstream os;
73+
double mb = bytes / (1024.0 * 1024.0);
74+
os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
75+
return os.str();
7676
};
7777

78-
// Safely compute sizes only if corresponding pointer was allocated (and dimensions positive)
79-
size_t szClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode * sizeof(int8_t) : 0;
80-
size_t szInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(OrtDataType::Float16_t) : 0;
81-
size_t szInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(float) : 0;
82-
size_t szProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
83-
size_t szProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(float) : 0;
84-
size_t szReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
85-
size_t szReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
86-
size_t szReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(float) : 0;
87-
size_t szReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(float) : 0;
88-
size_t szOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters * sizeof(int32_t) : 0;
78+
// Element counts (number of array entries, not bytes)
79+
size_t elemsClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode : 0;
80+
size_t elemsInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
81+
size_t elemsInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize : 0;
82+
size_t elemsProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
83+
size_t elemsProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes : 0;
84+
size_t elemsReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
85+
size_t elemsReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
86+
size_t elemsReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes : 0;
87+
size_t elemsReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes : 0;
88+
size_t elemsOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters : 0;
89+
90+
// Byte sizes
91+
size_t szClusterFlags = elemsClusterFlags * sizeof(int8_t);
92+
size_t szInput16 = elemsInput16 * sizeof(OrtDataType::Float16_t);
93+
size_t szInput32 = elemsInput32 * sizeof(float);
94+
size_t szProb16 = elemsProb16 * sizeof(OrtDataType::Float16_t);
95+
size_t szProb32 = elemsProb32 * sizeof(float);
96+
size_t szReg1_16 = elemsReg1_16 * sizeof(OrtDataType::Float16_t);
97+
size_t szReg2_16 = elemsReg2_16 * sizeof(OrtDataType::Float16_t);
98+
size_t szReg1_32 = elemsReg1_32 * sizeof(float);
99+
size_t szReg2_32 = elemsReg2_32 * sizeof(float);
100+
size_t szOutputDataClass = elemsOutputDataClass * sizeof(int32_t);
89101

90102
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
91-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass << " | " << fmt(szOutputDataClass) << " MB";
92-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast<const void*>(mClusterFlags) << " | " << fmt(szClusterFlags) << " MB";
93-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 << " | " << fmt(szInput16) << " MB";
94-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16 << " | " << fmt(szProb16) << " MB";
95-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16 << " | " << fmt(szReg1_16) << " MB";
96-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16 << " | " << fmt(szReg2_16) << " MB";
97-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32 << " | " << fmt(szInput32) << " MB";
98-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32 << " | " << fmt(szProb32) << " MB";
99-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32 << " | " << fmt(szReg1_32) << " MB";
100-
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32 << " | " << fmt(szReg2_32) << " MB";
103+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass
104+
<< " | elements=" << elemsOutputDataClass << " (= mNnClusterizerTotalClusters)"
105+
<< " | " << fmt(szOutputDataClass);
106+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast<const void*>(mClusterFlags)
107+
<< " | elements=" << elemsClusterFlags << " (= 2 * mNnClusterizerBatchedMode)"
108+
<< " | " << fmt(szClusterFlags);
109+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16
110+
<< " | elements=" << elemsInput16 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
111+
<< " | " << fmt(szInput16);
112+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16
113+
<< " | elements=" << elemsProb16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
114+
<< " | " << fmt(szProb16);
115+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16
116+
<< " | elements=" << elemsReg1_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
117+
<< " | " << fmt(szReg1_16);
118+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16
119+
<< " | elements=" << elemsReg2_16 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
120+
<< " | " << fmt(szReg2_16);
121+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32
122+
<< " | elements=" << elemsInput32 << " (= mNnClusterizerBatchedMode * mNnClusterizerElementSize)"
123+
<< " | " << fmt(szInput32);
124+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32
125+
<< " | elements=" << elemsProb32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes)"
126+
<< " | " << fmt(szProb32);
127+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32
128+
<< " | elements=" << elemsReg1_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes)"
129+
<< " | " << fmt(szReg1_32);
130+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32
131+
<< " | elements=" << elemsReg2_32 << " (= mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes)"
132+
<< " | " << fmt(szReg2_32);
101133
}
102134
// Compute allocated bytes (difference between advanced pointer and start pointer)
103135
size_t allocatedBytes = static_cast<size_t>(reinterpret_cast<uintptr_t>(mem) - reinterpret_cast<uintptr_t>(startMem));

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
5050
int32_t mNnClusterizerUseCfRegression = 0;
5151
int32_t mNnClusterizerBatchedMode = 1;
5252
int32_t mNnClusterizerTotalClusters = 1;
53-
int32_t mNnClusterizerVerbosity = 0;
53+
int32_t mNnClusterizerVerbosity = 1;
5454
int32_t mNnClusterizerBoundaryFillValue = -1;
5555
int32_t mNnClusterizerModelClassNumOutputNodes = -1;
5656
int32_t mNnClusterizerModelReg1NumOutputNodes = -1;

0 commit comments

Comments
 (0)