Skip to content

Commit 4998576

Browse files
committed
Adding verbosity and fixing off-by-one error
1 parent c09477e commit 4998576

File tree

3 files changed

+94
-5
lines changed

3 files changed

+94
-5
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 44 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -665,7 +665,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
665665
nnTimers[11] = &getTimer<GPUTPCNNClusterizer, 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
666666
}
667667

668-
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
668+
for (int32_t lane = 0; lane < numLanes; lane++) {
669669
nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
670670
if (nnApplications[lane].mModelsUsed[0]) {
671671
SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
@@ -706,10 +706,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706706
// nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
707707
(nnApplications[lane].mModelReg2).initSession();
708708
}
709-
if (nn_settings.nnClusterizerVerbosity < 3) {
709+
if (nn_settings.nnClusterizerVerbosity > 0) {
710710
LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
711711
}
712-
});
712+
};
713713
for (int32_t sector = 0; sector < NSECTORS; sector++) {
714714
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
715715
GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
@@ -724,12 +724,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
724724
clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
725725
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
726726
}
727+
if (nn_settings.nnClusterizerVerbosity > 2) {
728+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
729+
}
727730
AllocateRegisteredMemory(clustererNN.mMemoryId);
731+
if (nn_settings.nnClusterizerVerbosity > 2) {
732+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
733+
}
728734
// nnApplications[lane].createBoundary(clustererNNShadow);
729735
// nnApplications[lane].createIndexLookup(clustererNNShadow);
730736
}
731737
if (doGPU) {
738+
if (nn_settings.nnClusterizerVerbosity > 2) {
739+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory...";
740+
}
732741
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
742+
if (nn_settings.nnClusterizerVerbosity > 2) {
743+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Writing to constant memory done";
744+
}
733745
}
734746
}
735747
#endif
@@ -1010,9 +1022,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10101022
}
10111023

10121024
// float time_clusterizer = 0, time_fill = 0, time_networks = 0;
1025+
if (nn_settings.nnClusterizerVerbosity > 2) {
1026+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << ", numLoops=" << std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode) << ", numClusters=" << clusterer.mPmemory->counters.nClusters << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1027+
}
10131028
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) {
1029+
if (nn_settings.nnClusterizerVerbosity > 3) {
1030+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Start. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1031+
}
10141032
uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
1015-
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
1033+
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart - 1));
10161034

10171035
// Filling the data
10181036
if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
@@ -1022,9 +1040,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10221040
// Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
10231041
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
10241042
}
1043+
if (doGPU) { // This is to make sure that the network does not start the evaluation before all data is filled
1044+
SynchronizeStream(lane);
1045+
}
1046+
if (nn_settings.nnClusterizerVerbosity > 3) {
1047+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done filling data. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1048+
}
10251049

10261050
if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
10271051
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
1052+
if (nn_settings.nnClusterizerVerbosity > 3) {
1053+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done setting deconvolution flags. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1054+
}
10281055
}
10291056

10301057
// NN evaluations
@@ -1044,6 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10441071
}
10451072
}
10461073
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
1074+
if (nn_settings.nnClusterizerVerbosity > 3) {
1075+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1076+
}
10471077
}
10481078
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
10491079
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
@@ -1078,6 +1108,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10781108
}
10791109
if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
10801110
}
1111+
if (nn_settings.nnClusterizerVerbosity > 3) {
1112+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1113+
}
10811114
}
10821115

10831116
// Publishing kernels for class labels and regression results
@@ -1092,6 +1125,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10921125
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Publishing class 2 regression results
10931126
}
10941127
}
1128+
if (nn_settings.nnClusterizerVerbosity > 3) {
1129+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done publishing. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1130+
}
10951131
}
10961132

10971133
if (clustererNNShadow.mNnClusterizerUseCfRegression) {
@@ -1100,6 +1136,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11001136
}
11011137
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
11021138
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1139+
if (nn_settings.nnClusterizerVerbosity > 3) {
1140+
LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
1141+
}
11031142
}
11041143
#else
11051144
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
@@ -1202,7 +1241,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
12021241
}
12031242
for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
12041243
#ifdef GPUCA_HAS_ONNX
1205-
if (GetProcessingSettings().nn.applyNNclusterizer) {
1244+
if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().nn.nnClusterizerVerbosity > 0) {
12061245
LOG(info) << "(ORT) Environment releasing...";
12071246
GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
12081247
nnApplication.mModelClass.release(true);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,11 @@
1616
#include "ML/3rdparty/GPUORTFloat16.h"
1717
#include "GPUTPCNNClusterizer.h"
1818
#include "GPUSettings.h"
19+
#include "GPULogging.h"
20+
#include <cstdint> // uintptr_t
21+
#include <iomanip> // setprecision
22+
#include <ostream>
23+
#include <sstream>
1924

2025
using namespace o2::gpu;
2126

@@ -25,6 +30,8 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}
2530

2631
void* GPUTPCNNClusterizer::setIOPointers(void* mem)
2732
{
33+
// Keep track of the start address to compute how much memory we assign
34+
void* startMem = mem;
2835
if (mNnClusterizerBatchedMode > 0) {
2936
if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
3037
computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize);
@@ -62,6 +69,48 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
6269
if (mNnClusterizerTotalClusters > 0) {
6370
computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters);
6471
}
72+
73+
if (mNnClusterizerVerbosity > 2) {
74+
if (mNnClusterizerVerbosity > 3) {
75+
auto fmt = [](size_t bytes) {
76+
std::ostringstream os;
77+
double mb = bytes / (1024.0 * 1024.0);
78+
os << bytes << " bytes (" << std::fixed << std::setprecision(3) << mb << " MB)";
79+
return os.str();
80+
};
81+
82+
// Safely compute sizes only if corresponding pointer was allocated (and dimensions positive)
83+
size_t szClusterFlags = (mClusterFlags && mNnClusterizerBatchedMode > 0) ? (size_t)2 * mNnClusterizerBatchedMode * sizeof(int8_t) : 0;
84+
size_t szInput16 = (mInputData_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(OrtDataType::Float16_t) : 0;
85+
size_t szInput32 = (mInputData_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerElementSize > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerElementSize * sizeof(float) : 0;
86+
size_t szProb16 = (mModelProbabilities_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
87+
size_t szProb32 = (mModelProbabilities_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelClassNumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes * sizeof(float) : 0;
88+
size_t szReg1_16 = (mOutputDataReg1_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
89+
size_t szReg2_16 = (mOutputDataReg2_16 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(OrtDataType::Float16_t) : 0;
90+
size_t szReg1_32 = (mOutputDataReg1_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg1NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes * sizeof(float) : 0;
91+
size_t szReg2_32 = (mOutputDataReg2_32 && mNnClusterizerBatchedMode > 0 && mNnClusterizerModelReg2NumOutputNodes > 0) ? (size_t)mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes * sizeof(float) : 0;
92+
size_t szOutputDataClass = (mOutputDataClass && mNnClusterizerTotalClusters > 0) ? (size_t)mNnClusterizerTotalClusters * sizeof(int32_t) : 0;
93+
94+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Pointers set for clusterizer with memoryID " << mMemoryId << " deviceID " << mDeviceId << " and sector " << mISector;
95+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataClass pointer: " << mOutputDataClass << " | " << fmt(szOutputDataClass) << " MB";
96+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mClusterFlags pointer: " << static_cast<const void*>(mClusterFlags) << " | " << fmt(szClusterFlags) << " MB";
97+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_16 pointer: " << mInputData_16 << " | " << fmt(szInput16) << " MB";
98+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_16 pointer: " << mModelProbabilities_16 << " | " << fmt(szProb16) << " MB";
99+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_16 pointer: " << mOutputDataReg1_16 << " | " << fmt(szReg1_16) << " MB";
100+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_16 pointer: " << mOutputDataReg2_16 << " | " << fmt(szReg2_16) << " MB";
101+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mInputData_32 pointer: " << mInputData_32 << " | " << fmt(szInput32) << " MB";
102+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mModelProbabilities_32 pointer: " << mModelProbabilities_32 << " | " << fmt(szProb32) << " MB";
103+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg1_32 pointer: " << mOutputDataReg1_32 << " | " << fmt(szReg1_32) << " MB";
104+
LOG(info) << "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") mOutputDataReg2_32 pointer: " << mOutputDataReg2_32 << " | " << fmt(szReg2_32) << " MB";
105+
}
106+
// Compute allocated bytes (difference between advanced pointer and start pointer)
107+
size_t allocatedBytes = static_cast<size_t>(reinterpret_cast<uintptr_t>(mem) - reinterpret_cast<uintptr_t>(startMem));
108+
double allocatedMB = static_cast<double>(allocatedBytes) / (1024.0 * 1024.0);
109+
LOG(info) << std::fixed << std::setprecision(3)
110+
<< "(NNCLUS, GPUTPCNNClusterizer, this=" << this << ") Total scratch allocation in setIOPointers: " << allocatedBytes
111+
<< " bytes (" << allocatedMB << " MB)";
112+
}
113+
65114
return mem;
66115
}
67116

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -360,6 +360,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
360360
return;
361361
}
362362

363+
// For flag influence on cluster error setting: O2/GPU/GPUTracking/Base/GPUParam.inc -> UpdateClusterError2ByState
363364
bool notSinglePad = false, notSingleTime = false;
364365
for (uint16_t i = 0; i < 8; i++) {
365366
Delta2 d = cfconsts::InnerNeighbors[i];

0 commit comments

Comments
 (0)