Skip to content

Commit da3a178

Browse files
NN clusterizer: Improve filling kernel speed (#14510)
* First version of lookup tables * Simplifying computations + bug-fixes * Fixes for indexing and offsets * Adjusting CPU kernel * Please consider the following formatting changes * Fix for row-number access * Please consider the following formatting changes * Improve kernel speed by ~15%. Next test: for-loop in pad direction for coalesced access * Improving kernel speed by 30% compared to original version. Next try: for-loop over row dimension as access is somewhat coalesced too * Please consider the following formatting changes * Minor improvements for MC handling * Beautifications to trigger the CI * Compile-fix * Fix int32_t error in fullCI build --------- Co-authored-by: ALICE Action Bot <alibuild@cern.ch>
1 parent 9b9ef9a commit da3a178

File tree

7 files changed

+289
-140
lines changed

7 files changed

+289
-140
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706706
nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
707707
}
708708
AllocateRegisteredMemory(clustererNN.mMemoryId);
709+
// nnApplications[lane].createBoundary(clustererNNShadow);
710+
// nnApplications[lane].createIndexLookup(clustererNNShadow);
709711
});
710712
if (doGPU) {
711713
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -65,33 +65,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
6565
return mem;
6666
}
6767

68-
// std::vector<int32_t> GPUTPCNNClusterizer::pointerSizes() {
69-
// std::vector<int32_t> sizes(7, -1);
70-
// if (mNnClusterizerBatchedMode > 0) {
71-
// if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
72-
// sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16
73-
// } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
74-
// sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32
75-
// }
76-
// sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags
77-
// if (mNnClusterizerModelClassNumOutputNodes > 0) {
78-
// sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities
79-
// }
80-
// if (!mNnClusterizerUseCfRegression) {
81-
// if (mNnClusterizerModelReg1NumOutputNodes > 0) {
82-
// sizes[4] = mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1
83-
// }
84-
// if (mNnClusterizerModelReg2NumOutputNodes > 0) {
85-
// sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2
86-
// }
87-
// }
88-
// }
89-
// if (mNnClusterizerTotalClusters > 0) {
90-
// sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass
91-
// }
92-
// return sizes;
93-
// }
94-
9568
void GPUTPCNNClusterizer::RegisterMemoryAllocation()
9669
{
9770
AllocateAndInitializeLate();

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,51 @@ class GPUTPCNNClusterizer : public GPUProcessor
3737

3838
// Neural network clusterization
3939

40-
int mNnClusterizerSizeInputRow = 3;
41-
int mNnClusterizerSizeInputPad = 3;
42-
int mNnClusterizerSizeInputTime = 3;
43-
int mNnClusterizerElementSize = -1;
44-
bool mNnClusterizerAddIndexData = true;
40+
int32_t mNnClusterizerSizeInputRow = 3;
41+
int32_t mNnClusterizerSizeInputPad = 3;
42+
int32_t mNnClusterizerSizeInputTime = 3;
43+
int32_t mNnClusterizerChargeArraySize = -1;
44+
int32_t mNnClusterizerElementSize = -1;
45+
int8_t mNnClusterizerAddIndexData = 1;
4546
float mNnClassThreshold = 0.01;
46-
bool mNnSigmoidTrafoClassThreshold = 1;
47-
bool mNnClusterizerSetDeconvolutionFlags = true;
48-
int mNnClusterizerUseCfRegression = 0;
49-
int mNnClusterizerBatchedMode = 1;
50-
int mNnClusterizerTotalClusters = 1;
51-
int mNnClusterizerVerbosity = 0;
52-
int mNnClusterizerBoundaryFillValue = -1;
53-
int mNnClusterizerModelClassNumOutputNodes = -1;
54-
int mNnClusterizerModelReg1NumOutputNodes = -1;
55-
int mNnClusterizerModelReg2NumOutputNodes = -1;
56-
int mNnInferenceInputDType = 0; // 0: float16, 1: float32
57-
int mNnInferenceOutputDType = 0; // 0: float16, 1: float32
58-
int mISector = -1;
59-
int mDeviceId = -1;
47+
int8_t mNnSigmoidTrafoClassThreshold = 1;
48+
int8_t mNnClusterizerSetDeconvolutionFlags = 1;
49+
int32_t mNnClusterizerUseCfRegression = 0;
50+
int32_t mNnClusterizerBatchedMode = 1;
51+
int32_t mNnClusterizerTotalClusters = 1;
52+
int32_t mNnClusterizerVerbosity = 0;
53+
int32_t mNnClusterizerBoundaryFillValue = -1;
54+
int32_t mNnClusterizerModelClassNumOutputNodes = -1;
55+
int32_t mNnClusterizerModelReg1NumOutputNodes = -1;
56+
int32_t mNnClusterizerModelReg2NumOutputNodes = -1;
57+
int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32
58+
int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32
59+
int32_t mISector = -1;
60+
int32_t mDeviceId = -1;
61+
62+
// GPU optimizations
63+
uint32_t mNnClusterizerFullRowSize = 0;
64+
uint32_t mNnClusterizerFullPadSize = 0;
65+
uint32_t mNnClusterizerFullTimeSize = 0;
66+
uint32_t mNnClusterizerPadTimeSize = 0;
67+
uint32_t mNnClusterizerRowTimeSize = 0;
68+
uint32_t mNnClusterizerRowTimeSizeFull = 0;
69+
70+
// Boundary lookup table
71+
// int32_t mBoundaryMapSizeRow = 0;
72+
// int32_t mBoundaryMapSizePadsPerRow = 0;
73+
// int32_t mBoundaryMapSize = 0;
74+
// int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
75+
// int8_t* mIsBoundary = nullptr;
76+
77+
// Index lookup table
78+
// int32_t mIndexLookupSize = 0;
79+
// int32_t* mIndexLookup = nullptr;
6080

6181
// Memory allocation for neural network
6282

63-
bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr
64-
int* mOutputDataClass = nullptr;
83+
int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr
84+
int32_t* mOutputDataClass = nullptr;
6585

6686
// FP32
6787
float* mInputData_32 = nullptr;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "GPUSettings.h"
2020
#include "ML/3rdparty/GPUORTFloat16.h"
2121
#include "GPUReconstruction.h"
22+
#include "GPUTPCGeometry.h"
23+
#include "DataFormatsTPC/Constants.h"
2224

2325
#ifdef GPUCA_HAS_ONNX
2426
#include <onnxruntime_cxx_api.h>
@@ -87,8 +89,20 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
8789
clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
8890
clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
8991
clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
92+
clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
93+
clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
94+
clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
95+
clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
96+
clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
97+
clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize;
98+
clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
99+
clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
100+
// clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
101+
// clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
102+
// clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
103+
// clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
104+
// clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
90105
clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
91-
clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0);
92106
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
93107
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
94108
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
@@ -116,6 +130,39 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
116130
}
117131
}
118132

133+
// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
134+
// {
135+
// // Call after init of the clustererNN elements
136+
// for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
137+
// int8_t skipCheckInRow = 0;
138+
// for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
139+
// int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
140+
// clustererNN.mIsBoundary[i] = 1;
141+
// if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
142+
// if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
143+
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
144+
// } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
145+
// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
146+
// }
147+
// skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
148+
// }
149+
// }
150+
// }
151+
// }
152+
153+
// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
154+
// {
155+
// for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
156+
// int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
157+
// int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
158+
// int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
159+
// int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
160+
// clustererNN.mIndexLookup[3 * i] = r;
161+
// clustererNN.mIndexLookup[3 * i + 1] = p;
162+
// clustererNN.mIndexLookup[3 * i + 2] = t;
163+
// }
164+
// }
165+
119166
// MockedOrtAllocator implementation to be able to use volatile assignment
120167
struct MockedOrtAllocator : OrtAllocator {
121168
MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost
4949

5050
void init(const GPUSettingsProcessingNNclusterizer&);
5151
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
52+
void createBoundary(GPUTPCNNClusterizer&);
53+
void createIndexLookup(GPUTPCNNClusterizer&);
5254

5355
// ONNX
5456
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);

0 commit comments

Comments
 (0)