Skip to content

Commit 290ba0e

Browse files
committed
First version of lookup tables
1 parent d9d6894 commit 290ba0e

File tree

6 files changed

+94
-50
lines changed

6 files changed

+94
-50
lines changed

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
6161
}
6262
if (mNnClusterizerTotalClusters > 0) {
6363
computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters);
64+
computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize);
65+
computePointerWithAlignment(mem, mIndexLookup, mIndexLookupSize);
6466
}
6567
return mem;
6668
}

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -37,31 +37,42 @@ class GPUTPCNNClusterizer : public GPUProcessor
3737

3838
// Neural network clusterization
3939

40-
int mNnClusterizerSizeInputRow = 3;
41-
int mNnClusterizerSizeInputPad = 3;
42-
int mNnClusterizerSizeInputTime = 3;
43-
int mNnClusterizerElementSize = -1;
44-
bool mNnClusterizerAddIndexData = true;
40+
int32_t mNnClusterizerSizeInputRow = 3;
41+
int32_t mNnClusterizerSizeInputPad = 3;
42+
int32_t mNnClusterizerSizeInputTime = 3;
43+
int32_t mNnClusterizerChargeArraySize = -1;
44+
int32_t mNnClusterizerElementSize = -1;
45+
int8_t mNnClusterizerAddIndexData = 1;
4546
float mNnClassThreshold = 0.01;
46-
bool mNnSigmoidTrafoClassThreshold = 1;
47-
bool mNnClusterizerSetDeconvolutionFlags = true;
48-
int mNnClusterizerUseCfRegression = 0;
49-
int mNnClusterizerBatchedMode = 1;
50-
int mNnClusterizerTotalClusters = 1;
51-
int mNnClusterizerVerbosity = 0;
52-
int mNnClusterizerBoundaryFillValue = -1;
53-
int mNnClusterizerModelClassNumOutputNodes = -1;
54-
int mNnClusterizerModelReg1NumOutputNodes = -1;
55-
int mNnClusterizerModelReg2NumOutputNodes = -1;
56-
int mNnInferenceInputDType = 0; // 0: float16, 1: float32
57-
int mNnInferenceOutputDType = 0; // 0: float16, 1: float32
58-
int mISector = -1;
59-
int mDeviceId = -1;
47+
int8_t mNnSigmoidTrafoClassThreshold = 1;
48+
int8_t mNnClusterizerSetDeconvolutionFlags = 1;
49+
int32_t mNnClusterizerUseCfRegression = 0;
50+
int32_t mNnClusterizerBatchedMode = 1;
51+
int32_t mNnClusterizerTotalClusters = 1;
52+
int32_t mNnClusterizerVerbosity = 0;
53+
int32_t mNnClusterizerBoundaryFillValue = -1;
54+
int32_t mNnClusterizerModelClassNumOutputNodes = -1;
55+
int32_t mNnClusterizerModelReg1NumOutputNodes = -1;
56+
int32_t mNnClusterizerModelReg2NumOutputNodes = -1;
57+
int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32
58+
int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32
59+
int32_t mISector = -1;
60+
int32_t mDeviceId = -1;
61+
62+
// Boundary lookup table
63+
int32_t mBoundaryMapSizeRow = 0;
64+
int32_t mBoundaryMapSizePerRow = 0;
65+
int32_t mBoundaryMapSize = 0;
66+
int8_t* mIsBoundary = nullptr;
67+
68+
// Index lookup table
69+
int32_t mIndexLookupSize = 0;
70+
int32_t* mIndexLookup = nullptr;
6071

6172
// Memory allocation for neural network
6273

63-
bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr
64-
int* mOutputDataClass = nullptr;
74+
int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cx=nullptr
75+
int32_t* mOutputDataClass = nullptr;
6576

6677
// FP32
6778
float* mInputData_32 = nullptr;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
#include "GPUSettings.h"
2020
#include "ML/3rdparty/GPUORTFloat16.h"
2121
#include "GPUReconstruction.h"
22+
#include "GPUTPCGeometry.h"
23+
#include "DataFormatsTPC/Constants.h"
2224

2325
#ifdef GPUCA_HAS_ONNX
2426
#include <onnxruntime_cxx_api.h>
@@ -87,8 +89,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
8789
clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
8890
clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
8991
clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
92+
clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1));
93+
clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
94+
clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad);
95+
clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index
9096
clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
91-
clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0);
9297
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
9398
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
9499
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
@@ -114,6 +119,41 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
114119
clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1];
115120
}
116121
}
122+
createBoundary(clustererNN);
123+
createIndexLookup(clustererNN);
124+
}
125+
126+
/// Fills clustererNN.mIsBoundary, a row-major lookup table of size
/// mBoundaryMapSizeRow x mBoundaryMapSizePerRow, where 1 marks a position outside
/// the pad plane and 0 marks an existing pad. Also sets mBoundaryMapSizeRow and
/// mBoundaryMapSizePerRow on clustererNN.
///
/// Row axis layout, with S = mNnClusterizerSizeInputRow and MAX = MAXGLOBALPADROW:
///   [0, S)                        padding (always boundary)
///   [S, EndIROC + S)              geometry rows [0, EndIROC)
///   [EndIROC + S, EndIROC + 2S)   padding (always boundary)
///   [EndIROC + 2S, MAX + 2S)      geometry rows [EndIROC, MAX)
///   [MAX + 2S, MAX + 3S)          padding (always boundary)
/// Pad axis layout: the first mNnClusterizerSizeInputPad entries are padding, then
/// pad 0 of the row follows; every entry at or beyond NPads(row) is boundary.
///
/// Every entry of the table is written, so the buffer does not need to be
/// pre-initialised.
///
/// NOTE(review): mIsBoundary must already point to at least
/// mBoundaryMapSizeRow * mBoundaryMapSizePerRow entries when this runs; confirm
/// that the memory has been assigned before the call.
///
/// @param clustererNN clusterizer whose boundary table and table dimensions are written
void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) {
  // Call after init of the clustererNN elements
  const int32_t rowPadding = clustererNN.mNnClusterizerSizeInputRow;
  const int32_t padPadding = clustererNN.mNnClusterizerSizeInputPad;
  const int32_t endIroc = static_cast<int32_t>(GPUTPCGeometry::EndIROC());
  const int32_t nGeomRows = static_cast<int32_t>(o2::tpc::constants::MAXGLOBALPADROW);

  clustererNN.mBoundaryMapSizeRow = 3 * rowPadding + nGeomRows;
  // NOTE(review): NPads is queried with MAXGLOBALPADROW, which looks like one past the last
  // valid row index. It is kept unchanged here so the row width stays identical to the one
  // used for mBoundaryMapSize in initClusterizer; confirm and fix both places together.
  clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * padPadding;

  for (int32_t r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
    // Translate the padded row index into a geometry row; -1 means "padding row".
    int32_t geomRow = -1;
    if (r >= rowPadding && r < endIroc + rowPadding) {
      geomRow = r - rowPadding;
    } else if (r >= endIroc + 2 * rowPadding && r < nGeomRows + 2 * rowPadding) {
      geomRow = r - 2 * rowPadding; // two padding bands lie below these rows
    }
    // Padding rows get zero valid pads, so the whole row is flagged as boundary and
    // NPads is never called with an out-of-range row.
    const int32_t nPads = (geomRow >= 0) ? static_cast<int32_t>(GPUTPCGeometry::NPads(geomRow)) : 0;

    int8_t* rowFlags = clustererNN.mIsBoundary + r * clustererNN.mBoundaryMapSizePerRow;
    for (int32_t p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) {
      const int32_t geomPad = p - padPadding;
      rowFlags[p] = static_cast<int8_t>(geomPad < 0 || geomPad >= nPads);
    }
  }
}
146+
147+
/// Fills clustererNN.mIndexLookup with the local (row, pad, time) offset of every flat
/// charge-array index i in [0, mNnClusterizerChargeArraySize).
///
/// The flat index is decomposed with time as the fastest-varying axis and row as the
/// slowest. Each offset is shifted by the corresponding mNnClusterizerSizeInput* value,
/// so it is relative to the centre of the input window. Entry i occupies
/// mIndexLookup[3*i .. 3*i+2] in the order row, pad, time.
///
/// NOTE(review): mIndexLookup must already point to at least
/// 3 * mNnClusterizerChargeArraySize entries when this runs; confirm that the memory
/// has been assigned before the call.
///
/// @param clustererNN clusterizer whose index lookup table is written
void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) {
  // Window extents are loop-invariant, so compute them once.
  const int32_t timeWidth = 2 * clustererNN.mNnClusterizerSizeInputTime + 1;
  const int32_t rowStride = (2 * clustererNN.mNnClusterizerSizeInputPad + 1) * timeWidth;

  for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
    // All operands are non-negative, so integer division already rounds down.
    const int32_t withinRow = i % rowStride;
    int32_t* entry = clustererNN.mIndexLookup + 3 * i;
    entry[0] = i / rowStride - clustererNN.mNnClusterizerSizeInputRow;
    entry[1] = withinRow / timeWidth - clustererNN.mNnClusterizerSizeInputPad;
    entry[2] = withinRow % timeWidth - clustererNN.mNnClusterizerSizeInputTime;
  }
}
118158

119159
// MockedOrtAllocator implementation to be able to use volatile assignment

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost
4949

5050
void init(const GPUSettingsProcessingNNclusterizer&);
5151
void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
52+
void createBoundary(GPUTPCNNClusterizer&);
53+
void createIndexLookup(GPUTPCNNClusterizer&);
5254

5355
// ONNX
5456
void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -155,37 +155,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
155155
}
156156
} else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
157157
int32_t time = static_cast<int>(peak.time());
158-
int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
159-
bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
160-
if (is_row_boundary) {
158+
int32_t idxLookup = 3*transient_index;
159+
int32_t r = clustererNN.mIndexLookup[idxLookup], p = clustererNN.mIndexLookup[idxLookup + 1], t = clustererNN.mIndexLookup[idxLookup + 2] + time;
160+
int32_t current_row = row + r, current_pad = pad + p;
161+
int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
162+
int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, current_row);
163+
int32_t isBoundaryIndex = (current_row + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + current_pad + clustererNN.mNnClusterizerSizeInputPad;
164+
165+
if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) {
166+
float central_charge = static_cast<float>(chargeMap[peak].unpack());
167+
CfChargePos tmp_pos(current_row, current_pad + pad_offset, t);
161168
if (dtype == 0) {
162-
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
163-
} else {
164-
clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
169+
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
170+
} else if (dtype == 1) {
171+
clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
165172
}
166173
} else {
167-
int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
168-
int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
169-
int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
170-
int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
171-
int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time;
172-
173-
bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);
174-
175-
if (!is_boundary) {
176-
float central_charge = static_cast<float>(chargeMap[peak].unpack());
177-
CfChargePos tmp_pos(row + r, pad + p, time_pos);
178-
if (dtype == 0) {
179-
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
180-
} else if (dtype == 1) {
181-
clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
182-
}
174+
if (dtype == 0) {
175+
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
183176
} else {
184-
if (dtype == 0) {
185-
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
186-
} else {
187-
clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
188-
}
177+
clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
189178
}
190179
}
191180
}

prodtests/full-system-test/dpl-workflow.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ fi
7474
GPU_INPUT=zsraw
7575
GPU_OUTPUT=tracks,clusters
7676
GPU_CONFIG=
77-
GPU_CONFIG_KEY=
77+
#GPU_CONFIG_KEY=
7878
TOF_CONFIG=
7979
TOF_INPUT=raw
8080
TOF_OUTPUT=clusters

0 commit comments

Comments
 (0)