Skip to content

Commit 19b5bd5

Browse files
committed
Improved data filling speeds by factor 3
1 parent 381955a commit 19b5bd5

File tree

5 files changed

+80
-9
lines changed

5 files changed

+80
-9
lines changed

GPU/GPUTracking/Definitions/GPUDefParametersDefault.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@
500500
#ifdef GPUCA_HAS_ONNX
501501
#define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels
502502
#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels
503+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels
503504
#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels
504505
#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
505506
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -950,9 +950,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
950950
size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
951951

952952
auto start0 = std::chrono::high_resolution_clock::now();
953-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data
954-
953+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data
955954
auto stop0 = std::chrono::high_resolution_clock::now();
955+
956956
auto start1 = std::chrono::high_resolution_clock::now();
957957

958958
if (clustererNNShadow.nnInferenceInputDType == 0) {

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
7777
if (!is_boundary) {
7878
ChargePos tmp_pos(row + r, pad + p, time + t);
7979
if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
80-
clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]);
80+
clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
8181
clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx];
8282
}
8383
if (dtype == 0) {
@@ -99,17 +99,85 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
9999
}
100100
if (clustererNN.nnClusterizerAddIndexData) {
101101
if (dtype == 0) {
102-
clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f);
102+
clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f);
103103
clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
104104
clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
105105
} else {
106-
clustererNN.inputData_32[write_idx] = clusterer.mISector / 36.f;
106+
clustererNN.inputData_32[write_idx] = sector / 36.f;
107107
clustererNN.inputData_32[write_idx + 1] = row / 152.f;
108108
clustererNN.inputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
109109
}
110110
}
111111
}
112112

113+
template <>
114+
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
115+
{
116+
uint glo_idx = get_global_id(0);
117+
auto& clusterer = processors.tpcClusterer[sector];
118+
auto& clustererNN = processors.tpcNNClusterer[sector];
119+
uint base_idx = CAMath::Floor(glo_idx / clustererNN.nnClusterizerElementSize);
120+
uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize;
121+
122+
Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
123+
Array2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
124+
ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart];
125+
int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad());
126+
127+
if (clustererNN.nnClusterizerAddIndexData && transient_index == 0) {
128+
uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize;
129+
for (uint16_t i = 0; i < 8; i++) {
130+
Delta2 d = cfconsts::InnerNeighbors[i];
131+
ChargePos tmp_pos = peak.delta(d);
132+
clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
133+
clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx];
134+
}
135+
if (dtype == 0) {
136+
clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f);
137+
clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f);
138+
clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
139+
} else {
140+
clustererNN.inputData_32[top_idx - 3] = sector / 36.f;
141+
clustererNN.inputData_32[top_idx - 2] = row / 152.f;
142+
clustererNN.inputData_32[top_idx - 1] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
143+
}
144+
} else {
145+
int time = static_cast<int>(peak.time());
146+
int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow;
147+
bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
148+
if (is_row_boundary) {
149+
if (dtype == 0) {
150+
clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue));
151+
} else {
152+
clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue);
153+
}
154+
} else {
155+
int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow);
156+
int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
157+
int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1));
158+
int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset;
159+
bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow);
160+
161+
if (!is_boundary) {
162+
float central_charge = static_cast<float>(chargeMap[peak].unpack());
163+
int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime;
164+
ChargePos tmp_pos(row + r, pad + p, time + t);
165+
if (dtype == 0) {
166+
clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
167+
} else if (dtype == 1) {
168+
clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
169+
}
170+
} else {
171+
if (dtype == 0) {
172+
clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue));
173+
} else {
174+
clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue);
175+
}
176+
}
177+
}
178+
}
179+
}
180+
113181
template <>
114182
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
115183
{

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,11 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
5353
enum K : int32_t {
5454
runCfClusterizer = 0,
5555
fillInputNN = 1,
56-
determineClass1Labels = 2,
57-
determineClass2Labels = 3,
58-
publishClass1Regression = 4,
59-
publishClass2Regression = 5,
56+
fillInputNNSingleElement = 2,
57+
determineClass1Labels = 3,
58+
determineClass2Labels = 4,
59+
publishClass1Regression = 5,
60+
publishClass2Regression = 6,
6061
};
6162

6263
template <int32_t iKernel = defaultKernel, typename... Args>

GPU/GPUTracking/kernels.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUS
116116
if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
117117
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
118118
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
119+
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
119120
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
120121
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
121122
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)

0 commit comments

Comments
 (0)