
Commit 3447927

Improving kernel speed by 30% compared to the original version. Next try: a for-loop over the row dimension, as access along that dimension is somewhat coalesced too.
1 parent eecb8a8 commit 3447927
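
What changed, in brief: the fill kernel previously mapped one GPU thread to one input-tensor element (mNnClusterizerElementSize slots per cluster). It now maps one thread to one (row, time) position (mNnClusterizerRowTimeSizeFull threads per cluster) and loops over the pad dimension, advancing its write index by mNnClusterizerFullTimeSize per pad step. Threads with consecutive time indices therefore store to consecutive addresses on every loop iteration. A minimal host-side sketch of the new index decomposition, using assumed window sizes rather than the production settings:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed half-window sizes (hypothetical, not the O2 defaults).
  const int32_t sizeRow = 3, sizePad = 3, sizeTime = 3;
  const int32_t fullRow = 2 * sizeRow + 1;           // 7
  const int32_t fullPad = 2 * sizePad + 1;           // 7
  const int32_t fullTime = 2 * sizeTime + 1;         // 7
  const int32_t padTimeSize = fullPad * fullTime;    // stride of one row plane in the input tensor
  const int32_t elementSize = fullRow * padTimeSize; // tensor slots per cluster (index data omitted)
  const int32_t rowTimeSize = fullRow * fullTime;    // charge-filling threads per cluster
  // (the kernel divides by mNnClusterizerRowTimeSizeFull, which adds 3 index-data slots; omitted here)

  // New mapping: decompose a global thread index into (cluster, row, time).
  uint32_t glo_idx = 123;
  uint32_t base_idx = glo_idx / rowTimeSize;             // which cluster
  uint32_t transient = glo_idx - base_idx * rowTimeSize; // position within the cluster
  int32_t row_idx = transient / fullTime;
  int32_t time_idx = transient - row_idx * fullTime;
  // First slot this thread writes; each pad step then adds fullTime, so threads
  // with consecutive time_idx hit consecutive addresses at every step.
  int32_t write_idx = base_idx * elementSize + row_idx * padTimeSize + time_idx;
  printf("cluster %u, row %d, time %d -> first write %d, pad stride %d\n",
         base_idx, row_idx, time_idx, write_idx, fullTime);
  return 0;
}
```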

File tree: 3 files changed (+50/-41 lines)


GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h

Lines changed: 2 additions & 1 deletion

@@ -64,6 +64,8 @@ class GPUTPCNNClusterizer : public GPUProcessor
   uint32_t mNnClusterizerFullPadSize = 0;
   uint32_t mNnClusterizerFullTimeSize = 0;
   uint32_t mNnClusterizerPadTimeSize = 0;
+  uint32_t mNnClusterizerRowTimeSize = 0;
+  uint32_t mNnClusterizerRowTimeSizeFull = 0;

   // Boundary lookup table
   // int32_t mBoundaryMapSizeRow = 0;
@@ -89,7 +91,6 @@ class GPUTPCNNClusterizer : public GPUProcessor

   // FP16
   OrtDataType::Float16_t* mInputData_16 = nullptr;
-  OrtDataType::Float16_t* mInputData_16_Test = nullptr;
   OrtDataType::Float16_t* mModelProbabilities_16 = nullptr;
   OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr;
   OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr;

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx

Lines changed: 2 additions & 0 deletions

@@ -94,6 +94,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
   clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
   clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
+  clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize;
+  clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
   clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
   // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
   // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
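
For concreteness, the new size bookkeeping can be worked through with assumed half-window sizes (3 in row, pad and time, index data enabled); this is an illustrative sketch, not the production configuration:

```cpp
// Worked example of the new size bookkeeping. The half-window sizes below are
// assumptions for illustration, not the production settings.
constexpr int sizeRow = 3, sizePad = 3, sizeTime = 3;
constexpr int indexData = 3; // extra slots appended when nnClusterizerAddIndexData is set
constexpr int fullRow = 2 * sizeRow + 1;                      // 7
constexpr int fullPad = 2 * sizePad + 1;                      // 7
constexpr int fullTime = 2 * sizeTime + 1;                    // 7
constexpr int chargeArraySize = fullRow * fullPad * fullTime; // 343 charge slots per cluster
constexpr int elementSize = chargeArraySize + indexData;      // 346 tensor slots per cluster
constexpr int rowTimeSize = fullRow * fullTime;               // 49 charge-filling threads per cluster
constexpr int rowTimeSizeFull = rowTimeSize + indexData;      // 52 threads per cluster in total
// 49 (row, time) threads each fill fullPad = 7 slots, covering all 343 charges:
static_assert(rowTimeSize * fullPad == chargeArraySize, "pad loop covers the charge array");
```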

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 46 additions & 40 deletions

@@ -137,8 +137,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   auto& clustererNN = processors.tpcNNClusterer[sector];

   // Optimized division using bit operations
-  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerElementSize;
-  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);
+  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
+  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);

   // Early exit for out-of-bounds threads
   if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
@@ -156,9 +156,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t time = static_cast<int>(peak.time());

   // Handle index data with fewer branches
-  if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerChargeArraySize) {
-    uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index;
-    int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize;
+  if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
+    int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
+    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;

     float index_values[3] = {
       sector / 36.f,
@@ -167,9 +167,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
     };

     if (dtype == 0) {
-      clustererNN.mInputData_16[output_idx] = (OrtDataType::Float16_t)index_values[data_idx];
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
     } else {
-      clustererNN.mInputData_32[output_idx] = index_values[data_idx];
+      clustererNN.mInputData_32[write_idx] = index_values[data_idx];
     }

     // Handle deconvolution flags only once per cluster (last thread in element)
@@ -187,51 +187,57 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   }

   // Main data processing - optimize index calculations
-  if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) {
+  if ((int32_t)transient_index < clustererNN.mNnClusterizerRowTimeSize) {
     // Optimize 3D index calculation
-    int32_t r_local = (transient_index / clustererNN.mNnClusterizerPadTimeSize) - clustererNN.mNnClusterizerSizeInputRow;
-    int32_t pad_time_slice = (transient_index % clustererNN.mNnClusterizerPadTimeSize);
-    int32_t p_local = (pad_time_slice / clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputPad;
-    int32_t t_local = (pad_time_slice % clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputTime;
+    int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
+    int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
+    int32_t time_idx = transient_index - row_idx*clustererNN.mNnClusterizerFullTimeSize;
+    int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
+    int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;

     // Early boundary check for row
     int32_t target_row = row + r_local;
     int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));

-    if (is_row_boundary) {
-      // Use boundary fill value
-      float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
-      if (dtype == 0) {
-        clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)boundary_val;
-      } else {
-        clustererNN.mInputData_32[glo_idx] = boundary_val;
-      }
-      return;
-    }
-
     // Calculate offsets
     int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
     int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-    int32_t target_pad = pad + p_local + pad_offset;
-    int32_t target_time = time + t_local;
+    for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
+      if (is_row_boundary) {
+        // Use boundary fill value
+        float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+        if (dtype == 0) {
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
+        } else {
+          clustererNN.mInputData_32[write_idx] = boundary_val;
+        }
+        write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
+        continue;
+      }

-    // Optimized boundary check
-    int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
+      // Calculate target pad and time
+      int32_t target_pad = pad + p_local;
+      int32_t target_time = time + t_local;

-    float output_value;
-    if (is_boundary) {
-      output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
-    } else {
-      // Coalesced memory access - create position and read charge
-      CfChargePos tmp_pos(target_row, target_pad, target_time);
-      output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
-    }
+      // Optimized boundary check
+      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);

-    // Write output with reduced branching
-    if (dtype == 0) {
-      clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)output_value;
-    } else {
-      clustererNN.mInputData_32[glo_idx] = output_value;
+      float output_value;
+      if (is_boundary) {
+        output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+      } else {
+        // Coalesced memory access - create position and read charge
+        CfChargePos tmp_pos(target_row, target_pad, target_time);
+        output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
+      }
+
+      // Write output with reduced branching
+      if (dtype == 0) {
+        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+      } else {
+        clustererNN.mInputData_32[write_idx] = output_value;
+      }
+      write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
     }
   }
 }
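
The pad loop above advances write_idx by mNnClusterizerFullTimeSize per iteration, while neighbouring threads differ by one in their time index. A standalone sketch of the resulting address pattern for a single row plane, again with assumed window sizes rather than the O2 values:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed window sizes (2*3 + 1); the real values come from the clusterizer settings.
  const int32_t fullTime = 7, fullPad = 7;
  // One row plane of one cluster: threads are indexed by time_idx, and each
  // loops over the pad dimension exactly like the kernel above.
  for (int32_t padStep = 0; padStep < fullPad; padStep++) {
    printf("pad step %d -> addresses:", padStep);
    for (int32_t time_idx = 0; time_idx < fullTime; time_idx++) {
      int32_t write_idx = time_idx + padStep * fullTime; // kernel: write_idx += fullTime per step
      printf(" %3d", write_idx); // adjacent threads write adjacent addresses
    }
    printf("\n");
  }
  return 0;
}
```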
