Commit a075c43

Adjusting parameter bounds and additional GPU kernel optimizations
1 parent 9c8984c commit a075c43

File tree: 3 files changed, +40 -73 lines changed

GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h

Lines changed: 1 addition & 1 deletion
@@ -482,7 +482,7 @@
 
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels
-#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels
+#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU 1024
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
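
Note: the GPUCA_LB_* defines in this header appear to supply per-kernel launch parameters (launch bounds, i.e. the block size a kernel is compiled for and launched with). Instead of inheriting the shared GPUCA_LB_GPUTPCNNClusterizerKernels default, fillInputNNGPU is now pinned to 1024 threads per block. A minimal CUDA sketch of the mechanism follows; the kernel and its body are hypothetical stand-ins, only the 1024-thread bound mirrors the diff.

#include <cuda_runtime.h>

// Promising at most 1024 threads per block lets the compiler budget
// registers for exactly that occupancy.
__global__ void __launch_bounds__(1024) fillInputBounded(float* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x; // one element per thread
  if (i < n) {
    out[i] = static_cast<float>(i); // placeholder work
  }
}

int main()
{
  const int n = 1 << 20;
  float* dOut = nullptr;
  cudaMalloc(&dOut, n * sizeof(float));
  // Launch with exactly the bounded block size, as the new define suggests.
  fillInputBounded<<<(n + 1023) / 1024, 1024>>>(dOut, n);
  cudaDeviceSynchronize();
  cudaFree(dOut);
  return 0;
}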

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 1 addition & 6 deletions
@@ -642,7 +642,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
   // Maximum of 4 lanes supported
   HighResTimer* nnTimers[12];
-  int32_t countLoops = 0;
 
   if (GetProcessingSettings().nn.applyNNclusterizer) {
     int32_t deviceId = -1;
@@ -1036,10 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       // Filling the data
       if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
         // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
-        for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
-          countLoops++;
-        }
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
       } else {
         // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
         runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
@@ -1142,7 +1138,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
       }
     }
-    LOG(info) << "countLoops: " << countLoops;
 #else
     GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
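
Note: the deleted loop launched fillInputNNGPU 16 times per batch (comment: "Loop to increase throughput on GPU"), apparently a throughput experiment tracked by the also-removed countLoops counter and its log line. The commit keeps a single launch whose grid already covers mNnClusterizerBatchedMode * mNnClusterizerRowTimeSizeThreads work items. A hedged CUDA sketch of that single-launch grid sizing; the kernel body and host wrapper are hypothetical, only the sizing logic mirrors the diff.

// Hypothetical stand-in for GPUTPCNNClusterizerKernels::fillInputNNGPU:
// one thread per element of the batched NN input tensor.
__global__ void fillInputSketch(float* input, int nItems)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nItems) {
    input[i] = 0.f; // placeholder for the per-element charge-window fill
  }
}

// Single launch covering the whole batch, mirroring
// GetGrid(mNnClusterizerBatchedMode * mNnClusterizerRowTimeSizeThreads, lane).
void launchFillOnce(float* dInput, int batchedMode, int rowTimeSizeThreads)
{
  const int nItems = batchedMode * rowTimeSizeThreads;
  const int block = 1024; // the new GPUCA_LB_..._fillInputNNGPU value
  const int grid = (nItems + block - 1) / block;
  fillInputSketch<<<grid, block>>>(dInput, nItems);
}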

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 38 additions & 66 deletions
@@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t time = static_cast<int>(peak.time());
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-  const int32_t iroc_row = 63 + row_offset;
-  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
+  const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow;
+  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow;
   const int32_t npads_row = GPUTPCGeometry::NPads(row);
 
   for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
@@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t pad = static_cast<int>(peak.pad());
   int32_t time = static_cast<int>(peak.time());
 
-  const int32_t npads_row = GPUTPCGeometry::NPads(row);
-
   // Handle index data with fewer branches
   if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
-    // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
-    // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
-    //
-    // float index_values[3] = {
-    //   static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
-    //   static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
-    //   static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
-    //
-    // if (dtype == 0) {
-    //   clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
-    // } else {
-    //   clustererNN.mInputData_32[write_idx] = index_values[data_idx];
-    // }
-    //
-    // // Handle deconvolution flags only once per cluster (last thread in element)
-    // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
-    //   uint8_t cluster_flags = 0;
-    //   for (uint16_t i = 0; i < 8; i++) {
-    //     Delta2 d = cfconsts::InnerNeighbors[i];
-    //     CfChargePos tmp_pos = peak.delta(d);
-    //     cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
-    //   }
-    //   clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
-    //   clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
-    // }
-    // return;
     uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
+    const int32_t npads = GPUTPCGeometry::NPads(row);
     if (dtype == 0) {
       clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
       clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
     } else {
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
-      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
+      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
     }
   }
 
@@ -220,46 +193,45 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 
   // Early boundary check for row
   const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow;
-  const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
-  const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
-  const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
-  const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
-
-  // Calculate offsets
-  // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-  // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-  const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-  const int32_t iroc_row = 63 + row_offset;
-  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
-  const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-  const int32_t boundary_row = target_row + row_offset;
-  const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow);
-  const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue;
-
-  float output_value = boundary_val;
-
-  const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
-  const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
-
-  for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
-    if (is_row_boundary) {
-      output_value = boundary_val;
-    } else {
-      const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference);
-      if (!is_boundary) {
+  float output_value = clustererNN.mNnClusterizerBoundaryFillValue;
+
+  if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) {
+    for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
+      if (dtype == 0) {
+        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+      } else {
+        clustererNN.mInputData_32[write_idx] = output_value;
+      }
+      write_idx += clustererNN.mNnClusterizerFullTimeSize;
+    }
+    return;
+  } else {
+    // Calculate offsets
+    const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
+    const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
+    const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
+    const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+    const int32_t npads = GPUTPCGeometry::NPads(target_row);
+
+    const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
+    const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
+
+    for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
+      if (target_pad >= npads || target_pad < 0 || is_time_boundary) {
+        output_value = clustererNN.mNnClusterizerBoundaryFillValue;
+      } else {
         CfChargePos pos(target_row, target_pad, target_time);
         // one load + one multiply
         output_value = chargeMap[pos].unpack() * inverse_central_charge;
+      }
+      if (dtype == 0) {
+        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
       } else {
-        output_value = boundary_val;
+        clustererNN.mInputData_32[write_idx] = output_value;
      }
+      write_idx += clustererNN.mNnClusterizerFullTimeSize;
     }
-    if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-    } else {
-      clustererNN.mInputData_32[write_idx] = output_value;
-    }
-    write_idx += clustererNN.mNnClusterizerFullTimeSize;
+    return;
   }
 }