Commit 6e43257

const'ing + fixing CPU kernel

1 parent 587c3e6

File tree: 2 files changed, +32 -33 lines

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 1 addition & 1 deletion
@@ -1035,7 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       // Filling the data
       if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
         // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
       } else {
         // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
         runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
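
The grid-size fix above matters for the last, partial batch: sizing the fillInputNNGPU launch by iSize (the clusters actually processed in this batch) instead of the fixed capacity mNnClusterizerBatchedMode avoids launching thread groups that would only hit the kernel's early-exit bounds check. A minimal standalone C++ sketch of the arithmetic; the values and the exact meaning of iSize are assumptions for illustration, not taken from the repository:

// Sketch (hypothetical values): why sizing the grid by iSize instead of the
// fixed batch capacity matters for the last, partial batch.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main()
{
  const uint32_t batchCapacity = 2048;   // stands in for mNnClusterizerBatchedMode
  const uint32_t threadsPerCluster = 12; // stands in for mNnClusterizerRowTimeSizeThreads
  const uint32_t nClusters = 5000;       // total clusters in the sector
  const uint32_t batchStart = 4096;      // first cluster of the final batch

  // Assumed meaning of iSize: clusters actually left in this batch.
  const uint32_t iSize = std::min(batchCapacity, nClusters - batchStart); // 904

  const uint32_t oldGrid = batchCapacity * threadsPerCluster; // 24576 threads
  const uint32_t newGrid = iSize * threadsPerCluster;         // 10848 threads
  std::cout << "idle threads avoided: " << oldGrid - newGrid << "\n"; // 13728
  return 0;
}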

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 31 additions & 32 deletions
@@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];

-  uint32_t glo_idx = get_global_id(0);
+  const uint32_t glo_idx = get_global_id(0);
   if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
@@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  int32_t row = static_cast<int>(peak.row());
-  int32_t pad = static_cast<int>(peak.pad());
-  int32_t time = static_cast<int>(peak.time());
-  float central_charge = static_cast<float>(chargeMap[peak].unpack());
-  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+  const int32_t row = static_cast<int>(peak.row());
+  const int32_t pad = static_cast<int>(peak.pad());
+  const int32_t time = static_cast<int>(peak.time());
+  const float central_charge = static_cast<float>(chargeMap[peak].unpack());
+  const float inverse_charge = 1.f / central_charge;
+
+  const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
   const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow;
   const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow;
   const int32_t npads_row = GPUTPCGeometry::NPads(row);
+  float output_value = clustererNN.mNnClusterizerBoundaryFillValue;

-  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
-    int32_t target_row = row + r;
-    bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
-    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-    int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset);
+  for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) {
+    uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
+    const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row));
+    const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads(target_row - row_offset);

-    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
-      int32_t target_pad = pad + p;
-      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
-
-      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
-        int32_t target_time = time + t;
+    for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) {
+      is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);

+      for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) {
         if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) {
           // Fill boundary value
-          float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+          output_value = clustererNN.mNnClusterizerBoundaryFillValue;
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
           } else {
-            clustererNN.mInputData_32[write_idx] = boundary_value;
+            clustererNN.mInputData_32[write_idx] = output_value;
           }
         } else {
           CfChargePos tmp_pos(target_row, target_pad, target_time);
-          float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+          output_value = chargeMap[tmp_pos].unpack() * inverse_charge;
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
           } else {
-            clustererNN.mInputData_32[write_idx] = normalized_charge;
+            clustererNN.mInputData_32[write_idx] = output_value;
           }
         }
         // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
     if (dtype == 0) {
       clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
       clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
     } else {
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
-      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
     }
   }

@@ -143,16 +142,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint32_t glo_idx = get_global_id(0);
+  const uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];

   if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) {
     return;
   }

-  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads;
-  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads);
+  const uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads;
+  const uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads);

   // Early exit for out-of-bounds threads
   if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
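
For orientation, this hunk also shows the fillInputNNGPU work distribution: each cluster owns mNnClusterizerRowTimeSizeThreads consecutive global thread ids, base_idx picks the cluster, and transient_index picks the element of that cluster's input tensor. A small serial C++ sketch of the same decomposition (the host loop merely stands in for the parallel launch; the values are illustrative):

// Sketch: the base_idx / transient_index decomposition, checked serially.
#include <cassert>
#include <cstdint>

int main()
{
  const uint32_t threadsPerCluster = 12; // mNnClusterizerRowTimeSizeThreads
  const uint32_t nClusters = 3;          // clusters in the batch

  for (uint32_t glo_idx = 0; glo_idx < nClusters * threadsPerCluster; ++glo_idx) {
    const uint32_t base_idx = glo_idx / threadsPerCluster; // which cluster
    // Written as in the kernel; equivalent to glo_idx % threadsPerCluster.
    const uint32_t transient_index = glo_idx - (base_idx * threadsPerCluster);
    assert(base_idx < nClusters);
    assert(transient_index < threadsPerCluster);
  }
  return 0;
}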
@@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   // Use dedicated neural network shared memory arrays for warp-level caching
   // First thread in each warp loads shared data
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  float central_charge = chargeMap[peak].unpack();
-  int32_t row = static_cast<int>(peak.row());
-  int32_t pad = static_cast<int>(peak.pad());
-  int32_t time = static_cast<int>(peak.time());
+  const float central_charge = chargeMap[peak].unpack();
+  const int32_t row = static_cast<int>(peak.row());
+  const int32_t pad = static_cast<int>(peak.pad());
+  const int32_t time = static_cast<int>(peak.time());

   // Handle index data with fewer branches
   if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
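
Taken together, the fill kernels write, for each peak, a charge window normalized to the central charge: the boundary fill value outside the acceptance, and charge * inverse_charge inside it (the hoisted reciprocal replaces a per-element division). Below is a simplified, self-contained sketch of that pattern, reduced to a 2D pad/time window with hypothetical names; the real kernel additionally walks neighboring rows and uses the detector geometry for its boundary test:

// Sketch (hypothetical, simplified): fill a (2P+1) x (2T+1) pad/time window
// around a peak, normalized to the central charge; out-of-range entries get
// the boundary fill value.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> fillWindow(const std::vector<float>& chargeMap,
                              int32_t nPads, int32_t nTimeBins,
                              int32_t peakPad, int32_t peakTime,
                              int32_t P, int32_t T, float boundaryFill)
{
  std::vector<float> input;
  input.reserve((2 * P + 1) * (2 * T + 1));
  // Hoisted reciprocal, as in the commit: one division per peak, not per element.
  const float inverse_charge = 1.f / chargeMap[peakTime * nPads + peakPad];
  for (int32_t pad = peakPad - P; pad <= peakPad + P; ++pad) {
    for (int32_t time = peakTime - T; time <= peakTime + T; ++time) {
      const bool is_boundary = pad < 0 || pad >= nPads || time < 0 || time >= nTimeBins;
      input.push_back(is_boundary ? boundaryFill
                                  : chargeMap[time * nPads + pad] * inverse_charge);
    }
  }
  return input;
}

int main()
{
  const int32_t nPads = 4, nTimeBins = 4;
  std::vector<float> charges(nPads * nTimeBins, 1.f);
  charges[1 * nPads + 1] = 10.f; // peak at pad 1, time 1
  for (float v : fillWindow(charges, nPads, nTimeBins, 1, 1, 1, 1, -1.f)) {
    std::printf("%.2f ", v); // central element prints as 1.00
  }
  std::printf("\n");
  return 0;
}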
