Skip to content

Commit 4949b55

Browse files
committed
Beautifications to trigger the CI
1 parent 069a7e9 commit 4949b55

File tree

1 file changed

+46
-28
lines changed

1 file changed

+46
-28
lines changed

GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx

Lines changed: 46 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -53,47 +53,58 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
5353
template <>
5454
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
5555
{
56-
uint32_t glo_idx = get_global_id(0);
57-
5856
auto& clusterer = processors.tpcClusterer[sector];
5957
auto& clustererNN = processors.tpcNNClusterer[sector];
6058

59+
uint32_t glo_idx = get_global_id(0);
6160
if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
6261
return;
6362
}
64-
uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
63+
64+
uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
6565

6666
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
6767
CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
68-
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
69-
int32_t row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
68+
CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)];
69+
int32_t row = static_cast<int>(peak.row());
70+
int32_t pad = static_cast<int>(peak.pad());
71+
int32_t time = static_cast<int>(peak.time());
7072
float central_charge = static_cast<float>(chargeMap[peak].unpack());
7173
int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
7274

73-
for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
74-
bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
75-
int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
76-
for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
77-
bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
78-
for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
79-
int32_t time_pos = time + t;
80-
if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) {
81-
// Filling boundary just to make sure that no values are left unintentionally
75+
for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
76+
int32_t target_row = row + r;
77+
bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
78+
int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
79+
80+
for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
81+
int32_t target_pad = pad + p;
82+
bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
83+
84+
for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
85+
int32_t target_time = time + t;
86+
87+
if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) {
88+
// Fill boundary value
89+
float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
8290
if (dtype == 0) {
83-
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
91+
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
8492
} else {
85-
clustererNN.mInputData_32[write_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
93+
clustererNN.mInputData_32[write_idx] = boundary_value;
8694
}
8795
} else {
88-
CfChargePos tmp_pos(row + r, pad + p, time + t);
89-
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
96+
CfChargePos tmp_pos(target_row, target_pad, target_time);
97+
float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
98+
99+
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
90100
clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
91101
clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
92102
}
103+
93104
if (dtype == 0) {
94-
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
95-
} else if (dtype == 1) {
96-
clustererNN.mInputData_32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
105+
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
106+
} else {
107+
clustererNN.mInputData_32[write_idx] = normalized_charge;
97108
}
98109
}
99110
// if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -104,21 +115,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
104115
}
105116
}
106117
}
118+
107119
if (clustererNN.mNnClusterizerAddIndexData) {
120+
float sector_norm = sector / 36.f;
121+
float row_norm = row / 152.f;
122+
float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
123+
108124
if (dtype == 0) {
109-
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f);
110-
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
111-
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
125+
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
126+
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
127+
clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
112128
} else {
113-
clustererNN.mInputData_32[write_idx] = sector / 36.f;
114-
clustererNN.mInputData_32[write_idx + 1] = row / 152.f;
115-
clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
129+
clustererNN.mInputData_32[write_idx] = sector_norm;
130+
clustererNN.mInputData_32[write_idx + 1] = row_norm;
131+
clustererNN.mInputData_32[write_idx + 2] = pad_norm;
116132
}
117133
}
134+
118135
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
119136
clustererNN.mClusterFlags[2 * glo_idx] = 0;
120137
clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
121-
for (uint16_t i = 0; i < 8; i++) {
138+
139+
for (uint16_t i = 0; i < 8; ++i) {
122140
Delta2 d = cfconsts::InnerNeighbors[i];
123141
CfChargePos tmp_pos = peak.delta(d);
124142
clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);

0 commit comments

Comments (0)