@@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t time = static_cast<int>(peak.time());
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-  const int32_t iroc_row = 63 + row_offset;
-  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
+  const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow;
+  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow;
   const int32_t npads_row = GPUTPCGeometry::NPads(row);

   for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
@@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t pad = static_cast<int>(peak.pad());
   int32_t time = static_cast<int>(peak.time());

-  const int32_t npads_row = GPUTPCGeometry::NPads(row);
-
   // Handle index data with fewer branches
   if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
-    // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
-    // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
-    //
-    // float index_values[3] = {
-    //   static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
-    //   static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
-    //   static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
-    //
-    // if (dtype == 0) {
-    //   clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
-    // } else {
-    //   clustererNN.mInputData_32[write_idx] = index_values[data_idx];
-    // }
-    //
-    // // Handle deconvolution flags only once per cluster (last thread in element)
-    // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
-    //   uint8_t cluster_flags = 0;
-    //   for (uint16_t i = 0; i < 8; i++) {
-    //     Delta2 d = cfconsts::InnerNeighbors[i];
-    //     CfChargePos tmp_pos = peak.delta(d);
-    //     cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
-    //   }
-    //   clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
-    //   clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
-    // }
-    // return;
     uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
+    const int32_t npads = GPUTPCGeometry::NPads(row);
     if (dtype == 0) {
       clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
       clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads);
     } else {
       clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
       clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
-      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
+      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads;
     }
   }

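For context, the index-data block kept by this hunk appends three normalised features after the charge window: sector over MAXSECTOR, row over MAXGLOBALPADROW, and pad over the number of pads in that row. A minimal standalone sketch of this normalisation, using a hypothetical helper name and parameters not present in the patch:

#include <cstdint>

// Sketch only: mirrors the three index features written after the charge window.
// maxSector, maxGlobalPadRow and npadsInRow stand in for
// o2::tpc::constants::MAXSECTOR, MAXGLOBALPADROW and GPUTPCGeometry::NPads(row).
inline void fillIndexFeatures(float* out, int32_t sector, int32_t row, int32_t pad,
                              float maxSector, float maxGlobalPadRow, float npadsInRow)
{
  out[0] = static_cast<float>(sector) / maxSector;    // sector index in [0, 1)
  out[1] = static_cast<float>(row) / maxGlobalPadRow; // pad-row index in [0, 1)
  out[2] = static_cast<float>(pad) / npadsInRow;      // pad index scaled by pads in this row
}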
@@ -220,46 +193,45 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil

     // Early boundary check for row
     const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow;
-    const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
-    const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
-    const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
-    const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
-
-    // Calculate offsets
-    // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-    // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-    const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-    const int32_t iroc_row = 63 + row_offset;
-    const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
-    const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-    const int32_t boundary_row = target_row + row_offset;
-    const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow);
-    const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue;
-
-    float output_value = boundary_val;
-
-    const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
-    const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
-
-    for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
-      if (is_row_boundary) {
-        output_value = boundary_val;
-      } else {
-        const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference);
-        if (!is_boundary) {
+    float output_value = clustererNN.mNnClusterizerBoundaryFillValue;
+
+    if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) {
+      for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) {
+        if (dtype == 0) {
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+        } else {
+          clustererNN.mInputData_32[write_idx] = output_value;
+        }
+        write_idx += clustererNN.mNnClusterizerFullTimeSize;
+      }
+      return;
+    } else {
+      // Calculate offsets
+      const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
+      const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
+      const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
+      const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+      const int32_t npads = GPUTPCGeometry::NPads(target_row);
+
+      const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
+      const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
+
+      for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
+        if (target_pad >= npads || target_pad < 0 || is_time_boundary) {
+          output_value = clustererNN.mNnClusterizerBoundaryFillValue;
+        } else {
           CfChargePos pos(target_row, target_pad, target_time);
           // one load + one multiply
           output_value = chargeMap[pos].unpack() * inverse_central_charge;
+        }
+        if (dtype == 0) {
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
         } else {
-          output_value = boundary_val;
+          clustererNN.mInputData_32[write_idx] = output_value;
         }
+        write_idx += clustererNN.mNnClusterizerFullTimeSize;
       }
-      if (dtype == 0) {
-        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
-      } else {
-        clustererNN.mInputData_32[write_idx] = output_value;
-      }
-      write_idx += clustererNN.mNnClusterizerFullTimeSize;
+      return;
     }
   }
 }
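For reference, the new early-exit branch classifies a neighbouring row as outside the network's input window when it leaves the sector or crosses the IROC/OROC boundary between rows 62 and 63 relative to the central row; in that case the added code fills the full pad window with mNnClusterizerBoundaryFillValue and returns early. A minimal sketch of that condition, using a hypothetical helper name not present in the patch:

#include <cstdint>

// Sketch only: reproduces the row-window check from the added code.
// maxGlobalPadRow stands in for o2::tpc::constants::MAXGLOBALPADROW.
inline bool isRowOutsideInputWindow(int32_t row, int32_t target_row, int32_t maxGlobalPadRow)
{
  const bool crossesRocBoundary = (row < 63 && target_row > 62) || (row > 62 && target_row < 63);
  const bool outsideSector = (target_row < 0) || (target_row >= maxGlobalPadRow);
  return crossesRocBoundary || outsideSector;
}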