@@ -137,8 +137,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
137137 auto & clustererNN = processors.tpcNNClusterer [sector];
138138
139139 // Optimized division using bit operations
140- uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerElementSize ;
141- uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize );
140+ uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull ;
141+ uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull );
142142
143143 // Early exit for out-of-bounds threads
144144 if (base_idx + batchStart >= clusterer.mPmemory ->counters .nClusters ) {
@@ -156,9 +156,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
156156 int32_t time = static_cast <int >(peak.time ());
157157
158158 // Handle index data with fewer branches
159- if (clustererNN.mNnClusterizerAddIndexData && (int32_t )transient_index >= clustererNN.mNnClusterizerChargeArraySize ) {
160- uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index ;
161- int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize ;
159+ if (clustererNN.mNnClusterizerAddIndexData && (int32_t )transient_index >= clustererNN.mNnClusterizerRowTimeSize ) {
160+ int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize ;
161+ uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN. mNnClusterizerChargeArraySize + data_idx ;
162162
163163 float index_values[3 ] = {
164164 sector / 36 .f ,
@@ -167,9 +167,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
167167 };
168168
169169 if (dtype == 0 ) {
170- clustererNN.mInputData_16 [output_idx ] = (OrtDataType::Float16_t)index_values[data_idx];
170+ clustererNN.mInputData_16 [write_idx ] = (OrtDataType::Float16_t)index_values[data_idx];
171171 } else {
172- clustererNN.mInputData_32 [output_idx ] = index_values[data_idx];
172+ clustererNN.mInputData_32 [write_idx ] = index_values[data_idx];
173173 }
174174
175175 // Handle deconvolution flags only once per cluster (last thread in element)
@@ -187,51 +187,57 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
187187 }
188188
189189 // Main data processing - optimize index calculations
190- if ((int32_t )transient_index < clustererNN.mNnClusterizerChargeArraySize ) {
190+ if ((int32_t )transient_index < clustererNN.mNnClusterizerRowTimeSize ) {
191191 // Optimize 3D index calculation
192- int32_t r_local = (transient_index / clustererNN.mNnClusterizerPadTimeSize ) - clustererNN.mNnClusterizerSizeInputRow ;
193- int32_t pad_time_slice = (transient_index % clustererNN.mNnClusterizerPadTimeSize );
194- int32_t p_local = (pad_time_slice / clustererNN.mNnClusterizerFullPadSize ) - clustererNN.mNnClusterizerSizeInputPad ;
195- int32_t t_local = (pad_time_slice % clustererNN.mNnClusterizerFullPadSize ) - clustererNN.mNnClusterizerSizeInputTime ;
192+ int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize ;
193+ int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow ;
194+ int32_t time_idx = transient_index - row_idx*clustererNN.mNnClusterizerFullTimeSize ;
195+ int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime ;
196+ int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
196197
197198 // Early boundary check for row
198199 int32_t target_row = row + r_local;
199200 int8_t is_row_boundary = (target_row < 0 ) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1 ));
200201
201- if (is_row_boundary) {
202- // Use boundary fill value
203- float boundary_val = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
204- if (dtype == 0 ) {
205- clustererNN.mInputData_16 [glo_idx] = (OrtDataType::Float16_t)boundary_val;
206- } else {
207- clustererNN.mInputData_32 [glo_idx] = boundary_val;
208- }
209- return ;
210- }
211-
212202 // Calculate offsets
213203 int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset (row, clustererNN.mNnClusterizerSizeInputRow );
214204 int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset (row, target_row);
215- int32_t target_pad = pad + p_local + pad_offset;
216- int32_t target_time = time + t_local;
205+ for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
206+ if (is_row_boundary) {
207+ // Use boundary fill value
208+ float boundary_val = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
209+ if (dtype == 0 ) {
210+ clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)boundary_val;
211+ } else {
212+ clustererNN.mInputData_32 [write_idx] = boundary_val;
213+ }
214+ write_idx += clustererNN.mNnClusterizerFullTimeSize ; // Move to next pad position
215+ continue ;
216+ }
217217
218- // Optimized boundary check
219- int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary (target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow ) || (target_time < 0 ) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
218+ // Calculate target pad and time
219+ int32_t target_pad = pad + p_local;
220+ int32_t target_time = time + t_local;
220221
221- float output_value;
222- if (is_boundary) {
223- output_value = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
224- } else {
225- // Coalesced memory access - create position and read charge
226- CfChargePos tmp_pos (target_row, target_pad, target_time);
227- output_value = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge; // Normalize by central charge
228- }
222+ // Optimized boundary check
223+ int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary (target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow ) || (target_time < 0 ) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
229224
230- // Write output with reduced branching
231- if (dtype == 0 ) {
232- clustererNN.mInputData_16 [glo_idx] = (OrtDataType::Float16_t)output_value;
233- } else {
234- clustererNN.mInputData_32 [glo_idx] = output_value;
225+ float output_value;
226+ if (is_boundary) {
227+ output_value = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
228+ } else {
229+ // Coalesced memory access - create position and read charge
230+ CfChargePos tmp_pos (target_row, target_pad, target_time);
231+ output_value = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge; // Normalize by central charge
232+ }
233+
234+ // Write output with reduced branching
235+ if (dtype == 0 ) {
236+ clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)output_value;
237+ } else {
238+ clustererNN.mInputData_32 [write_idx] = output_value;
239+ }
240+ write_idx += clustererNN.mNnClusterizerFullTimeSize ; // Move to next pad position
235241 }
236242 }
237243}
0 commit comments