@@ -72,15 +72,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   int32_t time = static_cast<int>(peak.time());
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
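+  // Loop-invariant geometry hoisted out of the window loops below
+  // (rows 0-62 are the IROC, row 63 is the first OROC row).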
+  const int32_t iroc_row = 63 + row_offset;
+  const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
+  const int32_t npads_row = GPUTPCGeometry::NPads(row);
 
   for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
     int32_t target_row = row + r;
     bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
     int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+    int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset - clustererNN.mNnClusterizerSizeInputRow);
 
     for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
       int32_t target_pad = pad + p;
-      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
+      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
 
       for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
         int32_t target_time = time + t;
@@ -143,125 +147,119 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
 
-  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) {
+  if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) {
     return;
   }
 
-  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
-  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);
+  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads;
+  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads);
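+  // base_idx selects the cluster within this batch; transient_index is this
+  // thread's element inside that cluster's input window.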
 
   // Early exit for out-of-bounds threads
-  if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
-    return;
-  }
+  // if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
+  //   return;
+  // }
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
 
   // Use dedicated neural network shared memory arrays for warp-level caching
   // First thread in each warp loads shared data
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  float central_charge = static_cast<float>(chargeMap[peak].unpack());
+  float central_charge = chargeMap[peak].unpack();
   int32_t row = static_cast<int>(peak.row());
   int32_t pad = static_cast<int>(peak.pad());
   int32_t time = static_cast<int>(peak.time());
 
+  const int32_t npads_row = GPUTPCGeometry::NPads(row);
+
   // Handle index data with fewer branches
   if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
-    int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
-    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
-
-    float index_values[3] = {
-      static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
-      static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
-      static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
-
+    // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
+    // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
+    //
+    // float index_values[3] = {
+    //   static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
+    //   static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW,
+    //   static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
+    //
+    // if (dtype == 0) {
+    //   clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
+    // } else {
+    //   clustererNN.mInputData_32[write_idx] = index_values[data_idx];
+    // }
+    //
+    // // Handle deconvolution flags only once per cluster (last thread in element)
+    // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
+    //   uint8_t cluster_flags = 0;
+    //   for (uint16_t i = 0; i < 8; i++) {
+    //     Delta2 d = cfconsts::InnerNeighbors[i];
+    //     CfChargePos tmp_pos = peak.delta(d);
+    //     cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
+    //   }
+    //   clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
+    //   clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
+    // }
+    // return;
+    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize;
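+    // Append the three normalized index features (sector, row, pad) after the charge window.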
     if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / npads_row);
     } else {
-      clustererNN.mInputData_32[write_idx] = index_values[data_idx];
-    }
-
-    // Handle deconvolution flags only once per cluster (last thread in element)
-    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) {
-      uint8_t cluster_flags = 0;
-      for (uint16_t i = 0; i < 8; i++) {
-        Delta2 d = cfconsts::InnerNeighbors[i];
-        CfChargePos tmp_pos = peak.delta(d);
-        cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
-      }
-      clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
-      clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
+      clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
+      clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
+      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / npads_row;
     }
-    return;
   }
 
   // Main data processing - optimize index calculations
   if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
     // Optimize 3D index calculation
-    int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
-    int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
-    int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
-    int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
+    const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
+    const int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
     int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
 
     // Early boundary check for row
-    int32_t target_row = row + r_local;
-    int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
+    const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow;
+    const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
+    const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime;
+    const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
+    const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide
 
     // Calculate offsets
-    int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-    int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
-    for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
+    // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+    // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+    const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+    const int32_t iroc_row = 63 + row_offset;
+    const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;
+    const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+    const int32_t boundary_row = target_row + row_offset;
+    const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow);
+    const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue;
+
+    float output_value = boundary_val;
+
+    const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local;
+    const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local;
+
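+    // Pad window: 2 * mNnClusterizerSizeInputPad + 1 pads, centered at pad + padOffset(row, target_row).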
+    for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) {
       if (is_row_boundary) {
-        // Use boundary fill value
-        float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
-        if (dtype == 0) {
-          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
+        output_value = boundary_val;
+      } else {
+        const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference);
+        if (!is_boundary) {
+          CfChargePos pos(target_row, target_pad, target_time);
+          // one load + one multiply
+          output_value = chargeMap[pos].unpack() * inverse_central_charge;
         } else {
-          clustererNN.mInputData_32[write_idx] = boundary_val;
+          output_value = boundary_val;
         }
-        write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
-        continue;
-      }
-
-      // Calculate target pad and time
-      int32_t target_pad = pad + p_local;
-      int32_t target_time = time + t_local;
-
-      // Optimized boundary check
-      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
-
-      float output_value;
-      if (is_boundary) {
-        output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
-      } else {
-        // Coalesced memory access - create position and read charge
-        CfChargePos tmp_pos(target_row, target_pad, target_time);
-        output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
       }
-
-      // Write output with reduced branching
       if (dtype == 0) {
         clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
       } else {
         clustererNN.mInputData_32[write_idx] = output_value;
       }
-      // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) {
-      //   printf("Error: Write index out of bounds (central array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
-      //          write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
-      // }
-      // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) {
-      //   if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) {
-      //     printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
-      //            write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
-      //   }
-      //   if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) {
-      //     printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n",
-      //            write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local);
-      //   }
-      // }
-
-      write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
+      write_idx += clustererNN.mNnClusterizerFullTimeSize;
     }
   }
 }
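For orientation, the indexing in this hunk implies the following flattened per-cluster layout. This is a sketch reconstructed from the writes above, not code from the PR; `chargeIndex()` is a hypothetical helper, and `mNnClusterizerPadTimeSize == nPads * mNnClusterizerFullTimeSize` is an assumption.

```cpp
#include <cstdint>

// Hypothetical helper illustrating the write_idx arithmetic above (not part of the PR).
// Per-cluster element layout: [charges, row-major over (row_idx, pad_idx, time_idx)]
//                             [optional index features: sector, row, pad]
inline uint32_t chargeIndex(uint32_t base_idx, int32_t row_idx, int32_t pad_idx, int32_t time_idx,
                            int32_t elementSize,  // mNnClusterizerElementSize
                            int32_t padTimeSize,  // mNnClusterizerPadTimeSize
                            int32_t fullTimeSize) // mNnClusterizerFullTimeSize
{
  // write_idx starts at base_idx * elementSize + row_idx * padTimeSize + time_idx
  // and is advanced by fullTimeSize once per pad iteration:
  return base_idx * elementSize + row_idx * padTimeSize + pad_idx * fullTimeSize + time_idx;
}
```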
@@ -275,6 +273,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
   if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) {
     return;
   }
+  if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) {
+    printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n",
+           glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters);
+  }
   if (clustererNN.mNnClusterizerUseClassification) {
     if (dtype == 0) {
       clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold);
@@ -364,6 +366,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
     return;
   }
 
+  if (full_glo_idx >= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) {
+    printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n",
+           full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart);
+  }
+
   tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
 
   ClusterAccumulator pc;
@@ -737,16 +744,16 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset
   return (row > 62 ? offset : 0);
 }
 
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset)
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference)
 {
-  if (pad < 0 || row < 0) { // Faster short-circuit
+  if (pad < 0) { // Faster short-circuit; both call sites already reject target_row < 0
     return true;
   } else if (row < 63) {
-    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
-  } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
+    return (pad >= npads_row);
+  } else if (row < iroc_row) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
     return true;
-  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
-    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - offset)));
+  } else if (row < maxrow) {
+    return (pad >= npads_reference);
   } else {
     return true;
   }
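Both call sites above precompute the geometry arguments once per cluster or row iteration. Condensed, the caller-side contract for the new signature looks like this (restated from the hunks above, not new code; `sizeInputRow` stands in for `clustererNN.mNnClusterizerSizeInputRow`):

```cpp
// Caller-side setup for the refactored isBoundary():
const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, sizeInputRow); // 0 for IROC peaks, sizeInputRow for OROC peaks
const int32_t iroc_row = 63 + row_offset;                                            // first OROC row in the shifted row coordinate
const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset;             // one past the last valid shifted row
const int32_t npads_row = GPUTPCGeometry::NPads(row);                                // pad count of the peak's row

for (int32_t r = -sizeInputRow; r <= sizeInputRow; ++r) {
  const int32_t target_row = row + r;
  const bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
  const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset - sizeInputRow);
  // per-pad check, short-circuiting on the row test:
  // bool boundary = is_row_boundary || isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
}
```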