@@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
5757 auto & clusterer = processors.tpcClusterer [sector];
5858 auto & clustererNN = processors.tpcNNClusterer [sector];
5959
60- uint32_t glo_idx = get_global_id (0 );
60+ const uint32_t glo_idx = get_global_id (0 );
6161 if (glo_idx + batchStart >= clusterer.mPmemory ->counters .nClusters || glo_idx >= (uint32_t )clustererNN.mNnClusterizerBatchedMode ) {
6262 return ;
6363 }
@@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
6767 CfArray2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
6868 CfArray2D<uint8_t > isPeakMap (clusterer.mPpeakMap );
6969 CfChargePos peak = clusterer.mPfilteredPeakPositions [CAMath::Min (glo_idx + batchStart, (uint32_t )(clusterer.mPmemory ->counters .nClusters - 1 ))];
70- int32_t row = static_cast <int >(peak.row ());
71- int32_t pad = static_cast <int >(peak.pad ());
72- int32_t time = static_cast <int >(peak.time ());
73- float central_charge = static_cast <float >(chargeMap[peak].unpack ());
74- int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset (row, clustererNN.mNnClusterizerSizeInputRow );
70+ const int32_t row = static_cast <int >(peak.row ());
71+ const int32_t pad = static_cast <int >(peak.pad ());
72+ const int32_t time = static_cast <int >(peak.time ());
73+ const float central_charge = static_cast <float >(chargeMap[peak].unpack ());
74+ const float inverse_charge = 1 .f / central_charge;
75+
76+ const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset (row, clustererNN.mNnClusterizerSizeInputRow );
7577 const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow ;
7678 const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow ;
7779 const int32_t npads_row = GPUTPCGeometry::NPads (row);
80+ float output_value = clustererNN.mNnClusterizerBoundaryFillValue ;
7881
79- for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow ; r <= clustererNN.mNnClusterizerSizeInputRow ; ++r) {
80- int32_t target_row = row + r;
81- bool is_row_boundary = (target_row < 0 ) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
82- int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset (row, target_row);
83- int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads (target_row + row_offset);
82+ for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) {
83+ uint8_t is_boundary = (target_row < 0 ) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
84+ const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset (row, target_row));
85+ const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads (target_row - row_offset);
8486
85- for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
86- int32_t target_pad = pad + p;
87- bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary (target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
88-
89- for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime ; t <= clustererNN.mNnClusterizerSizeInputTime ; ++t) {
90- int32_t target_time = time + t;
87+ for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) {
88+ is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary (target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference);
9189
90+ for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) {
9291 if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin ) {
9392 // Fill boundary value
94- float boundary_value = static_cast < float >( clustererNN.mNnClusterizerBoundaryFillValue ) ;
93+ output_value = clustererNN.mNnClusterizerBoundaryFillValue ;
9594 if (dtype == 0 ) {
96- clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)boundary_value ;
95+ clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)output_value ;
9796 } else {
98- clustererNN.mInputData_32 [write_idx] = boundary_value ;
97+ clustererNN.mInputData_32 [write_idx] = output_value ;
9998 }
10099 } else {
101100 CfChargePos tmp_pos (target_row, target_pad, target_time);
102- float normalized_charge = static_cast < float >( chargeMap[tmp_pos].unpack ()) / central_charge ;
101+ output_value = chargeMap[tmp_pos].unpack () * inverse_charge ;
103102 if (dtype == 0 ) {
104- clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)normalized_charge ;
103+ clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)output_value ;
105104 } else {
106- clustererNN.mInputData_32 [write_idx] = normalized_charge ;
105+ clustererNN.mInputData_32 [write_idx] = output_value ;
107106 }
108107 }
109108 // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
119118 if (dtype == 0 ) {
120119 clustererNN.mInputData_16 [write_idx] = (OrtDataType::Float16_t)(static_cast <float >(sector) / o2::tpc::constants::MAXSECTOR);
121120 clustererNN.mInputData_16 [write_idx + 1 ] = (OrtDataType::Float16_t)(static_cast <float >(row) / o2::tpc::constants::MAXGLOBALPADROW);
122- clustererNN.mInputData_16 [write_idx + 2 ] = (OrtDataType::Float16_t)(static_cast <float >(pad) / GPUTPCGeometry::NPads (row) );
121+ clustererNN.mInputData_16 [write_idx + 2 ] = (OrtDataType::Float16_t)(static_cast <float >(pad) / npads_row );
123122 } else {
124123 clustererNN.mInputData_32 [write_idx] = static_cast <float >(sector) / o2::tpc::constants::MAXSECTOR;
125124 clustererNN.mInputData_32 [write_idx + 1 ] = static_cast <float >(row) / o2::tpc::constants::MAXGLOBALPADROW;
126- clustererNN.mInputData_32 [write_idx + 2 ] = static_cast <float >(pad) / GPUTPCGeometry::NPads (row) ;
125+ clustererNN.mInputData_32 [write_idx + 2 ] = static_cast <float >(pad) / npads_row ;
127126 }
128127 }
129128
@@ -143,16 +142,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
143142template <>
144143GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
145144{
146- uint32_t glo_idx = get_global_id (0 );
145+ const uint32_t glo_idx = get_global_id (0 );
147146 auto & clusterer = processors.tpcClusterer [sector];
148147 auto & clustererNN = processors.tpcNNClusterer [sector];
149148
150149 if (glo_idx >= (uint32_t )clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads ) {
151150 return ;
152151 }
153152
154- uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads ;
155- uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads );
153+ const uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads ;
154+ const uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads );
156155
157156 // Early exit for out-of-bounds threads
158157 if (base_idx + batchStart >= clusterer.mPmemory ->counters .nClusters ) {
@@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
164163 // Use dedicated neural network shared memory arrays for warp-level caching
165164 // First thread in each warp loads shared data
166165 CfChargePos peak = clusterer.mPfilteredPeakPositions [CAMath::Min (base_idx + batchStart, (uint32_t )(clusterer.mPmemory ->counters .nClusters - 1 ))];
167- float central_charge = chargeMap[peak].unpack ();
168- int32_t row = static_cast <int >(peak.row ());
169- int32_t pad = static_cast <int >(peak.pad ());
170- int32_t time = static_cast <int >(peak.time ());
166+ const float central_charge = chargeMap[peak].unpack ();
167+ const int32_t row = static_cast <int >(peak.row ());
168+ const int32_t pad = static_cast <int >(peak.pad ());
169+ const int32_t time = static_cast <int >(peak.time ());
171170
172171 // Handle index data with fewer branches
173172 if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize ) {