@@ -53,47 +53,58 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint32_t glo_idx = get_global_id(0);
-
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
 
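+  // One thread handles one cluster candidate; batchStart offsets this batch into the full peak list.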
+  uint32_t glo_idx = get_global_id(0);
   if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
     return;
   }
-  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
+
+  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
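+  // write_idx is the start of this candidate's slot in the flattened NN input buffer (mNnClusterizerElementSize entries per candidate).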
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  int32_t row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)];
+  int32_t row = static_cast<int>(peak.row());
+  int32_t pad = static_cast<int>(peak.pad());
+  int32_t time = static_cast<int>(peak.time());
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
 
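+  // Fill the NN input with the charge window around the peak: rows r, pads p, and time bins t around (row, pad, time).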
-  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
-    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
-    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
-      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
-      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
-        int32_t time_pos = time + t;
-        if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) {
-          // Filling boundary just to make sure that no values are left unintentionally
+  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
+    int32_t target_row = row + r;
+    bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
+    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+
+    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
+      int32_t target_pad = pad + p;
+      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
+
+      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
+        int32_t target_time = time + t;
+
+        if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) {
+          // Fill boundary value
+          float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
           } else {
-            clustererNN.mInputData_32[write_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+            clustererNN.mInputData_32[write_idx] = boundary_value;
           }
         } else {
-          CfChargePos tmp_pos(row + r, pad + p, time + t);
-          if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
+          CfChargePos tmp_pos(target_row, target_pad, target_time);
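+          // Charges are normalized to the central peak charge, so the peak itself maps to 1.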
+          float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+
+          if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
             clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
             clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
           }
+
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
-          } else if (dtype == 1) {
-            clustererNN.mInputData_32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
+          } else {
+            clustererNN.mInputData_32[write_idx] = normalized_charge;
           }
         }
         // if((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
@@ -104,21 +115,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       }
     }
   }
+
   if (clustererNN.mNnClusterizerAddIndexData) {
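+    // Append normalized position information: 36 TPC sectors, 152 pad rows, and the per-row pad count from the geometry.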
+    float sector_norm = sector / 36.f;
+    float row_norm = row / 152.f;
+    float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+
     if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f);
-      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
+      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
     } else {
-      clustererNN.mInputData_32[write_idx] = sector / 36.f;
-      clustererNN.mInputData_32[write_idx + 1] = row / 152.f;
-      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+      clustererNN.mInputData_32[write_idx] = sector_norm;
+      clustererNN.mInputData_32[write_idx + 1] = row_norm;
+      clustererNN.mInputData_32[write_idx + 2] = pad_norm;
     }
   }
+
   if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
     clustererNN.mClusterFlags[2 * glo_idx] = 0;
     clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
-    for (uint16_t i = 0; i < 8; i++) {
+
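+    // Check the 8 direct neighbors of the peak for adjacent peaks to set the cluster (deconvolution) flags.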
+    for (uint16_t i = 0; i < 8; ++i) {
       Delta2 d = cfconsts::InnerNeighbors[i];
       CfChargePos tmp_pos = peak.delta(d);
       clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);