@@ -77,7 +77,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
7777 if (!is_boundary) {
7878 ChargePos tmp_pos (row + r, pad + p, time + t);
7979 if (r == 0 && !clustererNN.clusterFlags [2 * glo_idx] && CAMath::Abs (p) < 3 && CAMath::Abs (t) < 3 && p != 0 && t != 0 ) { // ordering is done for short circuit optimization
80- clustererNN.clusterFlags [2 * glo_idx] = CfUtils::isPeak (isPeakMap[tmp_pos]);
80+ clustererNN.clusterFlags [2 * glo_idx] + = CfUtils::isPeak (isPeakMap[tmp_pos]);
8181 clustererNN.clusterFlags [2 * glo_idx + 1 ] = clustererNN.clusterFlags [2 * glo_idx];
8282 }
8383 if (dtype == 0 ) {
@@ -99,17 +99,85 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
9999 }
100100 if (clustererNN.nnClusterizerAddIndexData ) {
101101 if (dtype == 0 ) {
102- clustererNN.inputData_16 [write_idx] = (OrtDataType::Float16_t)(clusterer. mISector / 36 .f );
102+ clustererNN.inputData_16 [write_idx] = (OrtDataType::Float16_t)(sector / 36 .f );
103103 clustererNN.inputData_16 [write_idx + 1 ] = (OrtDataType::Float16_t)(row / 152 .f );
104104 clustererNN.inputData_16 [write_idx + 2 ] = (OrtDataType::Float16_t)(static_cast <float >(pad) / GPUTPCGeometry::NPads (row));
105105 } else {
106- clustererNN.inputData_32 [write_idx] = clusterer. mISector / 36 .f ;
106+ clustererNN.inputData_32 [write_idx] = sector / 36 .f ;
107107 clustererNN.inputData_32 [write_idx + 1 ] = row / 152 .f ;
108108 clustererNN.inputData_32 [write_idx + 2 ] = static_cast <float >(pad) / GPUTPCGeometry::NPads (row);
109109 }
110110 }
111111}
112112
113+ template <>
114+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
115+ {
116+ uint glo_idx = get_global_id (0 );
117+ auto & clusterer = processors.tpcClusterer [sector];
118+ auto & clustererNN = processors.tpcNNClusterer [sector];
119+ uint base_idx = CAMath::Floor (glo_idx / clustererNN.nnClusterizerElementSize );
120+ uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize ;
121+
122+ Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
123+ Array2D<uint8_t > isPeakMap (clusterer.mPpeakMap );
124+ ChargePos peak = clusterer.mPfilteredPeakPositions [base_idx + batchStart];
125+ int row = static_cast <int >(peak.row ()), pad = static_cast <int >(peak.pad ());
126+
127+ if (clustererNN.nnClusterizerAddIndexData && transient_index == 0 ) {
128+ uint top_idx = (base_idx + 1 ) * clustererNN.nnClusterizerElementSize ;
129+ for (uint16_t i = 0 ; i < 8 ; i++) {
130+ Delta2 d = cfconsts::InnerNeighbors[i];
131+ ChargePos tmp_pos = peak.delta (d);
132+ clustererNN.clusterFlags [2 * glo_idx] += CfUtils::isPeak (isPeakMap[tmp_pos]);
133+ clustererNN.clusterFlags [2 * glo_idx + 1 ] = clustererNN.clusterFlags [2 * glo_idx];
134+ }
135+ if (dtype == 0 ) {
136+ clustererNN.inputData_16 [top_idx - 3 ] = (OrtDataType::Float16_t)(sector / 36 .f );
137+ clustererNN.inputData_16 [top_idx - 2 ] = (OrtDataType::Float16_t)(row / 152 .f );
138+ clustererNN.inputData_16 [top_idx - 1 ] = (OrtDataType::Float16_t)(static_cast <float >(pad) / GPUTPCGeometry::NPads (row));
139+ } else {
140+ clustererNN.inputData_32 [top_idx - 3 ] = sector / 36 .f ;
141+ clustererNN.inputData_32 [top_idx - 2 ] = row / 152 .f ;
142+ clustererNN.inputData_32 [top_idx - 1 ] = static_cast <float >(pad) / GPUTPCGeometry::NPads (row);
143+ }
144+ } else {
145+ int time = static_cast <int >(peak.time ());
146+ int r = CAMath::Floor (transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1 ) * (2 * clustererNN.nnClusterizerSizeInputTime + 1 ))) - clustererNN.nnClusterizerSizeInputRow ;
147+ bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1 )) || ((row + r) < 0 );
148+ if (is_row_boundary) {
149+ if (dtype == 0 ) {
150+ clustererNN.inputData_16 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.nnClusterizerBoundaryFillValue ));
151+ } else {
152+ clustererNN.inputData_32 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast <float >(clustererNN.nnClusterizerBoundaryFillValue );
153+ }
154+ } else {
155+ int row_offset = GPUTPCNNClusterizerKernels::rowOffset (row, clustererNN.nnClusterizerSizeInputRow );
156+ int pad_offset = GPUTPCNNClusterizerKernels::padOffset (row, row + r);
157+ int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1 ) * (2 * clustererNN.nnClusterizerSizeInputTime + 1 ));
158+ int p = CAMath::Floor (rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1 )) - clustererNN.nnClusterizerSizeInputPad + pad_offset;
159+ bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary (row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow );
160+
161+ if (!is_boundary) {
162+ float central_charge = static_cast <float >(chargeMap[peak].unpack ());
163+ int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1 )) - clustererNN.nnClusterizerSizeInputTime ;
164+ ChargePos tmp_pos (row + r, pad + p, time + t);
165+ if (dtype == 0 ) {
166+ clustererNN.inputData_16 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge);
167+ } else if (dtype == 1 ) {
168+ clustererNN.inputData_32 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge;
169+ }
170+ } else {
171+ if (dtype == 0 ) {
172+ clustererNN.inputData_16 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.nnClusterizerBoundaryFillValue ));
173+ } else {
174+ clustererNN.inputData_32 [base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast <float >(clustererNN.nnClusterizerBoundaryFillValue );
175+ }
176+ }
177+ }
178+ }
179+ }
180+
113181template <>
114182GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
115183{
0 commit comments