@@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
5151}
5252
5353template <>
54- GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC , uint32_t batchStart)
54+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC , uint32_t batchStart)
5555{
5656 uint32_t glo_idx = get_global_id (0 );
5757 auto & clusterer = processors.tpcClusterer [sector];
@@ -111,7 +111,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
111111}
112112
113113template <>
114- GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC , uint32_t batchStart)
114+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC , uint32_t batchStart)
115115{
116116 uint32_t glo_idx = get_global_id (0 );
117117 auto & clusterer = processors.tpcClusterer [sector];
@@ -126,11 +126,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
126126
127127 if (clustererNN.mNnClusterizerAddIndexData && (int32_t )transient_index == (clustererNN.mNnClusterizerElementSize - 1 )) {
128128 uint32_t top_idx = (base_idx + 1 ) * clustererNN.mNnClusterizerElementSize ;
129- for (uint16_t i = 0 ; i < 8 ; i++) {
130- Delta2 d = cfconsts::InnerNeighbors[i];
131- CfChargePos tmp_pos = peak.delta (d);
132- clustererNN.mClusterFlags [2 * glo_idx] += CfUtils::isPeak (isPeakMap[tmp_pos]);
133- clustererNN.mClusterFlags [2 * glo_idx + 1 ] = clustererNN.mClusterFlags [2 * glo_idx];
129+ if (!clustererNN.mNnClusterizerSetDeconvolutionFlags ) { // Only if deconvolution flags are not set
130+ for (uint16_t i = 0 ; i < 8 ; i++) { // This solution needs testing. It is not the same as the deconvolution flags
131+ Delta2 d = cfconsts::InnerNeighbors[i];
132+ CfChargePos tmp_pos = peak.delta (d);
133+ clustererNN.mClusterFlags [2 * base_idx] += CfUtils::isPeak (isPeakMap[tmp_pos]);
134+ }
135+ clustererNN.mClusterFlags [2 * base_idx + 1 ] = clustererNN.mClusterFlags [2 * base_idx];
134136 }
135137 if (dtype == 0 ) {
136138 clustererNN.mInputData_16 [top_idx - 3 ] = (OrtDataType::Float16_t)(sector / 36 .f );
@@ -147,40 +149,40 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
147149 bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1 )) || ((row + r) < 0 );
148150 if (is_row_boundary) {
149151 if (dtype == 0 ) {
150- clustererNN.mInputData_16 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue ));
152+ clustererNN.mInputData_16 [glo_idx ] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue ));
151153 } else {
152- clustererNN.mInputData_32 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
154+ clustererNN.mInputData_32 [glo_idx ] = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
153155 }
154156 } else {
155157 int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset (row, clustererNN.mNnClusterizerSizeInputRow );
156158 int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset (row, row + r);
157159 int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1 ) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1 ));
158160 int32_t p = CAMath::Floor (rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1 )) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
159- int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1 )) - clustererNN.mNnClusterizerSizeInputTime ;
161+ int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1 )) - clustererNN.mNnClusterizerSizeInputTime + time ;
160162
161- bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary (row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow ) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU);
163+ bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary (row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow ) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);
162164
163165 if (!is_boundary) {
164166 float central_charge = static_cast <float >(chargeMap[peak].unpack ());
165- CfChargePos tmp_pos (row + r, pad + p, time + t );
167+ CfChargePos tmp_pos (row + r, pad + p, time_pos );
166168 if (dtype == 0 ) {
167- clustererNN.mInputData_16 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = (OrtDataType::Float16_t)(static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge);
169+ clustererNN.mInputData_16 [glo_idx ] = (OrtDataType::Float16_t)(static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge);
168170 } else if (dtype == 1 ) {
169- clustererNN.mInputData_32 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge;
171+ clustererNN.mInputData_32 [glo_idx ] = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge;
170172 }
171173 } else {
172174 if (dtype == 0 ) {
173- clustererNN.mInputData_16 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue ));
175+ clustererNN.mInputData_16 [glo_idx ] = (OrtDataType::Float16_t)(static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue ));
174176 } else {
175- clustererNN.mInputData_32 [base_idx * clustererNN. mNnClusterizerElementSize + transient_index ] = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
177+ clustererNN.mInputData_32 [glo_idx ] = static_cast <float >(clustererNN.mNnClusterizerBoundaryFillValue );
176178 }
177179 }
178180 }
179181 }
180182}
181183
182184template <>
183- GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC , uint32_t batchStart)
185+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC , uint32_t batchStart)
184186{
185187 uint32_t glo_idx = get_global_id (0 );
186188 if (dtype == 0 ) {
@@ -191,7 +193,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
191193}
192194
193195template <>
194- GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC , uint32_t batchStart)
196+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC , uint32_t batchStart)
195197{
196198 auto & clustererNN = processors.tpcNNClusterer [sector];
197199 uint32_t glo_idx = get_global_id (0 );
@@ -457,6 +459,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
457459 }
458460}
459461
462+ // ---------------------------------
463+ template <>
464+ GPUdii () void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) // NOTE(review): `uint batchStart` — sibling kernels in this file declare it `uint32_t`; confirm this compiles the same on all GPU backends
465+ {
466+ // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel
467+ uint32_t idx = get_global_id (0 ); // one thread per filtered peak; batchStart offsets into the peak list
468+ auto & clusterer = processors.tpcClusterer [sector];
469+ auto & clustererNN = processors.tpcNNClusterer [sector];
470+ CfArray2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
471+ CfChargePos peak = clusterer.mPfilteredPeakPositions [idx + batchStart]; // NOTE(review): no bounds check on idx + batchStart vs. the number of filtered peaks — TODO confirm the launch grid guarantees this
472+ 
473+ for (int i = 0 ; i < 8 ; i++) { // 3x3 inner neighborhood around the peak
474+ Delta2 d = cfconsts::InnerNeighbors[i];
475+ CfChargePos tmp_pos = peak.delta (d);
476+ PackedCharge charge = chargeMap[tmp_pos];
477+ clustererNN.mClusterFlags [2 * idx] += (d.y != 0 && charge.isSplit ()); // flag slot 0 counts split charges offset in the d.y direction; assumes mClusterFlags was zeroed beforehand — TODO confirm
478+ clustererNN.mClusterFlags [2 * idx + 1 ] += (d.x != 0 && charge.isSplit ()); // flag slot 1: same, d.x direction
479+ }
480+ for (int i = 0 ; i < 16 ; i++) { // 5x5 outer ring: also require the charge is not adjacent to a 3x3 peak
481+ Delta2 d = cfconsts::OuterNeighbors[i];
482+ CfChargePos tmp_pos = peak.delta (d);
483+ PackedCharge charge = chargeMap[tmp_pos];
484+ clustererNN.mClusterFlags [2 * idx] += (d.y != 0 && charge.isSplit () && !charge.has3x3Peak ());
485+ clustererNN.mClusterFlags [2 * idx + 1 ] += (d.x != 0 && charge.isSplit () && !charge.has3x3Peak ());
486+ }
487+ }
488+
460489// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary
461490GPUd () int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
462491{
0 commit comments