2626using namespace o2 ::gpu;
2727using namespace o2 ::gpu::tpccf;
2828
29+ // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
2930template <>
3031GPUdii () void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
3132{
@@ -58,7 +59,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass2La
5859{
5960 uint glo_idx = get_global_id (0 );
6061 auto elem_iterator = clusterer.modelProbabilities .begin () + (unsigned int )(glo_idx * clusterer.model_class .getNumOutputNodes ()[0 ][1 ]);
61- uint class_label = std::distance (elem_iterator, std::max_element (elem_iterator, elem_iterator + clusterer.model_class .getNumOutputNodes ()[0 ][1 ]));
62+ uint class_label = std::distance (elem_iterator, std::max_element (elem_iterator, elem_iterator + clusterer.model_class .getNumOutputNodes ()[0 ][1 ])); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
6263 clusterer.outputDataClass [glo_idx + batchStart] = class_label;
6364}
6465
@@ -107,6 +108,7 @@ void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtyp
107108 }
108109}
109110
111+ // The following arithmetic is done because the network is trained with a split at the IROC/OROC boundary
110112int GPUTPCNNClusterizer::padOffset (int row_ref, int row_current, const GPUTPCGeometry& geo)
111113{
112114 return (int )((geo.NPads (row_current) - geo.NPads (row_ref)) / 2 );
@@ -117,7 +119,6 @@ int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
117119 return (row > 62 ? global_shift : 0 );
118120}
119121
120- // ---------------------------------
121122bool GPUTPCNNClusterizer::isBoundary (int row, int pad, int global_shift, const GPUTPCGeometry& geo)
122123{
123124 if (pad < 0 || row < 0 ) { // Faster short-circuit
@@ -133,24 +134,25 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
133134 }
134135}
135136
136- // ---------------------------------
137+ // Filling the input data for the neural network where there is no boundary
137138GPUd () void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, int8_t dtype, uint batchStart)
138139{
139140
140141 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
141142
142143 uint glo_idx = get_global_id (0 );
143144
144- uint write_idx = glo_idx * clusterer.nnClusterizerElementSize ; // For optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
145+ uint write_idx = glo_idx * clusterer.nnClusterizerElementSize ; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
145146
146147 ChargePos peak = clusterer.mPfilteredPeakPositions [glo_idx + batchStart];
147- int row = static_cast <int >(peak.row ()), pad = static_cast <int >(peak.pad ()), time = static_cast <int >(peak.time ());
148+ int row = static_cast <int >(peak.row ()), pad = static_cast <int >(peak.pad ()), time = static_cast <int >(peak.time ()); // Explicit casting to avoid conversion errors
148149 float central_charge = static_cast <float >(chargeMap[peak].unpack ());
149150
150151 clusterer.peakPositions [glo_idx] = peak;
151152 clusterer.centralCharges [glo_idx] = central_charge;
152153
153154 int row_offset = GPUTPCNNClusterizer::rowOffset (row, clusterer.nnClusterizerSizeInputRow );
155+ GPUCA_UNROLL (U (), U ());
154156 for (int r = -clusterer.nnClusterizerSizeInputRow ; r <= clusterer.nnClusterizerSizeInputRow ; r++) {
155157 bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1 )) || ((row + r) < 0 );
156158 int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizer::padOffset (row, row + r, clusterer.Param ().tpcGeometry );
@@ -165,6 +167,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
165167 clusterer.inputData32 [write_idx] = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge;
166168 }
167169 } else {
170+ // Filling boundary positions with a fill value so that no input entries are left uninitialized
168171 if (dtype == 0 ){
169172 clusterer.inputData16 [write_idx] = (OrtDataType::Float16_t)(static_cast <float >(clusterer.nnClusterizerBoundaryFillValue ));
170173 } else {
@@ -188,7 +191,6 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
188191 }
189192}
190193
191- // ---------------------------------
192194GPUd () void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
193195{
194196 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
@@ -204,6 +206,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
204206
205207 ClusterAccumulator pc;
206208
209+ // Publishing logic is taken from the default clusterizer
207210 if (onlyMC) {
208211 ClusterAccumulator dummy_pc;
209212 CPU_ONLY (labelAcc->collect (clusterer.peakPositions [glo_idx], chargeMap[clusterer.peakPositions [glo_idx]].unpack ()));
@@ -252,10 +255,14 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
252255 rowIndex = clusterer.mPclusterPosInRow [full_glo_idx];
253256 }
254257 CPU_ONLY (labelAcc->commit (clusterer.peakPositions [glo_idx].row (), rowIndex, clusterer.mNMaxClusterPerRow ));
258+ } else {
259+ if (clusterer.mPclusterPosInRow ) {
260+ clusterer.mPclusterPosInRow [full_glo_idx] = clusterer.mNMaxClusterPerRow ;
261+ }
262+ return ;
255263 }
256264}
257265
258- // ---------------------------------
259266GPUd () void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
260267{
261268 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
@@ -353,5 +360,10 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo
353360 rowIndex = clusterer.mPclusterPosInRow [full_glo_idx];
354361 }
355362 // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
363+ } else {
364+ if (clusterer.mPclusterPosInRow ) {
365+ clusterer.mPclusterPosInRow [full_glo_idx] = clusterer.mNMaxClusterPerRow ;
366+ }
367+ return ;
356368 }
357369}
0 commit comments