2626using namespace o2 ::gpu;
2727using namespace o2 ::gpu::tpccf;
2828
29+ // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
2930template <>
3031GPUdii () void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::runCfClusterizer>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
3132{
@@ -58,7 +59,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<GPUTPCNNClusterizer::determineClass2La
5859{
5960 uint glo_idx = get_global_id (0 );
6061 auto elem_iterator = clusterer.modelProbabilities .begin () + (unsigned int )(glo_idx * clusterer.model_class .getNumOutputNodes ()[0 ][1 ]);
61- uint class_label = std::distance (elem_iterator, std::max_element (elem_iterator, elem_iterator + clusterer.model_class .getNumOutputNodes ()[0 ][1 ]));
62+ uint class_label = std::distance (elem_iterator, std::max_element (elem_iterator, elem_iterator + clusterer.model_class .getNumOutputNodes ()[0 ][1 ])); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
6263 clusterer.outputDataClass [glo_idx + batchStart] = class_label;
6364}
6465
@@ -107,6 +108,7 @@ void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtyp
107108 }
108109}
109110
111+ // The following arithmetic is done because the network is trained with a split at the IROC/OROC boundary
110112int GPUTPCNNClusterizer::padOffset (int row_ref, int row_current, const GPUTPCGeometry& geo)
111113{
112114 return (int )((geo.NPads (row_current) - geo.NPads (row_ref)) / 2 );
@@ -117,7 +119,6 @@ int GPUTPCNNClusterizer::rowOffset(int row, int global_shift)
117119 return (row > 62 ? global_shift : 0 );
118120}
119121
120- // ---------------------------------
121122bool GPUTPCNNClusterizer::isBoundary (int row, int pad, int global_shift, const GPUTPCGeometry& geo)
122123{
123124 if (pad < 0 || row < 0 ) { // Faster short-circuit
@@ -133,24 +134,25 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G
133134 }
134135}
135136
136- // ---------------------------------
137+ // Filling the input data for the neural network where there is no boundary
137138GPUd () void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, int8_t dtype, uint batchStart)
138139{
139140
140141 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
141142
142143 uint glo_idx = get_global_id (0 );
143144
144- uint write_idx = glo_idx * clusterer.nnClusterizerElementSize ; // For optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
145+ uint write_idx = glo_idx * clusterer.nnClusterizerElementSize ; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
145146
146147 ChargePos peak = clusterer.mPfilteredPeakPositions [glo_idx + batchStart];
147- int row = static_cast <int >(peak.row ()), pad = static_cast <int >(peak.pad ()), time = static_cast <int >(peak.time ());
148+ int row = static_cast <int >(peak.row ()), pad = static_cast <int >(peak.pad ()), time = static_cast <int >(peak.time ()); // Explicit casting to avoid conversion errors
148149 float central_charge = static_cast <float >(chargeMap[peak].unpack ());
149150
150151 clusterer.peakPositions [glo_idx] = peak;
151152 clusterer.centralCharges [glo_idx] = central_charge;
152153
153154 int row_offset = GPUTPCNNClusterizer::rowOffset (row, clusterer.nnClusterizerSizeInputRow );
155+ GPUCA_UNROLL (U (), U ());
154156 for (int r = -clusterer.nnClusterizerSizeInputRow ; r <= clusterer.nnClusterizerSizeInputRow ; r++) {
155157 bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1 )) || ((row + r) < 0 );
156158 int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizer::padOffset (row, row + r, clusterer.Param ().tpcGeometry );
@@ -165,6 +167,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
165167 clusterer.inputData32 [write_idx] = static_cast <float >(chargeMap[tmp_pos].unpack ()) / central_charge;
166168 }
167169 } else {
170+ // Filling boundary positions with a fill value so that no input entries are left uninitialized
168171 if (dtype == 0 ){
169172 clusterer.inputData16 [write_idx] = (OrtDataType::Float16_t)(static_cast <float >(clusterer.nnClusterizerBoundaryFillValue ));
170173 } else {
@@ -188,7 +191,6 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads
188191 }
189192}
190193
191- // ---------------------------------
192194GPUd () void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
193195{
194196 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
@@ -204,6 +206,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
204206
205207 ClusterAccumulator pc;
206208
209+ // Publishing logic is taken from the default clusterizer
207210 if (onlyMC) {
208211 ClusterAccumulator dummy_pc;
209212 CPU_ONLY (labelAcc->collect (clusterer.peakPositions [glo_idx], chargeMap[clusterer.peakPositions [glo_idx]].unpack ()));
@@ -252,10 +255,14 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo
252255 rowIndex = clusterer.mPclusterPosInRow [full_glo_idx];
253256 }
254257 CPU_ONLY (labelAcc->commit (clusterer.peakPositions [glo_idx].row (), rowIndex, clusterer.mNMaxClusterPerRow ));
258+ } else {
259+ if (clusterer.mPclusterPosInRow ) {
260+ clusterer.mPclusterPosInRow [full_glo_idx] = clusterer.mNMaxClusterPerRow ;
261+ }
262+ return ;
255263 }
256264}
257265
258- // ---------------------------------
259266GPUd () void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart)
260267{
261268 Array2D<PackedCharge> chargeMap (reinterpret_cast <PackedCharge*>(clusterer.mPchargeMap ));
@@ -353,5 +360,10 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo
353360 rowIndex = clusterer.mPclusterPosInRow [full_glo_idx];
354361 }
355362 // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters?
363+ } else {
364+ if (clusterer.mPclusterPosInRow ) {
365+ clusterer.mPclusterPosInRow [full_glo_idx] = clusterer.mNMaxClusterPerRow ;
366+ }
367+ return ;
356368 }
357369}
0 commit comments