@@ -641,6 +641,34 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
641641 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings ().nn ;
642642 GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings ().nTPCClustererLanes ];
643643
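644+ // Maximum of 4 lanes supported: nnTimers holds 3 timers per lane (classification, regression, regression2), indexed as 3 * lane + {0, 1, 2}; nnFillInputTimers and nnPublishingTimers hold 1 timer per lane, indexed by lane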
645+ HighResTimer* nnTimers[12 ] = {
646+ &getTimer<GPUTPCNNClusterizer, 0 >(" GPUTPCNNClusterizer_ONNXClassification_0_" , 0 ),
647+ &getTimer<GPUTPCNNClusterizer, 1 >(" GPUTPCNNClusterizer_ONNXRegression_1_" , 1 ),
648+ &getTimer<GPUTPCNNClusterizer, 2 >(" GPUTPCNNClusterizer_ONNXRegression2_2_" , 2 ),
649+ &getTimer<GPUTPCNNClusterizer, 3 >(" GPUTPCNNClusterizer_ONNXClassification_0_" , 3 ),
650+ &getTimer<GPUTPCNNClusterizer, 4 >(" GPUTPCNNClusterizer_ONNXRegression_1_" , 4 ),
651+ &getTimer<GPUTPCNNClusterizer, 5 >(" GPUTPCNNClusterizer_ONNXRegression2_2_" , 5 ),
652+ &getTimer<GPUTPCNNClusterizer, 6 >(" GPUTPCNNClusterizer_ONNXClassification_0_" , 6 ),
653+ &getTimer<GPUTPCNNClusterizer, 7 >(" GPUTPCNNClusterizer_ONNXRegression_1_" , 7 ),
654+ &getTimer<GPUTPCNNClusterizer, 8 >(" GPUTPCNNClusterizer_ONNXRegression2_2_" , 8 ),
655+ &getTimer<GPUTPCNNClusterizer, 9 >(" GPUTPCNNClusterizer_ONNXClassification_0_" , 9 ),
656+ &getTimer<GPUTPCNNClusterizer, 10 >(" GPUTPCNNClusterizer_ONNXRegression_1_" , 10 ),
657+ &getTimer<GPUTPCNNClusterizer , 11 >(" GPUTPCNNClusterizer_ONNXRegression2_2_" , 11 )
658+ };
659+ HighResTimer* nnFillInputTimers[4 ] {
660+ &getTimer<GPUTPCNNClusterizer, 0 >(" GPUTPCNNClusterizer_fillInputNNSingleElement_0_" , 0 ),
661+ &getTimer<GPUTPCNNClusterizer, 1 >(" GPUTPCNNClusterizer_fillInputNNSingleElement_1_" , 1 ),
662+ &getTimer<GPUTPCNNClusterizer, 2 >(" GPUTPCNNClusterizer_fillInputNNSingleElement_2_" , 2 ),
663+ &getTimer<GPUTPCNNClusterizer, 3 >(" GPUTPCNNClusterizer_fillInputNNSingleElement_3_" , 3 )
664+ };
665+ HighResTimer* nnPublishingTimers[4 ] {
666+ &getTimer<GPUTPCNNClusterizer, 0 >(" GPUTPCNNClusterizer_publish_0_" , 0 ),
667+ &getTimer<GPUTPCNNClusterizer, 1 >(" GPUTPCNNClusterizer_publish_1_" , 1 ),
668+ &getTimer<GPUTPCNNClusterizer, 2 >(" GPUTPCNNClusterizer_publish_2_" , 2 ),
669+ &getTimer<GPUTPCNNClusterizer, 3 >(" GPUTPCNNClusterizer_publish_3_" , 3 )
670+ };
671+
644672 if (GetProcessingSettings ().nn .applyNNclusterizer ) {
645673 int32_t deviceId = -1 ;
646674 int32_t numLanes = GetProcessingSettings ().nTPCClustererLanes ;
@@ -1001,7 +1029,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10011029 size_t iSize = CAMath::Min ((uint)clustererNNShadow.mNnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
10021030
10031031 // auto start0 = std::chrono::high_resolution_clock::now();
1032+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnFillInputTimers[lane]->Start (); }
10041033 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid (iSize * clustererNNShadow.mNnClusterizerElementSize , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , withMC, batchStart); // Filling the data
1034+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnFillInputTimers[lane]->Stop (); }
10051035
10061036 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags ) {
10071037 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , withMC, batchStart); // Filling the regression data
@@ -1011,6 +1041,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10111041 // auto start1 = std::chrono::high_resolution_clock::now();
10121042
10131043 // NN evaluations
1044+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane]->Start (); }
10141045 if (clustererNNShadow.mNnInferenceInputDType == 0 ) {
10151046 if (clustererNNShadow.mNnInferenceOutputDType == 0 ) {
10161047 (nnApplication.mModelClass ).inference (clustererNNShadow.mInputData_16 , iSize, clustererNNShadow.mModelProbabilities_16 );
@@ -1024,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10241055 (nnApplication.mModelClass ).inference (clustererNNShadow.mInputData_32 , iSize, clustererNNShadow.mModelProbabilities_32 );
10251056 }
10261057 }
1058+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane]->Stop (); }
10271059 if (!clustererNNShadow.mNnClusterizerUseCfRegression ) {
1060+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane + 1 ]->Start (); }
10281061 if (clustererNNShadow.mNnInferenceInputDType == 0 ) {
10291062 if (clustererNNShadow.mNnInferenceOutputDType == 0 ) {
10301063 (nnApplication.mModelReg1 ).inference (clustererNNShadow.mInputData_16 , iSize, clustererNNShadow.mOutputDataReg1_16 );
@@ -1038,7 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10381071 (nnApplication.mModelReg1 ).inference (clustererNNShadow.mInputData_32 , iSize, clustererNNShadow.mOutputDataReg1_32 );
10391072 }
10401073 }
1074+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane + 1 ]->Stop (); }
10411075 if (nnApplication.mModelClass .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.mModelReg2 .isInitialized ()) {
1076+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane + 2 ]->Start (); }
10421077 if (clustererNNShadow.mNnInferenceInputDType == 0 ) {
10431078 if (clustererNNShadow.mNnInferenceOutputDType == 0 ) {
10441079 (nnApplication.mModelReg2 ).inference (clustererNNShadow.mInputData_16 , iSize, clustererNNShadow.mOutputDataReg2_16 );
@@ -1052,12 +1087,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10521087 (nnApplication.mModelReg2 ).inference (clustererNNShadow.mInputData_32 , iSize, clustererNNShadow.mOutputDataReg2_32 );
10531088 }
10541089 }
1090+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnTimers[3 *lane + 2 ]->Stop (); }
10551091 }
10561092 }
10571093
10581094 // auto stopNNs = std::chrono::high_resolution_clock::now();
10591095
10601096 // Publishing kernels
1097+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnPublishingTimers[lane]->Start (); }
10611098 if (nnApplication.mModelClass .getNumOutputNodes ()[0 ][1 ] == 1 ) {
10621099 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType , withMC, batchStart); // Assigning class labels
10631100 } else {
@@ -1069,6 +1106,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10691106 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType , withMC, batchStart); // Publishing class 2 regression results
10701107 }
10711108 }
1109+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnPublishingTimers[lane]->Stop (); }
10721110
10731111 // for(int i = 0; i < iSize; ++i) {
10741112 // if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) {
@@ -1090,7 +1128,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10901128 }
10911129 if (clustererNNShadow.mNnClusterizerUseCfRegression ) {
10921130 // auto start1 = std::chrono::high_resolution_clock::now();
1131+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnPublishingTimers[lane]->Start (); }
10931132 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , withMC, 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
1133+ if (GetProcessingSettings ().debugLevel >= 1 ) { nnPublishingTimers[lane]->Stop (); }
10941134 // auto stop1 = std::chrono::high_resolution_clock::now();
10951135 // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
10961136 }
0 commit comments