Switch on NN inference timers on CPU for the first four lanes (debugLevel >= 1)

ChSonnabend · ChSonnabend · commit b756dcc19b4e · 2025-09-12T00:42:17.000+02:00
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1052,7 +1052,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
             // NN evaluations
             if(clustererNNShadow.mNnClusterizerUseClassification) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
@@ -1066,13 +1066,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
               }
             }
             if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1086,9 +1086,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
               if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1102,7 +1102,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
                   }
                 }
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
               }
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";

Original file line number	Diff line number	Diff line change
`@@ -1052,7 +1052,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`1052`	`1052`
`1053`	`1053`	`// NN evaluations`
`1054`	`1054`	`if(clustererNNShadow.mNnClusterizerUseClassification) {`
`1055`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }`
	`1055`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane]->Start(); }`
`1056`	`1056`	`if (clustererNNShadow.mNnInferenceInputDType == 0) {`
`1057`	`1057`	`if (clustererNNShadow.mNnInferenceOutputDType == 0) {`
`1058`	`1058`	`(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);`
`@@ -1066,13 +1066,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`1066`	`1066`	`(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);`
`1067`	`1067`	`}`
`1068`	`1068`	`}`
`1069`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }`
	`1069`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU \|\| lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...`
`1070`	`1070`	`if (nn_settings.nnClusterizerVerbosity > 3) {`
`1071`	`1071`	`LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";`
`1072`	`1072`	`}`
`1073`	`1073`	`}`
`1074`	`1074`	`if (!clustererNNShadow.mNnClusterizerUseCfRegression) {`
`1075`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }`
	`1075`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane + 1]->Start(); }`
`1076`	`1076`	`if (clustererNNShadow.mNnInferenceInputDType == 0) {`
`1077`	`1077`	`if (clustererNNShadow.mNnInferenceOutputDType == 0) {`
`1078`	`1078`	`(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);`
`@@ -1086,9 +1086,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`1086`	`1086`	`(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);`
`1087`	`1087`	`}`
`1088`	`1088`	`}`
`1089`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }`
	`1089`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane + 1]->Stop(); }`
`1090`	`1090`	`if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {`
`1091`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }`
	`1091`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane + 2]->Start(); }`
`1092`	`1092`	`if (clustererNNShadow.mNnInferenceInputDType == 0) {`
`1093`	`1093`	`if (clustererNNShadow.mNnInferenceOutputDType == 0) {`
`1094`	`1094`	`(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);`
`@@ -1102,7 +1102,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)`
`1102`	`1102`	`(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);`
`1103`	`1103`	`}`
`1104`	`1104`	`}`
`1105`		`- if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }`
	`1105`	`+ if(GetProcessingSettings().debugLevel >= 1 && (doGPU \|\| lane < 4)) { nnTimers[3*lane + 2]->Stop(); }`
`1106`	`1106`	`}`
`1107`	`1107`	`if (nn_settings.nnClusterizerVerbosity > 3) {`
`1108`	`1108`	`LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";`