Skip to content

Commit 5ef448c

Browse files
committed
Adding first version of kernel timers
1 parent fa3dd7b commit 5ef448c

File tree

1 file changed

+40
-0
lines changed

1 file changed

+40
-0
lines changed

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,34 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
641641
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
642642
GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
643643

644+
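// At most 4 clusterizer lanes are supported: the timer arrays below hold 3 NN-inference timers per lane (12 in total, indexed as 3 * lane + model) plus one fill-input and one publishing timer per lane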
645+
HighResTimer* nnTimers[12] = {
646+
&getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_ONNXClassification_0_", 0),
647+
&getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_ONNXRegression_1_", 1),
648+
&getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_ONNXRegression2_2_", 2),
649+
&getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_ONNXClassification_0_", 3),
650+
&getTimer<GPUTPCNNClusterizer, 4>("GPUTPCNNClusterizer_ONNXRegression_1_", 4),
651+
&getTimer<GPUTPCNNClusterizer, 5>("GPUTPCNNClusterizer_ONNXRegression2_2_", 5),
652+
&getTimer<GPUTPCNNClusterizer, 6>("GPUTPCNNClusterizer_ONNXClassification_0_", 6),
653+
&getTimer<GPUTPCNNClusterizer, 7>("GPUTPCNNClusterizer_ONNXRegression_1_", 7),
654+
&getTimer<GPUTPCNNClusterizer, 8>("GPUTPCNNClusterizer_ONNXRegression2_2_", 8),
655+
&getTimer<GPUTPCNNClusterizer, 9>("GPUTPCNNClusterizer_ONNXClassification_0_", 9),
656+
&getTimer<GPUTPCNNClusterizer, 10>("GPUTPCNNClusterizer_ONNXRegression_1_", 10),
657+
&getTimer<GPUTPCNNClusterizer , 11>("GPUTPCNNClusterizer_ONNXRegression2_2_", 11)
658+
};
659+
HighResTimer* nnFillInputTimers[4] {
660+
&getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_fillInputNNSingleElement_0_", 0),
661+
&getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_fillInputNNSingleElement_1_", 1),
662+
&getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_fillInputNNSingleElement_2_", 2),
663+
&getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3)
664+
};
665+
HighResTimer* nnPublishingTimers[4] {
666+
&getTimer<GPUTPCNNClusterizer, 0>("GPUTPCNNClusterizer_publish_0_", 0),
667+
&getTimer<GPUTPCNNClusterizer, 1>("GPUTPCNNClusterizer_publish_1_", 1),
668+
&getTimer<GPUTPCNNClusterizer, 2>("GPUTPCNNClusterizer_publish_2_", 2),
669+
&getTimer<GPUTPCNNClusterizer, 3>("GPUTPCNNClusterizer_publish_3_", 3)
670+
};
671+
644672
if (GetProcessingSettings().nn.applyNNclusterizer) {
645673
int32_t deviceId = -1;
646674
int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
@@ -1001,7 +1029,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10011029
size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
10021030

10031031
// auto start0 = std::chrono::high_resolution_clock::now();
1032+
if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Start(); }
10041033
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data
1034+
if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Stop(); }
10051035

10061036
if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
10071037
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data
@@ -1011,6 +1041,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10111041
// auto start1 = std::chrono::high_resolution_clock::now();
10121042

10131043
// NN evaluations
1044+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Start(); }
10141045
if (clustererNNShadow.mNnInferenceInputDType == 0) {
10151046
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10161047
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
@@ -1024,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10241055
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
10251056
}
10261057
}
1058+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Stop(); }
10271059
if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
1060+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Start(); }
10281061
if (clustererNNShadow.mNnInferenceInputDType == 0) {
10291062
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10301063
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1038,7 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10381071
(nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
10391072
}
10401073
}
1074+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Stop(); }
10411075
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
1076+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Start(); }
10421077
if (clustererNNShadow.mNnInferenceInputDType == 0) {
10431078
if (clustererNNShadow.mNnInferenceOutputDType == 0) {
10441079
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1052,12 +1087,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10521087
(nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
10531088
}
10541089
}
1090+
if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Stop(); }
10551091
}
10561092
}
10571093

10581094
// auto stopNNs = std::chrono::high_resolution_clock::now();
10591095

10601096
// Publishing kernels
1097+
if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); }
10611098
if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
10621099
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels
10631100
} else {
@@ -1069,6 +1106,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10691106
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
10701107
}
10711108
}
1109+
if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); }
10721110

10731111
// for(int i = 0; i < iSize; ++i) {
10741112
// if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) {
@@ -1090,7 +1128,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10901128
}
10911129
if (clustererNNShadow.mNnClusterizerUseCfRegression) {
10921130
// auto start1 = std::chrono::high_resolution_clock::now();
1131+
if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); }
10931132
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1133+
if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); }
10941134
// auto stop1 = std::chrono::high_resolution_clock::now();
10951135
// time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
10961136
}

0 commit comments

Comments
 (0)