@@ -630,23 +630,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
       nnApplications[lane].init(nn_settings);
       if (nnApplications[lane].modelsUsed[0]) {
-        SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
+        SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId);
         (nnApplications[lane].model_class).setDeviceId(deviceId);
         if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) {
           nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
         }
         (nnApplications[lane].model_class).initEnvironment();
       }
       if (nnApplications[lane].modelsUsed[1]) {
-        SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId);
+        SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2 * numLanes, &deviceId);
         (nnApplications[lane].model_reg_1).setDeviceId(deviceId);
         if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) {
           nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads);
         }
         (nnApplications[lane].model_reg_1).initEnvironment();
       }
       if (nnApplications[lane].modelsUsed[2]) {
-        SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId);
+        SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3 * numLanes, &deviceId);
         (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
         if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
           nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
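
The three SetONNXGPUStream changes above stop all models of a lane from sharing that lane's stream index: class, reg_1 and reg_2 get the offsets numLanes, 2 * numLanes and 3 * numLanes respectively, presumably leaving 0..numLanes-1 to the clusterizer kernels that already run on stream lane. A minimal standalone sketch of the resulting index layout (the numLanes value is hypothetical; not part of this commit):

    // Sketch only: the lane + k * numLanes offsets give every (lane, model) pair
    // a distinct stream index and never reuse the indices 0..numLanes-1.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
      const uint32_t numLanes = 4; // hypothetical value of nTPCClustererLanes
      for (uint32_t lane = 0; lane < numLanes; lane++) {
        std::printf("lane %u -> class: %u, reg_1: %u, reg_2: %u\n",
                    lane, lane + numLanes, lane + 2 * numLanes, lane + 3 * numLanes);
      }
      return 0;
    }
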
@@ -950,7 +950,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
       }
 
-      float time_clusterizer = 0, time_fill = 0;
+      float time_clusterizer = 0, time_fill = 0, time_networks = 0;
       for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) {
         uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode;
         size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
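
The new time_networks accumulator is filled further down; the batch loop itself is unchanged and simply slices the nClusters clusters into chunks of nnClusterizerBatchedMode, with a shorter final chunk. A standalone sketch of that arithmetic with made-up counts (not from the commit):

    // Sketch only: reproduces the batchStart / iSize arithmetic of the loop above.
    // With nClusters = 10 and a batch size of 4 this yields chunks of 4, 4 and 2.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
      const unsigned nClusters = 10;  // stand-in for counters.nClusters
      const unsigned batchedMode = 4; // stand-in for nnClusterizerBatchedMode
      const int nBatches = (int)std::ceil((float)nClusters / batchedMode);
      for (int batch = 0; batch < nBatches; batch++) {
        unsigned batchStart = batch * batchedMode;
        unsigned iSize = std::min(batchedMode, nClusters - batchStart);
        std::printf("batch %d: start %u, size %u\n", batch, batchStart, iSize);
      }
      return 0;
    }
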
@@ -961,6 +961,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
         auto start1 = std::chrono::high_resolution_clock::now();
 
+        // NN evaluations
         if (clustererNNShadow.nnInferenceInputDType == 0) {
           if (clustererNNShadow.nnInferenceOutputDType == 0) {
             (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16);
@@ -974,14 +975,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32);
           }
         }
-
-        if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
-        } else {
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
-        }
         if (!clustererNNShadow.nnClusterizerUseCfRegression) {
-          // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType);
           if (clustererNNShadow.nnInferenceInputDType == 0) {
             if (clustererNNShadow.nnInferenceOutputDType == 0) {
               (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16);
@@ -995,9 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
               (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32);
             }
           }
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1
           if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
-            // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType);
             if (clustererNNShadow.nnInferenceInputDType == 0) {
               if (clustererNNShadow.nnInferenceOutputDType == 0) {
                 (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16);
@@ -1011,11 +1003,26 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32);
               }
             }
-            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2
+          }
+        }
+
+        auto stopNNs = std::chrono::high_resolution_clock::now();
+
+        // Publishing kernels
+        if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
+        } else {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
+        }
+        if (!clustererNNShadow.nnClusterizerUseCfRegression) {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results
+          if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
+            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results
           }
         }
         auto stop1 = std::chrono::high_resolution_clock::now();
 
+        time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
         time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
         time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
       }
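
The added stopNNs timestamp splits each batch into a pure inference part and a kernel part, so the log further down can report the networks and clusterizer times separately. A minimal sketch of that split-timer pattern with generic callables (not the actual inference or kernel calls):

    // Sketch only: one start point feeding two accumulators, as in the batch loop above.
    // time_networks covers inference only; time_clusterizer covers inference plus kernels.
    #include <chrono>
    #include <functional>

    struct BatchTimers {
      double time_networks = 0.;
      double time_clusterizer = 0.;

      void runBatch(const std::function<void()>& runInference, const std::function<void()>& runPublishKernels)
      {
        auto start1 = std::chrono::high_resolution_clock::now();
        runInference(); // NN evaluations
        auto stopNNs = std::chrono::high_resolution_clock::now();
        runPublishKernels(); // label / regression publishing kernels
        auto stop1 = std::chrono::high_resolution_clock::now();
        time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
        time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
      }
    };

    int main()
    {
      BatchTimers t;
      t.runBatch([] { /* inference stand-in */ }, [] { /* kernel stand-in */ });
      return 0;
    }
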
@@ -1030,8 +1037,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
         acceptedClusters += clustererNNShadow.outputDataClass[i];
       }
-      LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+      LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << " s ; networks: " << time_networks << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
     }
+    TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane);
 #else
     GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
@@ -1132,6 +1140,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       }
     }
     for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
+      if (GetProcessingSettings().nn.applyNNclusterizer) {
+        GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+        nnApplication.model_class.release();
+        nnApplication.model_reg_1.release();
+        nnApplication.model_reg_2.release();
+      }
       if (transferRunning[i]) {
         ReleaseEvent(mEvents->stream[i], doGPU);
       }