@@ -665,7 +665,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
665665 nnTimers[11 ] = &getTimer<GPUTPCNNClusterizer, 11 >(" GPUTPCNNClusterizer_ONNXRegression2_2_" , 11 );
666666 }
667667
668- mRec -> runParallelOuterLoop (doGPU, numLanes, [&]( uint32_t lane) {
668+ for ( int32_t lane = 0 ; lane < numLanes; lane++ ) {
669669 nnApplications[lane].init (nn_settings, GetProcessingSettings ().deterministicGPUReconstruction );
670670 if (nnApplications[lane].mModelsUsed [0 ]) {
671671 SetONNXGPUStream (*(nnApplications[lane].mModelClass ).getSessionOptions (), lane, &deviceId);
@@ -706,10 +706,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
706706 // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
707707 (nnApplications[lane].mModelReg2 ).initSession ();
708708 }
709- if (nn_settings.nnClusterizerVerbosity < 3 ) {
709+ if (nn_settings.nnClusterizerVerbosity > 0 ) {
710710 LOG (info) << " (ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
711711 }
712- }) ;
712+ };
713713 for (int32_t sector = 0 ; sector < NSECTORS; sector++) {
714714 GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [sector];
715715 GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow ()->tpcNNClusterer [sector] : clustererNN;
@@ -724,12 +724,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
724724 clustererNNShadow.mNnClusterizerTotalClusters = processors ()->tpcClusterer [lane].mNMaxClusters ;
725725 nnApplications[lane].initClusterizer (nn_settings, clustererNNShadow);
726726 }
727+ if (nn_settings.nnClusterizerVerbosity > 2 ) {
728+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Processor initialized. Sector " << sector << " , lane " << lane << " , max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
729+ }
727730 AllocateRegisteredMemory (clustererNN.mMemoryId );
731+ if (nn_settings.nnClusterizerVerbosity > 2 ) {
732+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Memory registered for memoryId " << clustererNN.mMemoryId << " (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
733+ }
728734 // nnApplications[lane].createBoundary(clustererNNShadow);
729735 // nnApplications[lane].createIndexLookup(clustererNNShadow);
730736 }
731737 if (doGPU) {
738+ if (nn_settings.nnClusterizerVerbosity > 2 ) {
739+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Writing to constant memory..." ;
740+ }
732741 WriteToConstantMemory (RecoStep::TPCClusterFinding, (char *)&processors ()->tpcNNClusterer - (char *)processors (), &processorsShadow ()->tpcNNClusterer , sizeof (GPUTPCNNClusterizer) * NSECTORS, mRec ->NStreams () - 1, &mEvents->init);
742+ if (nn_settings.nnClusterizerVerbosity > 2 ) {
743+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Writing to constant memory done" ;
744+ }
733745 }
734746 }
735747#endif
@@ -1010,9 +1022,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10101022 }
10111023
10121024 // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
1025+ if (nn_settings.nnClusterizerVerbosity > 2 ) {
1026+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Starting loop over batched data. clustererNNShadow.mNnClusterizerBatchedMode=" << clustererNNShadow.mNnClusterizerBatchedMode << " , numLoops=" << std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNNShadow.mNnClusterizerBatchedMode ) << " , numClusters=" << clusterer.mPmemory ->counters .nClusters << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1027+ }
10131028 for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNNShadow.mNnClusterizerBatchedMode ); batch++) {
1029+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1030+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Start. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1031+ }
10141032 uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode ;
1015- size_t iSize = CAMath::Min ((uint)clustererNNShadow.mNnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
1033+ size_t iSize = CAMath::Min ((uint)clustererNNShadow.mNnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart - 1 ));
10161034
10171035 // Filling the data
10181036 if (mRec ->IsGPU () || GetProcessingSettings ().nn .nnClusterizerForceGpuInputFill ) {
@@ -1022,9 +1040,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10221040 // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
10231041 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , propagateMCLabels, batchStart);
10241042 }
1043+ if (doGPU) { // This is to make sure that the network does not start the evaluation before all data is filled
1044+ SynchronizeStream (lane);
1045+ }
1046+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1047+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done filling data. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1048+ }
10251049
10261050 if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags ) {
10271051 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , propagateMCLabels, batchStart); // Publishing the deconvolution flags
1052+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1053+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done setting deconvolution flags. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1054+ }
10281055 }
10291056
10301057 // NN evaluations
@@ -1044,6 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10441071 }
10451072 }
10461073 if (GetProcessingSettings ().debugLevel >= 1 && doGPU) { nnTimers[3 *lane]->Stop (); }
1074+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1075+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done with NN classification inference. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1076+ }
10471077 }
10481078 if (!clustererNNShadow.mNnClusterizerUseCfRegression ) {
10491079 if (GetProcessingSettings ().debugLevel >= 1 && doGPU) { nnTimers[3 *lane + 1 ]->Start (); }
@@ -1078,6 +1108,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10781108 }
10791109 if (GetProcessingSettings ().debugLevel >= 1 && doGPU) { nnTimers[3 *lane + 2 ]->Stop (); }
10801110 }
1111+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1112+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done with NN regression inference. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1113+ }
10811114 }
10821115
10831116 // Publishing kernels for class labels and regression results
@@ -1092,6 +1125,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10921125 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType , propagateMCLabels, batchStart); // Publishing class 2 regression results
10931126 }
10941127 }
1128+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1129+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done publishing. Loop=" << batch << " . (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1130+ }
10951131 }
10961132
10971133 if (clustererNNShadow.mNnClusterizerUseCfRegression ) {
@@ -1100,6 +1136,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11001136 }
11011137 DoDebugAndDump (RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
11021138 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType , propagateMCLabels, 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
1139+ if (nn_settings.nnClusterizerVerbosity > 3 ) {
1140+ LOG (info) << " (NNCLUS, GPUChainTrackingClusterizer, this=" << this << " ) Done with CF regression. (clustererNN=" << &clustererNN << " , clustererNNShadow=" << &clustererNNShadow << " )" ;
1141+ }
11031142 }
11041143#else
11051144 GPUFatal (" Project not compiled with neural network clusterization. Aborting." );
@@ -1202,7 +1241,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
12021241 }
12031242 for (int32_t i = 0 ; i < GetProcessingSettings ().nTPCClustererLanes ; i++) {
12041243#ifdef GPUCA_HAS_ONNX
1205- if (GetProcessingSettings ().nn .applyNNclusterizer ) {
1244+ if (GetProcessingSettings ().nn .applyNNclusterizer && GetProcessingSettings (). nn . nnClusterizerVerbosity > 0 ) {
12061245 LOG (info) << " (ORT) Environment releasing..." ;
12071246 GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
12081247 nnApplication.mModelClass .release (true );
0 commit comments