@@ -611,6 +611,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
   }
 
+  if (doGPU && mIOPtrs.tpcZS) {
+    processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta;
+    WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
+  }
+  if (doGPU) {
+    WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
+  }
+
 #ifdef GPUCA_HAS_ONNX
   const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
   GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];
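Context for the two `WriteToConstantMemory` calls in the hunk above: the second argument is the byte offset of the target member inside the `processors()` structure, so a single member (or array of processors) can be refreshed in the device constant block without rewriting the whole structure. A minimal CUDA sketch of that offset-based update pattern; the structs and the `writeToConstantMemory` helper are hypothetical stand-ins, only `cudaMemcpyToSymbolAsync` is the real runtime call:

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for the processors()/processorsShadow() structures.
struct IOPtrs { const void* tpcZS; };
struct Processors { IOPtrs ioPtrs; /* ...more processors... */ };

__constant__ Processors gProcessorsConst; // device-side constant mirror

// Copy `size` bytes of `src` into the constant mirror at byte `offset`,
// asynchronously on `stream` (the core of a WriteToConstantMemory-style call).
static void writeToConstantMemory(size_t offset, const void* src, size_t size, cudaStream_t stream)
{
  cudaMemcpyToSymbolAsync(gProcessorsConst, src, size, offset, cudaMemcpyHostToDevice, stream);
}

int main()
{
  Processors hostShadow{};
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Refresh only the ioPtrs member, addressed by its offset within the struct:
  writeToConstantMemory(offsetof(Processors, ioPtrs), &hostShadow.ioPtrs, sizeof(hostShadow.ioPtrs), stream);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  printf("ioPtrs refreshed in constant memory\n");
  return 0;
}
```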
@@ -624,9 +632,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     }
     mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
       nnApplications[lane].init(nn_settings);
-      GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane];
-      GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
-
       if (nnApplications[lane].modelsUsed[0]) {
         SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId);
         (nnApplications[lane].model_class).setDeviceId(deviceId);
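The lane setup runs through `mRec->runParallelOuterLoop`, which applies the lambda once per lane, concurrently on GPU builds and serially otherwise. A self-contained stand-in for that contract (the real method belongs to the reconstruction class; this `std::thread` version is only a sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Illustrative stand-in for mRec->runParallelOuterLoop(doGPU, n, lambda):
// invoke f(0..n-1), concurrently when `parallel` is set, serially otherwise.
template <typename F>
void runParallelOuterLoop(bool parallel, uint32_t n, F&& f)
{
  if (!parallel) {
    for (uint32_t i = 0; i < n; i++) {
      f(i);
    }
    return;
  }
  std::vector<std::thread> pool;
  pool.reserve(n);
  for (uint32_t i = 0; i < n; i++) {
    pool.emplace_back([&f, i] { f(i); });
  }
  for (auto& t : pool) {
    t.join();
  }
}

int main()
{
  runParallelOuterLoop(true, 4, [](uint32_t lane) { printf("init lane %u\n", lane); });
  return 0;
}
```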
@@ -642,43 +647,32 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         (nnApplications[lane].model_reg_2).setDeviceId(deviceId);
         (nnApplications[lane].model_reg_2).initEnvironment();
       }
-      if (clustererNNShadow.nnClusterizerVerbosity < 3) {
+      if (nn_settings.nnClusterizerVerbosity < 3) {
         LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId;
       }
     });
     mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
       GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
       int32_t lane = sector % numLanes;
+      clustererNN.deviceId = deviceId;
+      clustererNN.mISector = sector;
+      clustererNN.nnClusterizerTotalClusters = maxClusters;
+      nnApplications[lane].initClusterizer(nn_settings, clustererNN);
       if (doGPU) {
         clustererNNShadow.deviceId = deviceId;
         clustererNNShadow.mISector = sector;
         clustererNNShadow.nnClusterizerTotalClusters = maxClusters;
         nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
-      } else {
-        // TODO: not sure if this part is needed at all
-        clustererNN.deviceId = deviceId;
-        clustererNN.mISector = sector;
-        clustererNN.nnClusterizerTotalClusters = maxClusters;
-        nnApplications[lane].initClusterizer(nn_settings, clustererNN);
       }
       AllocateRegisteredMemory(clustererNN.mMemoryId);
-      if (doGPU) {
-        WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane);
-        TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane);
-      }
     });
+    if (doGPU) {
+      WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
+    }
   }
 #endif
 
-  if (doGPU && mIOPtrs.tpcZS) {
-    processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta;
-    WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
-  }
-  if (doGPU) {
-    WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
-  }
-
   size_t nClsTotal = 0;
   ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get();
   ClusterNative* tmpNativeClusters = nullptr;
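On the GPU path, the per-sector loop above only fills the host and shadow objects; the upload then happens once for all `NSECTORS` clusterers via a single bulk `WriteToConstantMemory` tied to the `mEvents->init` event. The equivalence relies on the per-sector objects being contiguous in an array. A small host-only sketch of that idea, with all names hypothetical:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

constexpr int NSECTORS = 36;
// Hypothetical per-sector clusterer state.
struct Clusterer { int deviceId; int sector; int totalClusters; };

// Emulated constant-memory block holding one Clusterer per sector.
static unsigned char gConstBlock[sizeof(Clusterer) * NSECTORS];

int main()
{
  Clusterer shadow[NSECTORS];
  for (int s = 0; s < NSECTORS; s++) {
    shadow[s] = {0, s, 1000}; // fill the host-side shadow, as in the per-sector loop
  }
  // One bulk write covering all sectors yields the same bytes as NSECTORS
  // per-sector writes at offset s * sizeof(Clusterer), because the array is contiguous.
  std::memcpy(gConstBlock, shadow, sizeof(Clusterer) * NSECTORS);
  printf("uploaded %zu bytes in a single transfer\n", sizeof(Clusterer) * NSECTORS);
  return 0;
}
```

Batching the upload this way also leaves one synchronization point (the init event) rather than one transfer per sector and lane.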
@@ -961,7 +955,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         auto stop0 = std::chrono::high_resolution_clock::now();
         auto start1 = std::chrono::high_resolution_clock::now();
 
-        // nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType);
         if (clustererNNShadow.nnInferenceInputDType == 0) {
           if (clustererNNShadow.nnInferenceOutputDType == 0) {
             (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16);
@@ -975,6 +968,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32);
           }
         }
+
         if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
           runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels
         } else {
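The nested branches above select the model entry point from two integer dtype codes, one for the input buffer and one for the output buffer (code 0 appears to pick the 16-bit `_16` buffers). A compact, self-contained sketch of such a two-axis dispatch; the `Model` type, the `OrtFloat16` alias, and the buffer names are hypothetical:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the half-precision element type used by the _16 buffers.
using OrtFloat16 = uint16_t;

// Hypothetical model exposing one inference overload per input/output dtype pair.
struct Model {
  void inference(const OrtFloat16*, size_t n, OrtFloat16*) { printf("fp16 -> fp16 on %zu rows\n", n); }
  void inference(const OrtFloat16*, size_t n, float*) { printf("fp16 -> fp32 on %zu rows\n", n); }
  void inference(const float*, size_t n, OrtFloat16*) { printf("fp32 -> fp16 on %zu rows\n", n); }
  void inference(const float*, size_t n, float*) { printf("fp32 -> fp32 on %zu rows\n", n); }
};

int main()
{
  Model model;
  std::vector<OrtFloat16> in16(8), out16(8);
  std::vector<float> in32(8), out32(8);
  int inDType = 0, outDType = 1; // 0 appears to select the 16-bit buffers in the diff
  if (inDType == 0) {
    if (outDType == 0) {
      model.inference(in16.data(), in16.size(), out16.data());
    } else {
      model.inference(in16.data(), in16.size(), out32.data());
    }
  } else {
    if (outDType == 0) {
      model.inference(in32.data(), in32.size(), out16.data());
    } else {
      model.inference(in32.data(), in32.size(), out32.data());
    }
  }
  return 0;
}
```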