@@ -523,7 +523,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
523523 mPipelineNotifyCtx ->rec ->AllocateRegisteredForeignMemory (processors ()->tpcClusterer [iSector].mZSId , mRec );
524524 } else {
525525 AllocateRegisteredMemory (processors ()->tpcClusterer [iSector].mZSOffsetId );
526- AllocateRegisteredMemory (processors ()->tpcClusterer [iSector].mZSId );
526+ AllocateRegisteredMemory (processors ()->tpcClusterer [iSector].mZSId );
527527 }
528528 }
529529 } else {
@@ -611,6 +611,36 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
611611 RunTPCClusterizer_prepare (true ); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
612612 }
613613
614+ #ifdef GPUCA_HAS_ONNX // Configure each sector's NN clusterizer from the processing settings and allocate its registered memory
615+ uint32_t maxClusters = 0 ; // must start at 0: the previous -1 wraps to UINT32_MAX for uint32_t, so std::max below could never lower it
616+ for (uint32_t iSector = 0 ; iSector < NSECTORS; iSector++) {
617+ maxClusters = std::max (maxClusters, processors ()->tpcClusterer [iSector].mNMaxClusters ); // largest mNMaxClusters over all sectors
618+ }
619+ for (uint32_t iSector = 0 ; iSector < NSECTORS; iSector++) {
620+ GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [iSector];
621+ const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings ().nn ;
622+ clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression ;
623+ clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow ;
624+ clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad ;
625+ clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime ;
626+ clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData ;
627+ clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1 ) * (2 * nn_settings.nnClusterizerSizeInputPad + 1 ) * (2 * nn_settings.nnClusterizerSizeInputTime + 1 )) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0 ); // (2*row+1)*(2*pad+1)*(2*time+1) entries, plus 3 more when nnClusterizerAddIndexData is set
628+ clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode ;
629+ clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue ;
630+ clustererNN.nnClusterizerTotalClusters = maxClusters; // every sector receives the maximum computed above
631+ clustererNN.nnClassThreshold = nn_settings.nnClassThreshold ; // NOTE(review): stored untransformed here; later code in this function rewrites it in place with the inverse sigmoid - confirm that runs at most once per sector
632+ clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold ;
633+ if (nn_settings.nnClusterizerVerbosity < 0 ) { // negative clusterizer verbosity: fall back to the inference verbosity
634+ clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity ;
635+ } else {
636+ clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity ;
637+ }
638+ clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype .find (" 32" ) != std::string::npos; // true when the configured dtype string contains the searched substring
639+ GPUTPCNNClusterizerHost nnApplication (nn_settings, clustererNN);
640+ AllocateRegisteredMemory (clustererNN.mMemoryId );
641+ }
642+ #endif
643+
614644 if (doGPU && mIOPtrs .tpcZS ) {
615645 processorsShadow ()->ioPtrs .tpcZS = mInputsShadow ->mPzsMeta ;
616646 WriteToConstantMemory (RecoStep::TPCClusterFinding, (char *)&processors ()->ioPtrs - (char *)processors (), &processorsShadow ()->ioPtrs , sizeof (processorsShadow ()->ioPtrs ), mRec ->NStreams () - 1);
@@ -885,86 +915,59 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
885915
886916 // Setting some initial sizes, important for memory allocation
887917 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings ().nn ;
888- clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression ;
889- clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow ;
890- clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad ;
891- clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime ;
892- clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData ;
893- clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1 ) * (2 * nn_settings.nnClusterizerSizeInputPad + 1 ) * (2 * nn_settings.nnClusterizerSizeInputTime + 1 )) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0 );
894- clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode ;
895- clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue ;
896- clustererNN.nnClusterizerTotalClusters = clusterer.mNMaxClusterPerRow ;
897- if (nn_settings.nnClusterizerVerbosity < 0 ) {
898- clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity ;
899- } else {
900- clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity ;
901- }
902-
903918 int evalDtype = nn_settings.nnInferenceDtype .find (" 32" ) != std::string::npos;
904- clustererNN.nnClusterizerDtype = evalDtype;
905-
906- // Settings for the NN evaluation
907- clustererNN.nnClassThreshold = nn_settings.nnClassThreshold ;
908- clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold ;
909-
910- GPUTPCNNClusterizerHost nnApplication (nn_settings, clustererNN);
911-
912- if (fragment.index == 0 ){
913- AllocateRegisteredMemory (clustererNN.mMemoryId );
914- }
915-
916- if (clustererNN.nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
917- runKernel<GPUTPCCFDeconvolution>({GetGrid (clusterer.mPmemory ->counters .nPositions , lane), {iSector}});
918- DoDebugAndDump (RecoStep::TPCClusterFinding, 262144 << 4 , clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
919- }
920-
921- if (clustererNN.nnSigmoidTrafoClassThreshold ) {
922- // Inverse sigmoid transformation
923- clustererNN.nnClassThreshold = (float )std::log (clustererNN.nnClassThreshold / (1 .f - clustererNN.nnClassThreshold ));
924- }
925-
926- float time_clusterizer = 0 , time_fill = 0 ;
927-
928- for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNN.nnClusterizerBatchedMode ); batch++) {
929- uint batchStart = batch * clustererNN.nnClusterizerBatchedMode ;
930- size_t iSize = CAMath::Min ((uint)clustererNN.nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
931-
932- auto start0 = std::chrono::high_resolution_clock::now ();
933- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , batchStart); // Filling the data
934-
935- auto stop0 = std::chrono::high_resolution_clock::now ();
936- auto start1 = std::chrono::high_resolution_clock::now ();
937- nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNN.modelProbabilities , evalDtype);
938- if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
939- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , batchStart); // Assigning class labels
940- } else {
941- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , batchStart); // Assigning class labels
942- }
943919
944- if (!clustererNN.nnClusterizerUseCfRegression ) {
945- nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNN.outputDataReg1 , evalDtype);
946- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , batchStart); // Running the NN for regression class 1
947- if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.reg_model_paths .size () > 1 ) {
948- nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNN.outputDataReg2 , evalDtype);
949- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , batchStart); // Running the NN for regression class 2
950- }
951- }
952- auto stop1 = std::chrono::high_resolution_clock::now ();
953-
954- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
955- time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
956- }
957-
958- auto start1 = std::chrono::high_resolution_clock::now ();
959- if (clustererNN.nnClusterizerUseCfRegression ) {
960- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0 , 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
961- }
962- auto stop1 = std::chrono::high_resolution_clock::now ();
963- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
964-
965- if (clustererNN.nnClusterizerVerbosity < 3 ) {
966- LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters --> " << clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
967- }
920+ GPUTPCNNClusterizerHost nnApplication (nn_settings, clustererNN);
921+
922+ if (clustererNN.nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
923+ runKernel<GPUTPCCFDeconvolution>({GetGrid (clusterer.mPmemory ->counters .nPositions , lane), {iSector}});
924+ DoDebugAndDump (RecoStep::TPCClusterFinding, 262144 << 4 , clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
925+ }
926+
927+ if (clustererNN.nnSigmoidTrafoClassThreshold ) {
928+ // Inverse sigmoid transformation
929+ clustererNN.nnClassThreshold = (float )std::log (clustererNN.nnClassThreshold / (1 .f - clustererNN.nnClassThreshold ));
930+ }
931+
932+ float time_clusterizer = 0 , time_fill = 0 ;
933+ for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNN.nnClusterizerBatchedMode ); batch++) {
934+ uint batchStart = batch * clustererNN.nnClusterizerBatchedMode ;
935+ size_t iSize = CAMath::Min ((uint)clustererNN.nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
936+
937+ auto start0 = std::chrono::high_resolution_clock::now ();
938+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Filling the data
939+
940+ auto stop0 = std::chrono::high_resolution_clock::now ();
941+ auto start1 = std::chrono::high_resolution_clock::now ();
942+ nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNN.modelProbabilities , clustererNN.nnClusterizerDtype );
943+ if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
944+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
945+ } else {
946+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
947+ }
948+
949+ if (!clustererNN.nnClusterizerUseCfRegression ) {
950+ nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNN.outputDataReg1 , clustererNN.nnClusterizerDtype );
951+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 1
952+ if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.reg_model_paths .size () > 1 ) {
953+ nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNN.outputDataReg2 , clustererNN.nnClusterizerDtype );
954+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 2
955+ }
956+ }
957+ auto stop1 = std::chrono::high_resolution_clock::now ();
958+
959+ time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
960+ time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
961+ }
962+ auto start1 = std::chrono::high_resolution_clock::now ();
963+ if (clustererNN.nnClusterizerUseCfRegression ) {
964+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
965+ }
966+ auto stop1 = std::chrono::high_resolution_clock::now ();
967+ time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
968+ if (clustererNN.nnClusterizerVerbosity < 3 ) {
969+ LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters --> " << clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
970+ }
968971#else
969972 GPUFatal (" Project not compiled with neural network clusterization. Aborting." );
970973#endif
0 commit comments