@@ -888,9 +888,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
888888 mRec ->runParallelOuterLoop (doGPU, maxLane, [&](uint32_t lane) {
889889 uint32_t iSector = iSectorBase + lane;
890890 GPUTPCClusterFinder& clusterer = processors ()->tpcClusterer [iSector];
891- #ifdef GPUCA_HAS_ONNX
892- GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [iSector];
893- #endif
894891 GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow ()->tpcClusterer [iSector] : clusterer;
895892
896893 if (doGPU) {
@@ -912,62 +909,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
912909
913910 if (GetProcessingSettings ().nn .applyNNclusterizer ) {
914911#ifdef GPUCA_HAS_ONNX
915-
916- // Setting some initial sizes, important for memory allocation
912+ GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [iSector];
917913 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings ().nn ;
918- int evalDtype = nn_settings.nnInferenceDtype .find (" 32" ) != std::string::npos;
919-
920- GPUTPCNNClusterizerHost nnApplication (nn_settings, clustererNN);
921-
922- if (clustererNN.nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
923- runKernel<GPUTPCCFDeconvolution>({GetGrid (clusterer.mPmemory ->counters .nPositions , lane), {iSector}});
924- DoDebugAndDump (RecoStep::TPCClusterFinding, 262144 << 4 , clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
925- }
926-
927- if (clustererNN.nnSigmoidTrafoClassThreshold ) {
928- // Inverse sigmoid transformation
929- clustererNN.nnClassThreshold = (float )std::log (clustererNN.nnClassThreshold / (1 .f - clustererNN.nnClassThreshold ));
930- }
931-
932- float time_clusterizer = 0 , time_fill = 0 ;
933- for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNN.nnClusterizerBatchedMode ); batch++) {
934- uint batchStart = batch * clustererNN.nnClusterizerBatchedMode ;
935- size_t iSize = CAMath::Min ((uint)clustererNN.nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
936-
937- auto start0 = std::chrono::high_resolution_clock::now ();
938- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Filling the data
939-
940- auto stop0 = std::chrono::high_resolution_clock::now ();
941- auto start1 = std::chrono::high_resolution_clock::now ();
942- nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNN.modelProbabilities , clustererNN.nnClusterizerDtype );
943- if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
944- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
945- } else {
946- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
947- }
948-
949- if (!clustererNN.nnClusterizerUseCfRegression ) {
950- nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNN.outputDataReg1 , clustererNN.nnClusterizerDtype );
951- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 1
952- if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.reg_model_paths .size () > 1 ) {
953- nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNN.outputDataReg2 , clustererNN.nnClusterizerDtype );
954- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 2
955- }
956- }
957- auto stop1 = std::chrono::high_resolution_clock::now ();
958-
959- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
960- time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
961- }
962- auto start1 = std::chrono::high_resolution_clock::now ();
963- if (clustererNN.nnClusterizerUseCfRegression ) {
964- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
965- }
966- auto stop1 = std::chrono::high_resolution_clock::now ();
967- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
968- if (clustererNN.nnClusterizerVerbosity < 3 ) {
969- LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters --> " << clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
970- }
914+ GPUTPCNNClusterizerHost nnApplication (nn_settings, clustererNN);
915+
916+ if (clustererNN.nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
917+ runKernel<GPUTPCCFDeconvolution>({GetGrid (clusterer.mPmemory ->counters .nPositions , lane), {iSector}});
918+ DoDebugAndDump (RecoStep::TPCClusterFinding, 262144 << 4 , clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
919+ }
920+
921+ float time_clusterizer = 0 , time_fill = 0 ;
922+ for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNN.nnClusterizerBatchedMode ); batch++) {
923+ uint batchStart = batch * clustererNN.nnClusterizerBatchedMode ;
924+ size_t iSize = CAMath::Min ((uint)clustererNN.nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
925+
926+ auto start0 = std::chrono::high_resolution_clock::now ();
927+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Filling the data
928+
929+ auto stop0 = std::chrono::high_resolution_clock::now ();
930+ auto start1 = std::chrono::high_resolution_clock::now ();
931+ nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNN.modelProbabilities , clustererNN.nnClusterizerDtype );
932+ if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
933+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
934+ } else {
935+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Assigning class labels
936+ }
937+
938+ if (!clustererNN.nnClusterizerUseCfRegression ) {
939+ nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNN.outputDataReg1 , clustererNN.nnClusterizerDtype );
940+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 1
941+ if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.reg_model_paths .size () > 1 ) {
942+ nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNN.outputDataReg2 , clustererNN.nnClusterizerDtype );
943+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , batchStart); // Running the NN for regression class 2
944+ }
945+ }
946+ auto stop1 = std::chrono::high_resolution_clock::now ();
947+
948+ time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
949+ time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
950+ }
951+ auto start1 = std::chrono::high_resolution_clock::now ();
952+ if (clustererNN.nnClusterizerUseCfRegression ) {
953+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors (), iSector, clustererNN.nnClusterizerDtype , 0 , 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
954+ }
955+ auto stop1 = std::chrono::high_resolution_clock::now ();
956+ time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
957+ if (clustererNN.nnClusterizerVerbosity < 3 ) {
958+ int acceptedClusters = 0 ;
959+ for (size_t i = 0 ; i < clusterer.mPmemory ->counters .nClusters ; ++i) {
960+ acceptedClusters += clustererNN.outputDataClass [i];
961+ }
962+ LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
963+ }
971964#else
972965 GPUFatal (" Project not compiled with neural network clusterization. Aborting." );
973966#endif
0 commit comments