@@ -622,28 +622,45 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
622622 }
623623 for (uint32_t iSector = 0 ; iSector < NSECTORS; iSector++) {
624624 GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [iSector];
625- clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression ;
626- clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow ;
627- clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad ;
628- clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime ;
629- clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData ;
630- clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1 ) * (2 * nn_settings.nnClusterizerSizeInputPad + 1 ) * (2 * nn_settings.nnClusterizerSizeInputTime + 1 )) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0 );
631- clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode ;
632- clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue ;
633- clustererNN.nnClusterizerTotalClusters = maxClusters;
634- clustererNN.nnClassThreshold = nn_settings.nnClassThreshold ;
635- clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold ;
636- if (clustererNN.nnSigmoidTrafoClassThreshold ) {
637- clustererNN.nnClassThreshold = (float )std::log (clustererNN.nnClassThreshold / (1 .f - clustererNN.nnClassThreshold ));
625+ GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow ()->tpcNNClusterer [iSector] : clustererNN;
626+ clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression ;
627+ clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow ;
628+ clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad ;
629+ clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime ;
630+ clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData ;
631+ clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1 ) * (2 * nn_settings.nnClusterizerSizeInputPad + 1 ) * (2 * nn_settings.nnClusterizerSizeInputTime + 1 )) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0 );
632+ clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode ;
633+ clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue ;
634+ clustererNNShadow.nnClusterizerTotalClusters = maxClusters;
635+ clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold ;
636+ clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold ;
637+ if (clustererNNShadow.nnSigmoidTrafoClassThreshold ) {
638+ clustererNNShadow.nnClassThreshold = (float )std::log (clustererNNShadow.nnClassThreshold / (1 .f - clustererNNShadow.nnClassThreshold ));
638639 }
639640 if (nn_settings.nnClusterizerVerbosity < 0 ) {
640- clustererNN .nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity ;
641+ clustererNNShadow .nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity ;
641642 } else {
642- clustererNN .nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity ;
643+ clustererNNShadow .nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity ;
643644 }
644- clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType .find (" 32" ) != std::string::npos;
645- nnApplication.initClusterizer (nn_settings, clustererNN);
646- AllocateRegisteredMemory (clustererNN.mMemoryId );
645+ clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType .find (" 32" ) != std::string::npos;
646+ nnApplication.initClusterizer (nn_settings, clustererNNShadow);
647+ // if (doGPU) {
648+ // std::vector<int32_t> pointerSizes = clustererNNShadow.pointerSizes();
649+ // // FIXME: These are for sure not needed. The arrays are empty at this point, only the space needs to be reserved. Is this already handled by computePointerWithAlignment?
650+ // // Once a GPU is available, everything should be done on the GPU for now.
651+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData32, clustererNN.inputData32, pointerSizes[0], lane, true);
652+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData16, clustererNN.inputData16, pointerSizes[1], lane, true);
653+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataClass, clustererNN.outputDataClass, pointerSizes[2], lane, true);
654+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.modelProbabilities, clustererNN.modelProbabilities, pointerSizes[3], lane, true);
655+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg1, clustererNN.outputDataReg1, pointerSizes[4], lane, true);
656+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg2, clustererNN.outputDataReg2, pointerSizes[5], lane, true);
657+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.peakPositions, clustererNN.peakPositions, pointerSizes[6], lane, true);
658+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.clusterFlags, clustererNN.clusterFlags, pointerSizes[7], lane, true);
659+ // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.centralCharges, clustererNN.centralCharges, pointerSizes[8], lane, true);
660+ // } else {
661+ // AllocateRegisteredMemory(clustererNNShadow.mMemoryId);
662+ // }
663+ AllocateRegisteredMemory (clustererNNShadow.mMemoryId );
647664 }
648665 }
649666#endif
@@ -917,41 +934,43 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
917934 if (GetProcessingSettings ().nn .applyNNclusterizer ) {
918935#ifdef GPUCA_HAS_ONNX
919936 GPUTPCNNClusterizer& clustererNN = processors ()->tpcNNClusterer [iSector];
937+ GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow ()->tpcNNClusterer [iSector] : clustererNN;
920938 const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings ().nn ;
921- GPUTPCNNClusterizerHost nnApplication (nn_settings, lane);
922- SetONNXGPUStream (nnApplication.model_class .updateSessionOptions (), lane);
923- SetONNXGPUStream (nnApplication.model_reg_1 .updateSessionOptions (), lane);
924- SetONNXGPUStream (nnApplication.model_reg_2 .updateSessionOptions (), lane);
939+ GPUTPCNNClusterizerHost nnApplication (nn_settings, lane); // FIXME: This needs to be the deviceID. If that is the lane, then this line is correct
940+ int32_t deviceId = -1 ;
941+ SetONNXGPUStream (nnApplication.model_class .updateSessionOptions (), lane, &deviceId);
942+ SetONNXGPUStream (nnApplication.model_reg_1 .updateSessionOptions (), lane, &deviceId);
943+ SetONNXGPUStream (nnApplication.model_reg_2 .updateSessionOptions (), lane, &deviceId);
925944 int withMC = (doGPU && propagateMCLabels);
926945
927- if (clustererNN .nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
946+ if (clustererNNShadow .nnClusterizerUseCfRegression || (int )(nn_settings.nnClusterizerApplyCfDeconvolution )) {
928947 runKernel<GPUTPCCFDeconvolution>({GetGrid (clusterer.mPmemory ->counters .nPositions , lane), {iSector}});
929948 DoDebugAndDump (RecoStep::TPCClusterFinding, 262144 << 4 , clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile , " Split Charges" );
930949 }
931950
932951 float time_clusterizer = 0 , time_fill = 0 ;
933- for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNN .nnClusterizerBatchedMode ); batch++) {
934- uint batchStart = batch * clustererNN .nnClusterizerBatchedMode ;
935- size_t iSize = CAMath::Min ((uint)clustererNN .nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
952+ for (int batch = 0 ; batch < std::ceil ((float )clusterer.mPmemory ->counters .nClusters / clustererNNShadow .nnClusterizerBatchedMode ); batch++) {
953+ uint batchStart = batch * clustererNNShadow .nnClusterizerBatchedMode ;
954+ size_t iSize = CAMath::Min ((uint)clustererNNShadow .nnClusterizerBatchedMode , (uint)(clusterer.mPmemory ->counters .nClusters - batchStart));
936955
937956 auto start0 = std::chrono::high_resolution_clock::now ();
938- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, batchStart); // Filling the data
957+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, batchStart); // Filling the data
939958
940959 auto stop0 = std::chrono::high_resolution_clock::now ();
941960 auto start1 = std::chrono::high_resolution_clock::now ();
942- nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNN .modelProbabilities , clustererNN .nnInferenceInputDType );
961+ nnApplication.networkInference (nnApplication.model_class , clustererNN, iSize, clustererNNShadow .modelProbabilities , clustererNNShadow .nnInferenceInputDType , deviceId );
943962 if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
944- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, batchStart); // Assigning class labels
963+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, batchStart); // Assigning class labels
945964 } else {
946- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, batchStart); // Assigning class labels
965+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, batchStart); // Assigning class labels
947966 }
948967
949- if (!clustererNN .nnClusterizerUseCfRegression ) {
950- nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNN .outputDataReg1 , clustererNN .nnInferenceInputDType );
951- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, batchStart); // Running the NN for regression class 1
968+ if (!clustererNNShadow .nnClusterizerUseCfRegression ) {
969+ nnApplication.networkInference (nnApplication.model_reg_1 , clustererNN, iSize, clustererNNShadow .outputDataReg1 , clustererNNShadow .nnInferenceInputDType , deviceId );
970+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, batchStart); // Running the NN for regression class 1
952971 if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] > 1 && nnApplication.model_reg_2 .isInitialized ()) {
953- nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNN .outputDataReg2 , clustererNN .nnInferenceInputDType );
954- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, batchStart); // Running the NN for regression class 2
972+ nnApplication.networkInference (nnApplication.model_reg_2 , clustererNN, iSize, clustererNNShadow .outputDataReg2 , clustererNNShadow .nnInferenceInputDType , deviceId );
973+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, batchStart); // Running the NN for regression class 2
955974 }
956975 }
957976 auto stop1 = std::chrono::high_resolution_clock::now ();
@@ -960,15 +979,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
960979 time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
961980 }
962981 auto start1 = std::chrono::high_resolution_clock::now ();
963- if (clustererNN .nnClusterizerUseCfRegression ) {
964- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane), krnlRunRangeNone}, iSector, clustererNN .nnInferenceInputDType , withMC, 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
982+ if (clustererNNShadow .nnClusterizerUseCfRegression ) {
983+ runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane), krnlRunRangeNone}, iSector, clustererNNShadow .nnInferenceInputDType , withMC, 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
965984 }
966985 auto stop1 = std::chrono::high_resolution_clock::now ();
967986 time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
968- if (clustererNN .nnClusterizerVerbosity < 3 ) {
987+ if (clustererNNShadow .nnClusterizerVerbosity < 3 ) {
969988 int acceptedClusters = 0 ;
970989 for (size_t i = 0 ; i < clusterer.mPmemory ->counters .nClusters ; ++i) {
971- acceptedClusters += clustererNN .outputDataClass [i];
990+ acceptedClusters += clustererNNShadow .outputDataClass [i];
972991 }
973992 LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
974993 }
0 commit comments