4242#ifdef GPUCA_HAS_ONNX
4343#include " GPUTPCNNClusterizerKernels.h"
4444#include " GPUTPCNNClusterizerHost.h"
45+ // #include "ML/3rdparty/GPUORTFloat16.h"
4546#endif
4647
4748using namespace o2 ::gpu;
@@ -630,31 +631,39 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
630631 mRec ->runParallelOuterLoop (doGPU, numLanes, [&](uint32_t lane) {
631632 nnApplications[lane].init (nn_settings);
632633 if (nnApplications[lane].modelsUsed [0 ]) {
633- SetONNXGPUStream ((nnApplications[lane].model_class ).getSessionOptions (), lane, &deviceId);
634+ SetONNXGPUStream (* (nnApplications[lane].model_class ).getSessionOptions (), lane, &deviceId);
634635 (nnApplications[lane].model_class ).setDeviceId (deviceId);
635636 if (nnApplications[lane].model_class .getIntraOpNumThreads () > maxThreads) {
636637 nnApplications[lane].model_class .setIntraOpNumThreads (maxThreads);
637638 }
638639 (nnApplications[lane].model_class ).initEnvironment ();
640+ // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
641+ (nnApplications[lane].model_class ).initSession ();
639642 }
640643 if (nnApplications[lane].modelsUsed [1 ]) {
641- SetONNXGPUStream ((nnApplications[lane].model_reg_1 ).getSessionOptions (), lane, &deviceId);
644+ SetONNXGPUStream (* (nnApplications[lane].model_reg_1 ).getSessionOptions (), lane, &deviceId);
642645 (nnApplications[lane].model_reg_1 ).setDeviceId (deviceId);
643646 if (nnApplications[lane].model_reg_1 .getIntraOpNumThreads () > maxThreads) {
644647 nnApplications[lane].model_reg_1 .setIntraOpNumThreads (maxThreads);
645648 }
649+ // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
646650 (nnApplications[lane].model_reg_1 ).initEnvironment ();
651+ // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
652+ (nnApplications[lane].model_reg_1 ).initSession ();
647653 }
648654 if (nnApplications[lane].modelsUsed [2 ]) {
649- SetONNXGPUStream ((nnApplications[lane].model_reg_2 ).getSessionOptions (), lane, &deviceId);
655+ SetONNXGPUStream (* (nnApplications[lane].model_reg_2 ).getSessionOptions (), lane, &deviceId);
650656 (nnApplications[lane].model_reg_2 ).setDeviceId (deviceId);
651657 if (nnApplications[lane].model_reg_2 .getIntraOpNumThreads () > maxThreads) {
652658 nnApplications[lane].model_reg_2 .setIntraOpNumThreads (maxThreads);
653659 }
660+ // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
654661 (nnApplications[lane].model_reg_2 ).initEnvironment ();
662+ // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
663+ (nnApplications[lane].model_reg_2 ).initSession ();
655664 }
656665 if (nn_settings.nnClusterizerVerbosity < 3 ) {
657- LOG (info) << " Allocated ONNX stream for lane " << lane << " and device " << deviceId;
666+ LOG (info) << " (ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
658667 }
659668 });
660669 mRec ->runParallelOuterLoop (doGPU, NSECTORS, [&](uint32_t sector) {
@@ -957,9 +966,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
957966
958967 auto start0 = std::chrono::high_resolution_clock::now ();
959968 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid (iSize * clustererNNShadow.nnClusterizerElementSize , lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType , withMC, batchStart); // Filling the data
960- auto stop0 = std::chrono::high_resolution_clock::now ();
969+ // auto stop0 = std::chrono::high_resolution_clock::now();
961970
962- auto start1 = std::chrono::high_resolution_clock::now ();
971+ // auto start1 = std::chrono::high_resolution_clock::now();
963972
964973 // NN evaluations
965974 if (clustererNNShadow.nnInferenceInputDType == 0 ) {
@@ -1006,7 +1015,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10061015 }
10071016 }
10081017
1009- auto stopNNs = std::chrono::high_resolution_clock::now ();
1018+ // auto stopNNs = std::chrono::high_resolution_clock::now();
10101019
10111020 // Publishing kernels
10121021 if (nnApplication.model_class .getNumOutputNodes ()[0 ][1 ] == 1 ) {
@@ -1020,25 +1029,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
10201029 runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid (iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType , withMC, batchStart); // Publishing class 2 regression results
10211030 }
10221031 }
1023- auto stop1 = std::chrono::high_resolution_clock::now ();
10241032
1025- time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count () / 1e9 ;
1026- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
1027- time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count () / 1e9 ;
1028- }
1029- if (clustererNNShadow.nnClusterizerUseCfRegression ) {
1030- auto start1 = std::chrono::high_resolution_clock::now ();
1031- runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid (clusterer.mPmemory ->counters .nClusters , lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType , withMC, 0 ); // Running the CF regression kernel - no batching needed: batchStart = 0
1032- auto stop1 = std::chrono::high_resolution_clock::now ();
1033- time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count () / 1e9 ;
1034- }
1035- if (clustererNNShadow.nnClusterizerVerbosity < 3 ) {
1036- int acceptedClusters = 0 ;
1037- for (size_t i = 0 ; i < clusterer.mPmemory ->counters .nClusters ; ++i) {
1038- acceptedClusters += clustererNNShadow.outputDataClass [i];
1039- }
1040- LOG (info) << " [NN CF] Apply NN (fragment " << fragment.index << " , lane: " << lane << " , sector: " << iSector << " ): filling data " << time_fill << " s ; networks: " << time_networks << " s ; clusterizer: " << time_clusterizer << " s ; " << clusterer.mPmemory ->counters .nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t )clusterer.mPmemory ->counters .nClusters / (time_fill + time_clusterizer) << " clusters/s" ;
1033+ // for(int i = 0; i < iSize; ++i) {
1034+ // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) {
1035+ // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data.";
1036+ // // std::string input = "[";
1037+ // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){
1038+ // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", ";
1039+ // // }
1040+ // // input += "]";
1041+ // // LOG(info) << "Input is: " << input;
1042+ // }
1043+ // }
1044+
1045+ // auto stop1 = std::chrono::high_resolution_clock::now();
1046+
1047+ // time_networks += std::chrono::duration_cast<std::chrono::nanoseconds>(stopNNs - start1).count() / 1e9;
1048+ // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1049+ // time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
10411050 }
1051+ // if (clustererNNShadow.nnClusterizerUseCfRegression) {
1052+ // auto start1 = std::chrono::high_resolution_clock::now();
1053+ // runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
1054+ // auto stop1 = std::chrono::high_resolution_clock::now();
1055+ // time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
1056+ // }
1057+ // if (clustererNNShadow.nnClusterizerVerbosity < 3) {
1058+ // int acceptedClusters = 0;
1059+ // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
1060+ // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) {
1061+ // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters;
1062+ // }
1063+ // acceptedClusters += clustererNNShadow.outputDataClass[i];
1064+ // }
1065+ // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
1066+ // }
10421067#else
10431068 GPUFatal (" Project not compiled with neural network clusterization. Aborting." );
10441069#endif
@@ -1139,12 +1164,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
11391164 }
11401165 }
11411166 for (int32_t i = 0 ; i < GetProcessingSettings ().nTPCClustererLanes ; i++) {
1142- if (GetProcessingSettings ().nn .applyNNclusterizer ) {
1143- GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1144- nnApplication.model_class .release ();
1145- nnApplication.model_reg_1 .release ();
1146- nnApplication.model_reg_2 .release ();
1147- }
1167+ // if (GetProcessingSettings().nn.applyNNclusterizer) {
1168+ // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
1169+ // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling );
1170+ // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling );
1171+ // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling );
1172+ // }
11481173 if (transferRunning[i]) {
11491174 ReleaseEvent (mEvents ->stream [i], doGPU);
11501175 }
0 commit comments