Skip to content

Commit e66efb1

Browse files
committed
Network now accepts clusters over all sectors
1 parent 713dd64 commit e66efb1

File tree

2 files changed

+53
-60
lines changed

2 files changed

+53
-60
lines changed

Common/ML/src/OrtInterface.cxx

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -233,10 +233,10 @@ void OrtModel::inference(I* input, size_t input_size, O* output)
233233
}
234234

235235
std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
236-
size_t outputSize = (int64_t)(inputShape[0] * mOutputShapes[0][1]);
236+
size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]);
237237
Ort::Value outputTensor = Ort::Value::CreateTensor<O>(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
238238

239-
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1);
239+
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here
240240
}
241241

242242
// Explicit instantiation of OrtModel::inference for OrtDataType::Float16_t input and float output buffers.
template void OrtModel::inference<OrtDataType::Float16_t, float>(OrtDataType::Float16_t*, size_t, float*);

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 51 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -888,9 +888,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
888888
mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
889889
uint32_t iSector = iSectorBase + lane;
890890
GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
891-
#ifdef GPUCA_HAS_ONNX
892-
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
893-
#endif
894891
GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
895892

896893
if (doGPU) {
@@ -912,62 +909,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
912909

913910
if (GetProcessingSettings().nn.applyNNclusterizer) {
914911
#ifdef GPUCA_HAS_ONNX
915-
916-
// Setting some initial sizes, important for memory allocation
912+
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
917913
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
918-
int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
919-
920-
GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
921-
922-
if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
923-
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
924-
DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
925-
}
926-
927-
if (clustererNN.nnSigmoidTrafoClassThreshold) {
928-
// Inverse sigmoid transformation
929-
clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold));
930-
}
931-
932-
float time_clusterizer = 0, time_fill = 0;
933-
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
934-
uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
935-
size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
936-
937-
auto start0 = std::chrono::high_resolution_clock::now();
938-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
939-
940-
auto stop0 = std::chrono::high_resolution_clock::now();
941-
auto start1 = std::chrono::high_resolution_clock::now();
942-
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
943-
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
944-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
945-
} else {
946-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
947-
}
948-
949-
if (!clustererNN.nnClusterizerUseCfRegression) {
950-
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
951-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
952-
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
953-
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
954-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
955-
}
956-
}
957-
auto stop1 = std::chrono::high_resolution_clock::now();
958-
959-
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
960-
time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
961-
}
962-
auto start1 = std::chrono::high_resolution_clock::now();
963-
if (clustererNN.nnClusterizerUseCfRegression) {
964-
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
965-
}
966-
auto stop1 = std::chrono::high_resolution_clock::now();
967-
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
968-
if (clustererNN.nnClusterizerVerbosity < 3) {
969-
LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
970-
}
914+
GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
915+
916+
if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
917+
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
918+
DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
919+
}
920+
921+
float time_clusterizer = 0, time_fill = 0;
922+
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
923+
uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
924+
size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
925+
926+
auto start0 = std::chrono::high_resolution_clock::now();
927+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
928+
929+
auto stop0 = std::chrono::high_resolution_clock::now();
930+
auto start1 = std::chrono::high_resolution_clock::now();
931+
nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
932+
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
933+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
934+
} else {
935+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
936+
}
937+
938+
if (!clustererNN.nnClusterizerUseCfRegression) {
939+
nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
940+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
941+
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
942+
nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
943+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
944+
}
945+
}
946+
auto stop1 = std::chrono::high_resolution_clock::now();
947+
948+
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
949+
time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
950+
}
951+
auto start1 = std::chrono::high_resolution_clock::now();
952+
if (clustererNN.nnClusterizerUseCfRegression) {
953+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
954+
}
955+
auto stop1 = std::chrono::high_resolution_clock::now();
956+
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
957+
if (clustererNN.nnClusterizerVerbosity < 3) {
958+
int acceptedClusters = 0;
959+
for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) {
960+
acceptedClusters += clustererNN.outputDataClass[i];
961+
}
962+
LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
963+
}
971964
#else
972965
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
973966
#endif

0 commit comments

Comments
 (0)