Skip to content

Commit 9155cca

Browse files
committed
New version of clusterizer. Compiles locally, but segfaults in fillInput kernel. Testing with the CI now.
1 parent 08753dd commit 9155cca

18 files changed

+758
-765
lines changed

Common/ML/include/ML/3rdparty/GPUORTFloat16.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h
66
// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h
77

8+
#ifndef GPUORTFLOAT16_H
9+
#define GPUORTFLOAT16_H
10+
811
#ifndef GPUCA_GPUCODE_DEVICE
912
#include <stdint.h>
1013
#include <cmath>
@@ -868,4 +871,6 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match");
868871

869872
} // namespace OrtDataType
870873

871-
} // namespace o2
874+
} // namespace o2
875+
876+
#endif

Common/ML/include/ML/OrtInterface.h

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,12 @@ class OrtModel
5353
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
5454
std::vector<O> inference(std::vector<I>&);
5555

56-
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
57-
O* inference(I*, size_t);
56+
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/3rdparty/GPUORTFloat16.h
57+
std::vector<O> inference(std::vector<std::vector<I>>&);
5858

5959
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h
6060
void inference(I*, size_t, O*);
6161

62-
template <class I, class O> // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h
63-
std::vector<O> inference(std::vector<std::vector<I>>&);
64-
6562
// template<class I, class T, class O> // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type
6663
// std::vector<O> inference(std::vector<I>&);
6764

Common/ML/src/OrtInterface.cxx

Lines changed: 38 additions & 144 deletions
Large diffs are not rendered by default.

GPU/GPUTracking/Base/GPUConstantMem.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@
3434
#include "GPUKernelDebugOutput.h"
3535
#endif
3636

37+
#ifdef GPUCA_HAS_ONNX
38+
#include "GPUTPCNNClusterizer.h"
39+
#endif
40+
3741
namespace o2::gpu
3842
{
3943
struct GPUConstantMem {
@@ -55,6 +59,9 @@ struct GPUConstantMem {
5559
#ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT
5660
GPUKernelDebugOutput debugOutput;
5761
#endif
62+
#ifdef GPUCA_HAS_ONNX
63+
GPUTPCNNClusterizer tpcNNClusterer[GPUCA_NSECTORS];
64+
#endif
5865

5966
template <int32_t I>
6067
GPUd() auto& getTRDTracker();

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos
9393
for (uint32_t i = 0; i < NSECTORS; i++) {
9494
processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place
9595
processors()->tpcClusterer[i].mISector = i;
96+
#ifdef GPUCA_HAS_ONNX
97+
processors()->tpcNNClusterer[i].mISector = i;
98+
#endif
9699
}
97100
#ifndef GPUCA_NO_ROOT
98101
mROOTDump = GPUROOTDumpCore::getAndCreate();

GPU/GPUTracking/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ set(SRCS_NO_CINT ${SRCS_NO_CINT}
193193
Merger/GPUTPCGMO2Output.cxx)
194194

195195
if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
196-
list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx)
196+
list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx)
197197
endif()
198198

199199
set(SRCS_DATATYPES

GPU/GPUTracking/Definitions/GPUDefGPUParameters.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@
8181
#define GPUCA_LB_GPUTPCCFNoiseSuppression 512
8282
#define GPUCA_LB_GPUTPCCFDeconvolution 512
8383
#define GPUCA_LB_GPUTPCCFClusterizer 448
84-
#define GPUCA_LB_GPUTPCNNClusterizer 448
84+
#define GPUCA_LB_GPUTPCNNClusterizerKernels 448
8585
#define GPUCA_LB_COMPRESSION_GATHER 1024
8686
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
8787
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -148,7 +148,7 @@
148148
#define GPUCA_LB_GPUTPCCFNoiseSuppression 512
149149
#define GPUCA_LB_GPUTPCCFDeconvolution 512
150150
#define GPUCA_LB_GPUTPCCFClusterizer 512
151-
#define GPUCA_LB_GPUTPCNNClusterizer 512
151+
#define GPUCA_LB_GPUTPCNNClusterizerKernels 512
152152
#define GPUCA_LB_COMPRESSION_GATHER 1024
153153
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5
154154
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -215,7 +215,7 @@
215215
#define GPUCA_LB_GPUTPCCFNoiseSuppression 448
216216
#define GPUCA_LB_GPUTPCCFDeconvolution 384
217217
#define GPUCA_LB_GPUTPCCFClusterizer 448
218-
#define GPUCA_LB_GPUTPCNNClusterizer 448
218+
#define GPUCA_LB_GPUTPCNNClusterizerKernels 448
219219
#define GPUCA_LB_COMPRESSION_GATHER 1024
220220
#define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4
221221
#define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20
@@ -492,8 +492,8 @@
492492
#ifndef GPUCA_LB_GPUTPCCFClusterizer
493493
#define GPUCA_LB_GPUTPCCFClusterizer 512
494494
#endif
495-
#ifndef GPUCA_LB_GPUTPCNNClusterizer
496-
#define GPUCA_LB_GPUTPCNNClusterizer 512
495+
#ifndef GPUCA_LB_GPUTPCNNClusterizerKernels
496+
#define GPUCA_LB_GPUTPCNNClusterizerKernels 512
497497
#endif
498498
#ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU
499499
#define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256
@@ -515,12 +515,12 @@
515515
#define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression
516516

517517
#ifdef GPUCA_HAS_ONNX
518-
#define GPUCA_LB_GPUTPCNNClusterizer_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizer
519-
#define GPUCA_LB_GPUTPCNNClusterizer_fillInputNN GPUCA_LB_GPUTPCNNClusterizer
520-
#define GPUCA_LB_GPUTPCNNClusterizer_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizer
521-
#define GPUCA_LB_GPUTPCNNClusterizer_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizer
522-
#define GPUCA_LB_GPUTPCNNClusterizer_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizer
523-
#define GPUCA_LB_GPUTPCNNClusterizer_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizer
518+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels
519+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels
520+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels
521+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
522+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
523+
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels
524524
#endif
525525

526526
#define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors()
104104
if (GetRecoSteps() & RecoStep::TPCClusterFinding) {
105105
for (uint32_t i = 0; i < NSECTORS; i++) {
106106
mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding);
107+
#ifdef GPUCA_HAS_ONNX
108+
mRec->RegisterGPUProcessor(&processors()->tpcNNClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding);
109+
#endif
107110
}
108111
}
109112
if (GetRecoSteps() & RecoStep::Refit) {
@@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors()
149152
if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) {
150153
for (uint32_t i = 0; i < NSECTORS; i++) {
151154
mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]);
155+
#ifdef GPUCA_HAS_ONNX
156+
mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcNNClusterer[i], &processors()->tpcNNClusterer[i]);
157+
#endif
152158
}
153159
}
154160
if (GetRecoStepsGPU() & RecoStep::Refit) {

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@
4040
#endif
4141

4242
#ifdef GPUCA_HAS_ONNX
43-
#include "GPUTPCNNClusterizer.h"
44-
#include "GPUTPCNNClusterizerInternals.h"
43+
#include "GPUTPCNNClusterizerKernels.h"
44+
#include "GPUTPCNNClusterizerHost.h"
4545
#endif
4646

4747
using namespace o2::gpu;
@@ -858,7 +858,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
858858
mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
859859
uint32_t iSector = iSectorBase + lane;
860860
GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector];
861+
GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
861862
GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer;
863+
862864
if (doGPU) {
863865
SynchronizeStream(lane);
864866
}
@@ -878,62 +880,68 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
878880

879881
if (GetProcessingSettings().nn.applyNNclusterizer) {
880882
#ifdef GPUCA_HAS_ONNX
881-
// Settings for the clusterizer
882-
GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn;
883-
clusterer.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
884-
clusterer.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
885-
clusterer.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
886-
clusterer.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
887-
clusterer.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
888-
clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0);
889-
clusterer.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
890-
clusterer.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
883+
884+
// Setting some initial sizes, important for memory allocation
885+
const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
886+
clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
887+
clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
888+
clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
889+
clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
890+
clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
891+
clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0);
892+
clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
893+
clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
891894
if (nn_settings.nnClusterizerVerbosity < 0) {
892-
clusterer.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
895+
clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
893896
} else {
894-
clusterer.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
897+
clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
895898
}
896899

897900
// Settings for the NN evaluation
898-
clusterer.nnClassThreshold = nn_settings.nnClassThreshold;
899-
clusterer.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
901+
clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;
902+
clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
900903

901-
GPUTPCNNClusterizerInternals nnApplication(GetProcessingSettings(), clusterer);
904+
GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
902905

903-
if (clusterer.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
906+
if(fragment.index == 0){
907+
AllocateRegisteredMemory(clustererNN.mMemoryId);
908+
}
909+
910+
if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
904911
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
905912
DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
906913
}
907914

908-
if (clusterer.nnSigmoidTrafoClassThreshold) {
915+
if (clustererNN.nnSigmoidTrafoClassThreshold) {
909916
// Inverse sigmoid transformation
910-
clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold));
917+
clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold));
911918
}
912919

913920
float time_clusterizer = 0, time_fill = 0;
914921
int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
915922

916-
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) {
917-
uint batchStart = batch * clusterer.nnClusterizerBatchedMode;
918-
uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
923+
for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
924+
uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
925+
size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
919926

920927
auto start0 = std::chrono::high_resolution_clock::now();
921-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data
928+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Filling the data
929+
922930
auto stop0 = std::chrono::high_resolution_clock::now();
923931
auto start1 = std::chrono::high_resolution_clock::now();
924-
nnApplication.inferenceNetworkClass(clusterer, evalDtype, batchStart);
932+
nnApplication.inferenceNetworkClass(clustererNN, iSize, evalDtype, batchStart);
925933
if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
926-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels
934+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
927935
} else {
928-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels
936+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
929937
}
930938

931-
if (!clusterer.nnClusterizerUseCfRegression) {
932-
nnApplication.inferenceNetworkReg1(clusterer, evalDtype, batchStart);
933-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 1
939+
if (!clustererNN.nnClusterizerUseCfRegression) {
940+
nnApplication.inferenceNetworkReg1(clustererNN, iSize, evalDtype, batchStart);
941+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1
934942
if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
935-
nnApplication.inferenceNetworkReg2(clusterer, evalDtype, batchStart);
936-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 2
943+
nnApplication.inferenceNetworkReg2(clustererNN, iSize, evalDtype, batchStart);
944+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2
937945
}
938946
}
939947
auto stop1 = std::chrono::high_resolution_clock::now();
@@ -943,13 +951,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
943951
}
944952

945953
auto start1 = std::chrono::high_resolution_clock::now();
946-
if (clusterer.nnClusterizerUseCfRegression) {
947-
runKernel<GPUTPCNNClusterizer, GPUTPCNNClusterizer::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
954+
if (clustererNN.nnClusterizerUseCfRegression) {
955+
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
948956
}
949957
auto stop1 = std::chrono::high_resolution_clock::now();
950958
time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
951959

952-
if (clusterer.nnClusterizerVerbosity < 3) {
960+
if (clustererNN.nnClusterizerVerbosity < 3) {
953961
LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
954962
}
955963
#else

GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ struct ChargePos;
5151

5252
class GPUTPCGeometry;
5353

54-
class GPUTPCNNClusterizerInternals;
55-
5654
class GPUTPCClusterFinder : public GPUProcessor
5755
{
5856
public:
@@ -144,40 +142,6 @@ class GPUTPCClusterFinder : public GPUProcessor
144142
int16_t mZSOffsetId = -1;
145143
int16_t mOutputId = -1;
146144

147-
// Neural network clusterization
148-
149-
int nnClusterizerSizeInputRow = 3;
150-
int nnClusterizerSizeInputPad = 3;
151-
int nnClusterizerSizeInputTime = 3;
152-
int nnClusterizerElementSize = -1;
153-
bool nnClusterizerAddIndexData = true;
154-
float nnClassThreshold = 0.16;
155-
bool nnSigmoidTrafoClassThreshold = 1;
156-
int nnClusterizerUseCfRegression = 0;
157-
int nnClusterizerBatchedMode = 1;
158-
int nnClusterizerVerbosity = 0;
159-
int nnClusterizerBoundaryFillValue = -1;
160-
int nnClusterizerDumpDigits = 0;
161-
int nnClusterizerApplyCfDeconvolution = 0;
162-
int nnClusterizerModelClassNumOutputNodes = -1;
163-
int nnClusterizerModelReg1NumOutputNodes = -1;
164-
int nnClusterizerModelReg2NumOutputNodes = -1;
165-
uint nnClusterizerCurrentSize = -1; // This variable determines the size of the memory pointers. It will be set at runtime.
166-
int nnClusterizerDtype = 0; // 0: float16, 1: float32
167-
168-
// Memory allocation for neural network
169-
uint class2_elements = 0;
170-
float* inputData32=nullptr;
171-
OrtDataType::Float16_t* inputData16=nullptr;
172-
float* outputDataClass=nullptr;
173-
float* modelProbabilities=nullptr;
174-
float* outputDataReg1=nullptr;
175-
float* outputDataReg2=nullptr;
176-
177-
ChargePos* peakPositions=nullptr;
178-
bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
179-
float* centralCharges=nullptr;
180-
181145
#ifndef GPUCA_GPUCODE
182146
void DumpDigits(std::ostream& out);
183147
void DumpChargeMap(std::ostream& out, std::string_view);

0 commit comments

Comments
 (0)