
Commit 713dd64

Fixed segfault. Not producing the right number of clusters yet.
1 parent bce04bc


7 files changed: +170 -118 lines changed


GPU/GPUTracking/Base/GPUConstantMem.h

Lines changed: 1 addition & 2 deletions
@@ -42,8 +42,7 @@ namespace o2::gpu
 {
 struct GPUConstantMem {
   GPUParam param;
-  GPUTPCTracker
-  tpcTrackers[GPUCA_NSECTORS];
+  GPUTPCTracker tpcTrackers[GPUCA_NSECTORS];
   GPUTPCConvert tpcConverter;
   GPUTPCCompression tpcCompressor;
   GPUTPCDecompression tpcDecompressor;

GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx

Lines changed: 82 additions & 79 deletions
@@ -523,7 +523,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers)
         mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec);
       } else {
         AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId);
-          AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
+        AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId);
       }
     }
   } else {
@@ -611,6 +611,36 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
     RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline)
   }
 
+#ifdef GPUCA_HAS_ONNX
+  uint32_t maxClusters = -1;
+  for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
+    maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters);
+  }
+  for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
+    GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
+    const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
+    clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
+    clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
+    clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
+    clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
+    clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
+    clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0);
+    clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
+    clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
+    clustererNN.nnClusterizerTotalClusters = maxClusters;
+    clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;
+    clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
+    if (nn_settings.nnClusterizerVerbosity < 0) {
+      clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
+    } else {
+      clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
+    }
+    clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
+    GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+    AllocateRegisteredMemory(clustererNN.mMemoryId);
+  }
+#endif
+
   if (doGPU && mIOPtrs.tpcZS) {
     processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta;
     WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1);
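
A reading note on the block added above, not part of the commit: nnClusterizerElementSize is the flattened size of the (2*row+1) x (2*pad+1) x (2*time+1) charge window around a peak, plus three extra entries when nnClusterizerAddIndexData is set (what the three index values encode is not visible in this diff). Note also that uint32_t maxClusters = -1 starts the std::max reduction at UINT32_MAX, so the loop never changes it; that may relate to the wrong cluster counts mentioned in the commit message. A minimal standalone sketch of the size formula, with assumed half-sizes:

// Standalone C++ sketch of the element-size formula above; the window
// half-sizes are assumed for illustration, not the commit's defaults.
#include <cstdint>
#include <cstdio>

int32_t elementSize(int32_t row, int32_t pad, int32_t time, bool addIndexData)
{
  // Flattened 3D charge window around the peak, plus 3 optional index values
  return (2 * row + 1) * (2 * pad + 1) * (2 * time + 1) + (addIndexData ? 3 : 0);
}

int main()
{
  std::printf("%d\n", elementSize(3, 3, 3, true)); // 7 * 7 * 7 + 3 = 346
}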
@@ -885,86 +915,59 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
       // Setting some initial sizes, important for memory allocation
       const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
-      clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
-      clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
-      clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
-      clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime;
-      clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData;
-      clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0);
-      clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode;
-      clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue;
-      clustererNN.nnClusterizerTotalClusters = clusterer.mNMaxClusterPerRow;
-      if (nn_settings.nnClusterizerVerbosity < 0) {
-        clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity;
-      } else {
-        clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
-      }
-
       int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
-      clustererNN.nnClusterizerDtype = evalDtype;
-
-      // Settings for the NN evaluation
-      clustererNN.nnClassThreshold = nn_settings.nnClassThreshold;
-      clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold;
-
-      GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
-
-      if (fragment.index == 0) {
-        AllocateRegisteredMemory(clustererNN.mMemoryId);
-      }
-
-      if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
-        runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
-        DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
-      }
-
-      if (clustererNN.nnSigmoidTrafoClassThreshold) {
-        // Inverse sigmoid transformation
-        clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold));
-      }
-
-      float time_clusterizer = 0, time_fill = 0;
-
-      for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
-        uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
-        size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
-
-        auto start0 = std::chrono::high_resolution_clock::now();
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Filling the data
-
-        auto stop0 = std::chrono::high_resolution_clock::now();
-        auto start1 = std::chrono::high_resolution_clock::now();
-        nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, evalDtype);
-        if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
-        } else {
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels
-        }
 
-        if (!clustererNN.nnClusterizerUseCfRegression) {
-          nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, evalDtype);
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1
-          if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
-            nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, evalDtype);
-            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2
-          }
-        }
-        auto stop1 = std::chrono::high_resolution_clock::now();
-
-        time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-        time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
-      }
-
-      auto start1 = std::chrono::high_resolution_clock::now();
-      if (clustererNN.nnClusterizerUseCfRegression) {
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
-      }
-      auto stop1 = std::chrono::high_resolution_clock::now();
-      time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
-
-      if (clustererNN.nnClusterizerVerbosity < 3) {
-        LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
-      }
+      GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+
+      if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
+        runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
+        DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+      }
+
+      if (clustererNN.nnSigmoidTrafoClassThreshold) {
+        // Inverse sigmoid transformation
+        clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold));
+      }
+
+      float time_clusterizer = 0, time_fill = 0;
+      for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) {
+        uint batchStart = batch * clustererNN.nnClusterizerBatchedMode;
+        size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
+
+        auto start0 = std::chrono::high_resolution_clock::now();
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNN>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
+
+        auto stop0 = std::chrono::high_resolution_clock::now();
+        auto start1 = std::chrono::high_resolution_clock::now();
+        nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
+        if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+        } else {
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass2Labels>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+        }
+
+        if (!clustererNN.nnClusterizerUseCfRegression) {
+          nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass1Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
+          if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
+            nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
+            runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishClass2Regression>({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
+          }
+        }
+        auto stop1 = std::chrono::high_resolution_clock::now();
+
+        time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+        time_fill += std::chrono::duration_cast<std::chrono::nanoseconds>(stop0 - start0).count() / 1e9;
+      }
+      auto start1 = std::chrono::high_resolution_clock::now();
+      if (clustererNN.nnClusterizerUseCfRegression) {
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::runCfClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+      }
+      auto stop1 = std::chrono::high_resolution_clock::now();
+      time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
+      if (clustererNN.nnClusterizerVerbosity < 3) {
+        LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s";
+      }
 #else
       GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
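
A note on the "Inverse sigmoid transformation" in the hunk above: converting the class threshold t from probability space to logit space via ln(t / (1 - t)) lets the label kernels cut on the raw network output, because sigmoid is monotonic, so sigmoid(x) >= t exactly when x >= ln(t / (1 - t)), and the per-cluster sigmoid evaluation can be skipped. A self-contained sketch of that equivalence (the threshold value is assumed):

// C++ sketch: cutting on the raw output with a logit-transformed threshold
// matches cutting on sigmoid(output) with the original threshold.
#include <cmath>
#include <cstdio>

int main()
{
  const float t = 0.85f;                        // assumed class threshold, probability space
  const float logitT = std::log(t / (1.f - t)); // same transform as in the diff
  for (float x : {-2.f, 0.f, 1.f, 1.8f, 3.f}) {
    float p = 1.f / (1.f + std::exp(-x));       // sigmoid of the raw output
    std::printf("x=%5.2f p=%.3f pass(prob)=%d pass(logit)=%d\n", x, p, p >= t, x >= logitT);
  }
}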

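The batching loop in the same hunk partitions the nClusters peaks into fixed-size chunks, with the last chunk truncated so indices stay in range. A standalone sketch of that index arithmetic with assumed sizes; the integer ceiling here replaces the float std::ceil round-trip used in the commit but yields the same batch count:

// C++ sketch of the batch partitioning used by the NN clusterizer loop;
// nClusters and batchSize are assumed values for illustration.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
  const uint32_t nClusters = 1000; // total peaks (assumed)
  const uint32_t batchSize = 256;  // corresponds to nnClusterizerBatchedMode (assumed)
  const uint32_t nBatches = (nClusters + batchSize - 1) / batchSize; // ceil(n / batchSize)
  for (uint32_t batch = 0; batch < nBatches; batch++) {
    uint32_t batchStart = batch * batchSize;
    uint32_t iSize = std::min(batchSize, nClusters - batchStart); // truncated last chunk
    std::printf("batch %u covers [%u, %u)\n", batch, batchStart, batchStart + iSize);
  }
}
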
GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx

Lines changed: 49 additions & 0 deletions
@@ -119,3 +119,52 @@ GPUd() bool ClusterAccumulator::toNative(const ChargePos& pos, Charge q, tpc::Cl
 
   return true;
 }
+
+GPUd() bool ClusterAccumulator::toNativeSimple(const ChargePos& pos, Charge q, tpc::ClusterNative& cn, const GPUParam& param, const Array2D<PackedCharge>& chargeMap)
+{
+  cn.qTot = CAMath::Float2UIntRn(mQtot);
+  if (cn.qTot <= param.rec.tpc.cfQTotCutoff) {
+    return false;
+  }
+  cn.qMax = q;
+  if (cn.qMax <= param.rec.tpc.cfQMaxCutoff) {
+    return false;
+  }
+  if (mTimeMean < param.rec.tpc.clustersShiftTimebinsClusterizer) {
+    return false;
+  }
+  if (q <= param.rec.tpc.cfQMaxCutoffSingleTime && mTimeSigma == 0) {
+    return false;
+  }
+  if (q <= param.rec.tpc.cfQMaxCutoffSinglePad && mPadSigma == 0) {
+    return false;
+  }
+
+  bool wasSplitInTime = mSplitInTime >= param.rec.tpc.cfMinSplitNum;
+  bool wasSplitInPad = mSplitInPad >= param.rec.tpc.cfMinSplitNum;
+  bool isSingleCluster = (mPadSigma == 0) || (mTimeSigma == 0);
+
+  uint8_t flags = 0;
+  uint8_t pad = pos.pad();
+  bool isEdgeCluster = pad < 2 || pad >= param.tpcGeometry.NPads(pos.row()) - 2; // Geometrical edge check: peak within 2 pads of the sector edge
+  if (isEdgeCluster) {
+    bool leftEdge = (pad < 2);
+    if (leftEdge ? (pad == 1 && chargeMap[pos.delta({-1, 0})].unpack() < 1) : (pad == (param.tpcGeometry.NPads(pos.row()) - 2) && chargeMap[pos.delta({1, 0})].unpack() < 1)) {
+      isEdgeCluster = false; // Not an edge cluster if the peak is close to the edge but there is no charge at the edge
+    } else if (leftEdge ? (pad < mPadMean) : (pad > mPadMean)) {
+      mPadMean = pad; // Correct to the peak position if the COG is closer to the middle of the pad row than the peak
+    }
+  }
+
+  flags |= (isEdgeCluster) ? tpc::ClusterNative::flagEdge : 0;
+  flags |= (wasSplitInTime) ? tpc::ClusterNative::flagSplitTime : 0;
+  flags |= (wasSplitInPad) ? tpc::ClusterNative::flagSplitPad : 0;
+  flags |= (isSingleCluster) ? tpc::ClusterNative::flagSingle : 0;
+
+  cn.setTimeFlags(mTimeMean - param.rec.tpc.clustersShiftTimebinsClusterizer, flags);
+  cn.setPad(mPadMean);
+  cn.setSigmaTime(mTimeSigma);
+  cn.setSigmaPad(mPadSigma);
+
+  return true;
+}
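
The flag assembly in toNativeSimple is a plain bitmask built after a geometric edge test (a peak within two pads of either sector edge). A reduced sketch of that decision with hypothetical flag constants and pad count; the real values come from tpc::ClusterNative and the TPC pad-plane geometry:

// Reduced C++ sketch of the edge test and flag bitmask in toNativeSimple;
// flag bit values and the per-row pad count are assumptions for illustration.
#include <cstdint>
#include <cstdio>

constexpr uint8_t kFlagEdge = 0x1;   // stands in for tpc::ClusterNative::flagEdge
constexpr uint8_t kFlagSingle = 0x2; // stands in for tpc::ClusterNative::flagSingle

bool isEdgeCandidate(uint8_t pad, uint8_t nPads)
{
  return pad < 2 || pad >= nPads - 2; // within 2 pads of either edge
}

int main()
{
  const uint8_t nPads = 66;   // assumed pad count for this row
  const uint8_t pad = 65;     // peak on the right-most pad
  const bool isSingle = true; // e.g. zero sigma in pad or time
  uint8_t flags = 0;
  flags |= isEdgeCandidate(pad, nPads) ? kFlagEdge : 0;
  flags |= isSingle ? kFlagSingle : 0;
  std::printf("flags = 0x%02x\n", flags); // 0x03: edge + single
}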

GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ class ClusterAccumulator
   GPUd() tpccf::Charge updateOuter(PackedCharge, tpccf::Delta2);
 
   GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&, tpccf::TPCTime, const Array2D<PackedCharge>&);
+  GPUd() bool toNativeSimple(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D<PackedCharge>&);
 
   GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad)
   {
