@@ -246,6 +246,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
   mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
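+  // tpcApplyCFCutsAtDecoding: re-apply the cluster-finder cuts while decoding (drives the runFiltering branches below)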
+  bool runFiltering = GetProcessingSettings().tpcApplyCFCutsAtDecoding;
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
@@ -300,12 +301,6 @@ int32_t GPUChainTracking::RunTPCDecompression()
   GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
   GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);

-  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-  DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
-  Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
-  WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
   TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
   SynchronizeStream(inputStream);
   uint32_t offset = 0;
@@ -324,27 +319,83 @@ int32_t GPUChainTracking::RunTPCDecompression()
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   }
-  if (doGPU) {
-    mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+  if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
+    Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
+    AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
+    mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
+    mClusterNativeAccess->setOffsetPtrs();
+    *Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
+  } else { // If not filtering, directly allocate the final buffers
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
+    Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
+    }
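+    // Re-point the host-side access structure to the host output buffer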
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
     mClusterNativeAccess->setOffsetPtrs();
     *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
-    processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-    WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
   }
-  mIOPtrs.clustersNative = mClusterNativeAccess.get();
-  mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
-  mClusterNativeAccess->setOffsetPtrs();

   uint32_t batchSize = doGPU ? 6 : NSLICES;
   for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
     int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
     uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
-    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
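+    // When filtering, the copy to the host output is deferred until after storeFilteredClusters below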
+    if (!runFiltering) {
+      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+    }
   }
   SynchronizeGPU();

+  if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
+    AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+    runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
+    SynchronizeStream(unattachedStream);
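+    // Total number of clusters surviving the cuts, summed over all sector-rows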
+    uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
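+    // Rebuild the per-sector-row cluster counts from the filtered counters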
+    for (uint32_t i = 0; i < NSLICES; i++) {
+      for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+        mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
+      }
+    }
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
+    }
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+    mClusterNativeAccess->setOffsetPtrs();
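+    // Scatter the surviving clusters into the final buffer, then copy them to the host output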
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
+    SynchronizeStream(unattachedStream);
+  }
   if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
     runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
     const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
@@ -357,6 +408,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
         }
       }
     }
+    SynchronizeStream(unattachedStream);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
 }