@@ -246,13 +246,21 @@ int32_t GPUChainTracking::RunTPCDecompression()
     mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
     RecoStep myStep = RecoStep::TPCDecompression;
     bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
+    bool runFiltering = param().tpcCutTimeBin > 0;
     GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
     GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
     const auto& threadContext = GetThreadContext();
     CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters;
     CompressedClusters& inputGPU = Decompressor.mInputGPU;
     CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;

+    if (cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG) {
+      throw std::runtime_error("Configured solenoid Bz does not match value used for track model encoding");
+    }
+    if (cmprClsHost.nTracks && cmprClsHost.maxTimeBin != -1e6 && cmprClsHost.maxTimeBin != param().continuousMaxTimeBin) {
+      throw std::runtime_error("Configured max time bin does not match value used for track model encoding");
+    }
+
     int32_t inputStream = 0;
     int32_t unattachedStream = mRec->NStreams() - 1;
     inputGPU = cmprClsHost;
@@ -300,12 +308,6 @@ int32_t GPUChainTracking::RunTPCDecompression()
     GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
     GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);

-    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
-    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
-    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
     TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
     SynchronizeStream(inputStream);
     uint32_t offset = 0;
@@ -324,27 +326,83 @@ int32_t GPUChainTracking::RunTPCDecompression()
     if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
       GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
     }
-    if (doGPU) {
-      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+    if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
+      Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
+      AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
+      AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
+      mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
+    } else { // If not filtering, directly allocate the final buffers
+      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+      DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+      Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+      DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
+      Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+      if (doGPU) {
+        mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+        mClusterNativeAccess->setOffsetPtrs();
+        *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+        processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+        WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
+        TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
+      }
+      mIOPtrs.clustersNative = mClusterNativeAccess.get();
+      mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
       mClusterNativeAccess->setOffsetPtrs();
       *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
-      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
     }
-    mIOPtrs.clustersNative = mClusterNativeAccess.get();
-    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
-    mClusterNativeAccess->setOffsetPtrs();

     uint32_t batchSize = doGPU ? 6 : NSLICES;
     for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
       int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
       runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
       uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
-      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+      if (!runFiltering) {
+        GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+      }
     }
     SynchronizeGPU();

+    if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
+      AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+      runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
+      runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+      TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
+      SynchronizeStream(unattachedStream);
+      uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
+      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+      DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+      Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+      for (uint32_t i = 0; i < NSLICES; i++) {
+        for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+          mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
+        }
+      }
+      if (doGPU) {
+        mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+        mClusterNativeAccess->setOffsetPtrs();
+        *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+        processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+        WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
+        TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
+      }
+      mIOPtrs.clustersNative = mClusterNativeAccess.get();
+      mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+      mClusterNativeAccess->setOffsetPtrs();
+      runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
+      SynchronizeStream(unattachedStream);
+    }
     if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
       runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
       const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
@@ -357,6 +415,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
           }
         }
       }
+      SynchronizeStream(unattachedStream);
     }
     mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
   }
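
The runFiltering path added above follows a count-then-store pattern: the size of the final cluster-native buffers is only known after the time-bin cut is applied, so countFilteredClusters first fills the per sector/row counter array mNClusterPerSectorRow, the host accumulates it (std::accumulate) to size and allocate mResourceClusterNativeOutput exactly once, and storeFilteredClusters then writes only the surviving clusters. A minimal host-side sketch of the same pattern follows; the Cluster struct, the filterClusters helper, and the "keep time below the cut" criterion are illustrative assumptions, the real selection logic lives in the GPU kernels and operates on o2::tpc::ClusterNative.

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical stand-in for a decoded cluster.
struct Cluster {
  uint32_t sectorRow; // flat sector * rowCount + row index
  float time;         // time bin of the cluster
};

std::vector<Cluster> filterClusters(const std::vector<Cluster>& in, uint32_t nSectorRows, float cutTimeBin)
{
  // Pass 1: count survivors per sector/row (mirrors countFilteredClusters).
  std::vector<uint32_t> counts(nSectorRows, 0);
  for (const auto& c : in) {
    if (c.time < cutTimeBin) { // assumed cut criterion
      counts[c.sectorRow]++;
    }
  }
  // Total output size, as done with std::accumulate over mNClusterPerSectorRow.
  const uint32_t nFinal = std::accumulate(counts.begin(), counts.end(), 0u);

  // Exclusive prefix sum gives the per-row write offsets into the output buffer.
  std::vector<uint32_t> offsets(nSectorRows + 1, 0);
  std::partial_sum(counts.begin(), counts.end(), offsets.begin() + 1);

  // Pass 2: store only the surviving clusters (mirrors storeFilteredClusters).
  std::vector<Cluster> out(nFinal);
  std::vector<uint32_t> next(offsets.begin(), offsets.end() - 1);
  for (const auto& c : in) {
    if (c.time < cutTimeBin) {
      out[next[c.sectorRow]++] = c;
    }
  }
  return out;
}

Splitting the work into a counting pass and a storing pass is what lets the chain allocate the final output buffer at the filtered size before any cluster is written, instead of shrinking or copying it afterwards.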