@@ -40,7 +40,7 @@ int32_t GPUChainTracking::RunTPCCompression()
     RecordMarker(mEvents->single, 0);
   }
 
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
   }
   SetupGPUProcessor(&Compressor, true);
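Context for this hunk: gather mode 3 writes the compressed output into volatile device memory, which the chain reclaims in one call once the host copy has completed (see the ReturnVolatileDeviceMemory() call in a later hunk). Below is a minimal sketch of that watermark idiom on top of a linear allocator; all names here (StackAllocator, markVolatile, returnVolatile) are hypothetical illustrations, not the O2 API.

#include <cstddef>
#include <cstdint>

// Hypothetical linear allocator with a "volatile" watermark: everything
// allocated after markVolatile() is released by a single returnVolatile().
class StackAllocator {
  char* mPool;                          // backing device memory pool (allocation not shown)
  size_t mSize;
  size_t mOffset = 0;
  size_t mVolatileMark = SIZE_MAX;      // SIZE_MAX: no volatile region active

 public:
  StackAllocator(char* pool, size_t size) : mPool(pool), mSize(size) {}

  void* allocate(size_t n)
  {
    n = (n + 255) & ~size_t(255);       // keep GPU-friendly alignment
    if (mOffset + n > mSize) {
      return nullptr;                   // pool exhausted
    }
    void* p = mPool + mOffset;
    mOffset += n;
    return p;
  }

  void markVolatile() { mVolatileMark = mOffset; } // future allocations become volatile

  void returnVolatile()                 // roll the allocator back to the watermark
  {
    if (mVolatileMark != SIZE_MAX) {
      mOffset = mVolatileMark;
      mVolatileMark = SIZE_MAX;
    }
  }
};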
@@ -73,19 +73,19 @@ int32_t GPUChainTracking::RunTPCCompression()
   Compressor.mOutputFlat->set(outputSize, *Compressor.mOutput);
   char* hostFlatPtr = (char*)Compressor.mOutput->qTotU; // First array as allocated in GPUTPCCompression::SetPointersCompressedClusters
   size_t copySize = 0;
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     CompressorShadow.mOutputA = Compressor.mOutput;
     copySize = AllocateRegisteredMemory(Compressor.mMemoryResOutputGPU); // We overwrite Compressor.mOutput with the allocated output pointers on the GPU
   }
   const o2::tpc::CompressedClustersPtrs* P = nullptr;
   HighResTimer* gatherTimer = nullptr;
   int32_t outputStream = 0;
-  if (ProcessingSettings().doublePipeline) {
+  if (GetProcessingSettings().doublePipeline) {
     SynchronizeStream(OutputStream()); // Synchronize output copies running in parallel from memory that might be released, only the following async copy from stacked memory is safe after the chain finishes.
     outputStream = OutputStream();
   }
-  if (ProcessingSettings().tpcCompressionGatherMode >= 2) {
-    if (ProcessingSettings().tpcCompressionGatherMode == 2) {
+  if (GetProcessingSettings().tpcCompressionGatherMode >= 2) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 2) {
       void* devicePtr = mRec->getGPUPointer(Compressor.mOutputFlat);
       if (devicePtr != Compressor.mOutputFlat) {
         CompressedClustersPtrs& ptrs = *Compressor.mOutput; // We need to update the ptrs with the gpu-mapped version of the host address space
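The hunk is cut off just before the rebasing loop: for gather mode 2, the comment above says the host-resident output pointers must be rewritten to their GPU-mapped equivalents before the transfer. Here is a sketch of such a rebase, under the assumption that the struct holds nothing but pointers into one flat buffer; Ptrs and rebasePtrs are hypothetical stand-ins for CompressedClustersPtrs and the elided loop.

#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for CompressedClustersPtrs: pointers into one flat buffer.
struct Ptrs {
  uint16_t* qTotU;
  uint8_t* qMaxU;
  uint8_t* flagsU;
};

// Shift every pointer member by the distance between the host buffer and its
// GPU-mapped alias, so all offsets inside the buffer stay identical.
void rebasePtrs(Ptrs& ptrs, void* hostBase, void* deviceBase)
{
  const ptrdiff_t shift = static_cast<char*>(deviceBase) - static_cast<char*>(hostBase);
  char** raw = reinterpret_cast<char**>(&ptrs); // valid only while Ptrs holds pointers exclusively
  for (size_t i = 0; i < sizeof(Ptrs) / sizeof(char*); i++) {
    raw[i] += shift;
  }
}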
@@ -97,7 +97,7 @@ int32_t GPUChainTracking::RunTPCCompression()
     TransferMemoryResourcesToGPU(myStep, &Compressor, outputStream);
     constexpr uint32_t nBlocksDefault = 2;
     constexpr uint32_t nBlocksMulti = 1 + 2 * 200;
-    switch (ProcessingSettings().tpcCompressionGatherModeKernel) {
+    switch (GetProcessingSettings().tpcCompressionGatherModeKernel) {
       case 0:
         runKernel<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(GetGridBlkStep(nBlocksDefault, outputStream, RecoStep::TPCCompression));
         getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::unbuffered>(RecoStep::TPCCompression, 0, outputSize, false);
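The switch above exists because the gather kernel flavor is a template parameter and must be fixed at compile time, so the runtime setting tpcCompressionGatherModeKernel has to be mapped case by case onto specializations. A reduced sketch of the pattern follows, with hypothetical names (Gather, runGatherKernel, dispatchGather); the real kernels are the GPUTPCCompressionGatherKernels specializations.

#include <cstdint>
#include <cstdio>

// Compile-time tags standing in for the GPUTPCCompressionGatherKernels variants.
enum class Gather { Unbuffered, Buffered, MultiBlock };

template <Gather K>
void runGatherKernel(uint32_t nBlocks)
{
  // launch the specialization for K with nBlocks blocks (launch elided)
  std::printf("variant %d with %u blocks\n", static_cast<int>(K), nBlocks);
}

int32_t dispatchGather(int32_t kernelMode, uint32_t nBlocksDefault, uint32_t nBlocksMulti)
{
  switch (kernelMode) { // runtime setting selects a compile-time specialization
    case 0:
      runGatherKernel<Gather::Unbuffered>(nBlocksDefault);
      break;
    case 1:
      runGatherKernel<Gather::Buffered>(nBlocksDefault);
      break;
    case 2:
      runGatherKernel<Gather::MultiBlock>(nBlocksMulti);
      break;
    default:
      return 1; // mirrors the GPUError + return 1 path above
  }
  return 0;
}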
@@ -120,10 +120,10 @@ int32_t GPUChainTracking::RunTPCCompression()
         getKernelTimer<GPUTPCCompressionGatherKernels, GPUTPCCompressionGatherKernels::multiBlock>(RecoStep::TPCCompression, 0, outputSize, false);
         break;
       default:
-        GPUError("Invalid compression kernel %d selected.", (int32_t)ProcessingSettings().tpcCompressionGatherModeKernel);
+        GPUError("Invalid compression kernel %d selected.", (int32_t)GetProcessingSettings().tpcCompressionGatherModeKernel);
         return 1;
     }
-    if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
       RecordMarker(mEvents->stream[outputStream], outputStream);
       char* deviceFlatPts = (char*)Compressor.mOutput->qTotU;
       if (GetProcessingSettings().doublePipeline) {
@@ -138,9 +138,9 @@ int32_t GPUChainTracking::RunTPCCompression()
     }
   } else {
     int8_t direction = 0;
-    if (ProcessingSettings().tpcCompressionGatherMode == 0) {
+    if (GetProcessingSettings().tpcCompressionGatherMode == 0) {
       P = &CompressorShadow.mPtrs;
-    } else if (ProcessingSettings().tpcCompressionGatherMode == 1) {
+    } else if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
       P = &Compressor.mPtrs;
       direction = -1;
       gatherTimer = &getTimer<GPUTPCCompressionKernels>("GPUTPCCompression_GatherOnCPU", 0);
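Modes 0 and 1 share the per-array GPUMemCpyAlways loop that follows and differ only in the source pointers and the direction flag: mode 0 copies from the device-side CompressorShadow.mPtrs, while mode 1 gathers from the host-side Compressor.mPtrs with direction = -1 under a CPU timer. The sketch below shows the convention this appears to imply — a plain host memcpy for -1, an async device copy otherwise. This is an assumption about GPUMemCpyAlways, whose real signature carries more parameters, and gpuMemcpyAsync is a placeholder.

#include <cstdint>
#include <cstring>

// Placeholder standing in for the framework's async device copy backend.
void gpuMemcpyAsync(void* dst, const void* src, size_t size, bool toGPU, int32_t stream)
{
  (void)toGPU;
  (void)stream;
  std::memcpy(dst, src, size); // a real backend would enqueue an async copy on the stream
}

// Sketch of the assumed direction convention: a non-negative direction selects
// an async host/device copy on the given stream, -1 a plain host-side memcpy
// (gather mode 1, where the clusters are already on the host).
void memCpyAlways(void* dst, const void* src, size_t size, int32_t stream, int8_t direction)
{
  if (direction < 0) {
    std::memcpy(dst, src, size);
  } else {
    gpuMemcpyAsync(dst, src, size, direction != 0, stream);
  }
}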
@@ -184,11 +184,11 @@ int32_t GPUChainTracking::RunTPCCompression()
     GPUMemCpyAlways(myStep, O->timeA, P->timeA, O->nTracks * sizeof(O->timeA[0]), outputStream, direction);
     GPUMemCpyAlways(myStep, O->padA, P->padA, O->nTracks * sizeof(O->padA[0]), outputStream, direction);
   }
-  if (ProcessingSettings().tpcCompressionGatherMode == 1) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 1) {
     gatherTimer->Stop();
   }
   mIOPtrs.tpcCompressedClusters = Compressor.mOutputFlat;
-  if (ProcessingSettings().tpcCompressionGatherMode == 3) {
+  if (GetProcessingSettings().tpcCompressionGatherMode == 3) {
     SynchronizeEventAndRelease(mEvents->stream[outputStream]);
     mRec->ReturnVolatileDeviceMemory();
   }
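For mode 3 the copy scheduled earlier reads from volatile device memory, so SynchronizeEventAndRelease() must wait on the marker recorded after the gather before ReturnVolatileDeviceMemory() hands that memory back for reuse. Below is a sketch of the same ordering written against the CUDA runtime API; using CUDA here is an assumption, since the framework abstracts the backend.

#include <cuda_runtime.h>

// Ordering sketch: enqueue the async copy, record an event behind it, and
// only release the (volatile) source buffer after the event has completed.
void copyThenRelease(void* hostDst, const void* deviceSrc, size_t size,
                     cudaStream_t stream, cudaEvent_t marker)
{
  cudaMemcpyAsync(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost, stream);
  cudaEventRecord(marker, stream);  // corresponds to RecordMarker(...) above

  // ... other host work may overlap with the copy here ...

  cudaEventSynchronize(marker);     // corresponds to SynchronizeEventAndRelease(...)
  // Only now may the volatile device memory backing deviceSrc be reclaimed,
  // as mRec->ReturnVolatileDeviceMemory() does in the chain.
}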
@@ -209,18 +209,52 @@ int32_t GPUChainTracking::RunTPCDecompression()
   if (GetProcessingSettings().tpcUseOldCPUDecoding) {
     const auto& threadContext = GetThreadContext();
     TPCClusterDecompressor decomp;
-    auto allocator = [this](size_t size) {
+    auto allocatorFinal = [this](size_t size) {
       this->mInputsHost->mNClusterNative = this->mInputsShadow->mNClusterNative = size;
       this->AllocateRegisteredMemory(this->mInputsHost->mResourceClusterNativeOutput, this->mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
       return this->mInputsHost->mPclusterNativeOutput;
     };
-    auto& gatherTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
-    gatherTimer.Start();
-    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocator, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
+    std::unique_ptr<ClusterNative[]> tmpBuffer;
+    auto allocatorTmp = [&tmpBuffer](size_t size) {
+      return (tmpBuffer = std::make_unique<ClusterNative[]>(size)).get();
+    };
+    auto& decompressTimer = getTimer<TPCClusterDecompressor>("TPCDecompression", 0);
+    auto allocatorUse = GetProcessingSettings().tpcApplyCFCutsAtDecoding ? std::function<ClusterNative*(size_t)>{allocatorTmp} : std::function<ClusterNative*(size_t)>{allocatorFinal};
+    decompressTimer.Start();
+    if (decomp.decompress(mIOPtrs.tpcCompressedClusters, *mClusterNativeAccess, allocatorUse, param(), GetProcessingSettings().deterministicGPUReconstruction)) {
       GPUError("Error decompressing clusters");
       return 1;
     }
-    gatherTimer.Stop();
+    if (GetProcessingSettings().tpcApplyCFCutsAtDecoding) {
+      ClusterNative* outputBuffer;
+      for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
+        uint32_t countTotal = 0;
+        for (uint32_t iSector = 0; iSector < GPUCA_NSLICES; iSector++) {
+          for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) {
+            uint32_t count = 0;
+            for (uint32_t k = 0; k < mClusterNativeAccess->nClusters[iSector][iRow]; k++) {
+              const ClusterNative& cl = mClusterNativeAccess->clusters[iSector][iRow][k];
+              bool keep = cl.qTot > param().rec.tpc.cfQTotCutoff && cl.qMax > param().rec.tpc.cfQMaxCutoff && (cl.sigmaPadPacked || !(cl.getFlags() & ClusterNative::flagSingle) || cl.qMax > param().rec.tpc.cfQMaxCutoffSinglePad) && (cl.sigmaTimePacked || !(cl.getFlags() & ClusterNative::flagSingle) || cl.qMax > param().rec.tpc.cfQMaxCutoffSingleTime);
+              if (iPhase && keep) {
+                outputBuffer[countTotal] = cl;
+              }
+              count += keep;
+              countTotal += keep;
+            }
+            if (iPhase) {
+              mClusterNativeAccess->nClusters[iSector][iRow] = count;
+            }
+          }
+        }
+        if (iPhase) {
+          mClusterNativeAccess->clustersLinear = outputBuffer;
+          mClusterNativeAccess->setOffsetPtrs();
+        } else {
+          outputBuffer = allocatorFinal(countTotal);
+        }
+      }
+    }
+    decompressTimer.Stop();
     mIOPtrs.clustersNative = mClusterNativeAccess.get();
     if (mRec->IsGPU()) {
       AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
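The new tpcApplyCFCutsAtDecoding path decompresses into a temporary buffer (allocatorTmp; the std::function wrapper is needed because the two lambdas have distinct closure types but must flow through one call site), then compacts in two phases: phase 0 runs the charge cuts only to count survivors so allocatorFinal can size the real output exactly, phase 1 reruns them to copy the survivors and patch the per-row counts and offset pointers. Note that the store must happen only for kept clusters and before the counters advance, as in the corrected hunk above. Below is a self-contained sketch of the same two-phase compaction on a flat array, with hypothetical names.

#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>

struct Cluster {
  uint16_t qTot, qMax;
};

// Two-phase compaction: pass 0 counts survivors so the output can be sized
// exactly; pass 1 copies them. Mirrors the iPhase loop in the hunk above.
std::unique_ptr<Cluster[]> applyCuts(const std::vector<Cluster>& in,
                                     uint16_t qTotCut, uint16_t qMaxCut, uint32_t& nOut)
{
  std::unique_ptr<Cluster[]> out;
  for (int32_t iPhase = 0; iPhase < 2; iPhase++) {
    uint32_t count = 0;
    for (const Cluster& cl : in) {
      const bool keep = cl.qTot > qTotCut && cl.qMax > qMaxCut;
      if (iPhase && keep) {
        out[count] = cl; // write before incrementing, and only if kept
      }
      count += keep;
    }
    if (!iPhase) {
      out = std::make_unique<Cluster[]>(count); // allocation sized by pass 0
    }
    nOut = count;
  }
  return out;
}

int main()
{
  std::vector<Cluster> in{{10, 5}, {3, 1}, {20, 8}};
  uint32_t n = 0;
  auto out = applyCuts(in, 4, 2, n);
  std::printf("%u of %zu clusters kept\n", n, in.size());
}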