
Commit ee7b21e (1 parent: 66b81d8)

GPU: TPC Decoding: add optional timebin cut to CTF cluster decoding

8 files changed: +156 -21 lines

GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx

Lines changed: 21 additions & 0 deletions
@@ -84,13 +84,34 @@ void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
   return mem;
 }
 
+void* GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering(void* mem)
+{
+  computePointerWithAlignment(mem, mNativeClustersBuffer, mNClusterNativeBeforeFiltering);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersInputClusterNativeAccess(void* mem)
+{
+  computePointerWithAlignment(mem, mClusterNativeAccess);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersNClusterPerSectorRow(void* mem)
+{
+  computePointerWithAlignment(mem, mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT);
+  return mem;
+}
+
 void GPUTPCDecompression::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput");
   mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
   mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
   mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
+  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
+  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
+  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
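
The three new SetPointers* functions above size and place the filtering scratch buffers: a temporary ClusterNative buffer holding all decoded clusters before the cut, a ClusterNativeAccess index owned by the decompressor, and one cluster counter per sector/row pair. As a minimal standalone sketch (with a hypothetical helper name and a 64-byte alignment, not the actual O2 implementation), the pointer-bump pattern these methods rely on looks like this:

#include <cstddef>
#include <cstdint>

// Carve a typed, aligned sub-buffer of n elements out of one scratch allocation
// and advance the cursor past it, as each SetPointers* method does per member.
template <class T>
void computePointerWithAlignmentSketch(void*& mem, T*& ptr, size_t n = 1, size_t alignment = 64)
{
  uintptr_t p = reinterpret_cast<uintptr_t>(mem);
  p = (p + alignment - 1) & ~(uintptr_t)(alignment - 1); // round the cursor up to the alignment
  ptr = reinterpret_cast<T*>(p);                         // hand out the typed pointer
  mem = reinterpret_cast<void*>(p + n * sizeof(T));      // bump the cursor for the next member
}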

GPU/GPUTracking/DataCompression/GPUTPCDecompression.h

Lines changed: 9 additions & 0 deletions
@@ -55,6 +55,9 @@ class GPUTPCDecompression : public GPUProcessor
   void* SetPointersTmpNativeBuffersGPU(void* mem);
   void* SetPointersTmpNativeBuffersOutput(void* mem);
   void* SetPointersTmpNativeBuffersInput(void* mem);
+  void* SetPointersTmpClusterNativeAccessForFiltering(void* mem);
+  void* SetPointersInputClusterNativeAccess(void* mem);
+  void* SetPointersNClusterPerSectorRow(void* mem);
 
 #endif
 
@@ -63,18 +66,24 @@ class GPUTPCDecompression : public GPUProcessor
   o2::tpc::CompressedClusters mInputGPU;
 
   uint32_t mMaxNativeClustersPerBuffer;
+  uint32_t mNClusterNativeBeforeFiltering;
   uint32_t* mNativeClustersIndex;
   uint32_t* mUnattachedClustersOffsets;
   uint32_t* mAttachedClustersOffsets;
+  uint32_t* mNClusterPerSectorRow;
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNative* mNativeClustersBuffer;
+  o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
 
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
 
   int16_t mMemoryResInputGPU = -1;
   int16_t mResourceTmpIndexes = -1;
   int16_t mResourceTmpClustersOffsets = -1;
+  int16_t mResourceTmpBufferBeforeFiltering = -1;
+  int16_t mResourceClusterNativeAccess = -1;
+  int16_t mResourceNClusterPerSectorRow = -1;
 };
 } // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H

GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx

Lines changed: 46 additions & 1 deletion
@@ -43,7 +43,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
-  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+  const ClusterNativeAccess* outputAccess = decompressor.mClusterNativeAccess;
   uint32_t* offsets = decompressor.mUnattachedClustersOffsets;
   for (int32_t i = get_global_id(0); i < GPUCA_ROW_COUNT * nSlices; i += get_global_size(0)) {
     uint32_t iRow = i % GPUCA_ROW_COUNT;
@@ -81,6 +81,51 @@ GPUdi() void GPUTPCDecompressionKernels::decompressorMemcpyBasic(T* GPUrestrict(
   }
 }
 
+GPUdi() bool GPUTPCDecompressionUtilKernels::isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param)
+{
+  return param.tpcCutTimeBin > 0 ? cl.getTime() < param.tpcCutTimeBin : true;
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::countFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        decompressor.mNClusterPerSectorRow[i]++;
+      }
+    }
+  }
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::storeFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    uint32_t count = 0;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      const ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        clusterBuffer[outputAccess->clusterOffset[slice][row] + count] = cl;
+        count++;
+      }
+    }
+  }
+}
+
 template <>
 GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::sortPerSectorRow>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
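
The two new kernels implement the timebin cut as a two-pass filter over the decompressed clusters: countFilteredClusters tallies, per sector/row, how many clusters pass isClusterKept (time below param.tpcCutTimeBin, or everything if the cut is disabled), and storeFilteredClusters then writes the survivors contiguously at the output offsets derived from those counts. A host-side sketch of the same count-then-compact scheme, reduced to one flat array (Cluster and cutTimeBin are illustrative stand-ins for o2::tpc::ClusterNative and param.tpcCutTimeBin, not the real types):

#include <cstddef>
#include <cstdint>
#include <vector>

struct Cluster {
  float time; // stand-in for ClusterNative::getTime()
};

static bool isKept(const Cluster& cl, int32_t cutTimeBin)
{
  return cutTimeBin > 0 ? cl.time < cutTimeBin : true; // cut disabled when cutTimeBin <= 0
}

std::vector<Cluster> filterClusters(const std::vector<Cluster>& in, int32_t cutTimeBin)
{
  // Pass 1: count survivors so the output buffer can be sized exactly.
  size_t nKept = 0;
  for (const auto& cl : in) {
    nKept += isKept(cl, cutTimeBin) ? 1 : 0;
  }
  // Pass 2: copy survivors contiguously, preserving their original order.
  std::vector<Cluster> out;
  out.reserve(nKept);
  for (const auto& cl : in) {
    if (isKept(cl, cutTimeBin)) {
      out.push_back(cl);
    }
  }
  return out;
}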

GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h

Lines changed: 5 additions & 1 deletion
@@ -59,11 +59,15 @@ class GPUTPCDecompressionUtilKernels : public GPUKernelTemplate
 {
  public:
   enum K : int32_t {
-    sortPerSectorRow = 0,
+    countFilteredClusters = 0,
+    storeFilteredClusters = 1,
+    sortPerSectorRow = 2,
   };
 
   template <int32_t iKernel = defaultKernel>
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
+
+  GPUdi() static bool isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param);
 };
 
 } // namespace GPUCA_NAMESPACE::gpu
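
The enum above is the kernel selector for the GPUKernelTemplate-style dispatch: each enumerator names one Thread<> specialization, so the two new filtering kernels are added as new enumerators plus matching specializations in GPUTPCDecompressionKernels.cxx. A stripped-down sketch of that pattern, with illustrative names and plain host functions instead of the real GPU framework macros and launch machinery:

#include <cstdint>
#include <cstdio>

struct UtilKernelsSketch {
  enum K : int32_t {
    countFilteredClusters = 0,
    storeFilteredClusters = 1,
    sortPerSectorRow = 2,
  };
  template <int32_t iKernel>
  static void Thread(); // one specialization per kernel variant
};

template <>
void UtilKernelsSketch::Thread<UtilKernelsSketch::countFilteredClusters>() { std::puts("count"); }
template <>
void UtilKernelsSketch::Thread<UtilKernelsSketch::storeFilteredClusters>() { std::puts("store"); }
template <>
void UtilKernelsSketch::Thread<UtilKernelsSketch::sortPerSectorRow>() { std::puts("sort"); }

int main()
{
  UtilKernelsSketch::Thread<UtilKernelsSketch::countFilteredClusters>(); // compile-time kernel selection
  return 0;
}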

GPU/GPUTracking/Definitions/GPUDefGPUParameters.h

Lines changed: 6 additions & 0 deletions
@@ -344,6 +344,12 @@
 #endif
 #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow
 #define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 256
+#endif
+#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters
+#define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 256
+#endif
+#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters
+#define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 256
 #endif
 #ifndef GPUCA_LB_GPUTPCCFDecodeZS
 #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4

GPU/GPUTracking/Global/GPUChainTracking.cxx

Lines changed: 0 additions & 4 deletions
@@ -318,10 +318,6 @@ bool GPUChainTracking::ValidateSettings()
       return false;
     }
   }
-  if ((GetRecoSteps() & GPUDataTypes::RecoStep::TPCDecompression) && GetProcessingSettings().tpcApplyCFCutsAtDecoding && !GetProcessingSettings().tpcUseOldCPUDecoding) {
-    GPUError("tpcApplyCFCutsAtDecoding currently requires tpcUseOldCPUDecoding");
-    return false;
-  }
   if ((GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && !(GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression) && (GetProcessingSettings().tpcCompressionGatherMode == 1 || GetProcessingSettings().tpcCompressionGatherMode == 3)) {
     GPUError("Invalid tpcCompressionGatherMode for compression on CPU");
     return false;

GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx

Lines changed: 67 additions & 15 deletions
@@ -246,6 +246,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
   mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
+  bool runFiltering = GetProcessingSettings().tpcApplyCFCutsAtDecoding;
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
@@ -300,12 +301,6 @@
     GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
     GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);
 
-    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
-    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
-    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
-    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
     TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
     SynchronizeStream(inputStream);
     uint32_t offset = 0;
@@ -324,27 +319,83 @@
     if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
       GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
     }
-    if (doGPU) {
-      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+    if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
+      Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
+      AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
+      AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
+      mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
+    } else { // If not filtering, directly allocate the final buffers
+      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+      DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+      Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+      DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
+      Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+      if (doGPU) {
+        mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+        mClusterNativeAccess->setOffsetPtrs();
+        *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+        processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+        WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
+        TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
+      }
+      mIOPtrs.clustersNative = mClusterNativeAccess.get();
+      mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
       mClusterNativeAccess->setOffsetPtrs();
       *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
-      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
     }
-    mIOPtrs.clustersNative = mClusterNativeAccess.get();
-    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
-    mClusterNativeAccess->setOffsetPtrs();
 
     uint32_t batchSize = doGPU ? 6 : NSLICES;
     for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
       int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
       runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
       uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
-      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+      if (!runFiltering) {
+        GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+      }
     }
     SynchronizeGPU();
 
+    if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
+      AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+      runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
+      runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+      TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
+      SynchronizeStream(unattachedStream);
+      uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
+      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+      DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+      Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+      WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+      for (uint32_t i = 0; i < NSLICES; i++) {
+        for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+          mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
+        }
+      }
+      if (doGPU) {
+        mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+        mClusterNativeAccess->setOffsetPtrs();
+        *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+        processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+        WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
+        TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
+      }
+      mIOPtrs.clustersNative = mClusterNativeAccess.get();
+      mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+      mClusterNativeAccess->setOffsetPtrs();
+      runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
+      SynchronizeStream(unattachedStream);
+    }
     if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
       runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
       const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
@@ -357,6 +408,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
         }
       }
     }
+    SynchronizeStream(unattachedStream);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
 }
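
With filtering enabled, the chain therefore runs in two stages: after decoding into the temporary buffer it launches countFilteredClusters, copies the per-sector/row counts back to the host, sums them to get the final cluster count, allocates the output buffers with exactly that size, fills mClusterNativeAccess->nClusters[][] from the counts, and only then runs storeFilteredClusters and copies the compacted buffer out. A small host-side sketch of how the counts turn into a total and into contiguous write offsets (kNSectors and kNRows are illustrative constants; in O2 the offsets are rebuilt by ClusterNativeAccess::setOffsetPtrs() once nClusters[][] is filled):

#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

int main()
{
  constexpr uint32_t kNSectors = 36; // stand-in for NSLICES
  constexpr uint32_t kNRows = 152;   // stand-in for GPUCA_ROW_COUNT
  std::vector<uint32_t> nClusterPerSectorRow(kNSectors * kNRows, 0); // filled by the count kernel

  // Total number of clusters surviving the timebin cut -> size of the final output buffer.
  uint32_t nClustersFinal = std::accumulate(nClusterPerSectorRow.begin(), nClusterPerSectorRow.end(), 0u);

  // Exclusive prefix sum over the counts gives each (sector, row) its write offset
  // in the compacted cluster buffer, which is where storeFilteredClusters writes.
  std::vector<uint32_t> clusterOffset(kNSectors * kNRows, 0);
  uint32_t running = 0;
  for (size_t i = 0; i < nClusterPerSectorRow.size(); i++) {
    clusterOffset[i] = running;
    running += nClusterPerSectorRow[i];
  }
  return nClustersFinal == running ? 0 : 1; // sanity check: the two totals agree
}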
