
Commit dc760aa

GPU TPC: Decoding: Add option to apply timebin cut to CTF cluster decoding on GPUs (#13753)
* GPU: TPC Decoding: add optional timebin cut to CTF cluster decoding
* GPU: TPC Decoding: add missing checks on track model parameters
1 parent 167b8c0 commit dc760aa

7 files changed: +163 −17 lines changed


GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx

Lines changed: 21 additions & 0 deletions
@@ -84,13 +84,34 @@ void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
   return mem;
 }
 
+void* GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering(void* mem)
+{
+  computePointerWithAlignment(mem, mNativeClustersBuffer, mNClusterNativeBeforeFiltering);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersInputClusterNativeAccess(void* mem)
+{
+  computePointerWithAlignment(mem, mClusterNativeAccess);
+  return mem;
+}
+
+void* GPUTPCDecompression::SetPointersNClusterPerSectorRow(void* mem)
+{
+  computePointerWithAlignment(mem, mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT);
+  return mem;
+}
+
 void GPUTPCDecompression::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
   mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput");
   mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
   mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
   mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
+  mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
+  mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
+  mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
 }
 
 void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
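Note on the pattern used by the new SetPointers* callbacks above: each one carves typed, aligned sub-buffers out of a raw block and returns the advanced end pointer, so the same callback can be used both to determine how much memory a registered resource needs and to assign the final member pointers, which is roughly how the surrounding framework appears to use them. The following is a standalone sketch of that carving pattern only; the carve() helper and DemoProcessor are illustrative stand-ins, not the framework's computePointerWithAlignment or processor classes.

#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for a computePointerWithAlignment-style helper: place 'n' objects of
// type T at the next suitably aligned address inside the raw block and advance 'mem'.
template <class T>
static void carve(void*& mem, T*& ptr, size_t n = 1, size_t alignment = 64)
{
  uintptr_t p = reinterpret_cast<uintptr_t>(mem);
  p = (p + alignment - 1) / alignment * alignment; // round up to the alignment boundary
  ptr = reinterpret_cast<T*>(p);
  mem = reinterpret_cast<void*>(p + n * sizeof(T));
}

struct DemoProcessor {
  float* buf = nullptr;
  uint32_t* counts = nullptr;
  // Mirrors the shape of a SetPointers* callback: carve all members, return the end pointer.
  void* setPointers(void* mem, uint32_t nClusters, uint32_t nRows)
  {
    carve(mem, buf, nClusters);
    carve(mem, counts, nRows);
    return mem;
  }
};

int main()
{
  DemoProcessor proc;
  // Pass 1: call with a null base to measure the size this resource needs.
  uintptr_t size = reinterpret_cast<uintptr_t>(proc.setPointers(nullptr, 1000, 152));
  std::vector<char> block(size + 128); // real allocation, with slack for alignment
  // Pass 2: call again with the real allocation to assign the member pointers.
  proc.setPointers(block.data(), 1000, 152);
  return 0;
}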

GPU/GPUTracking/DataCompression/GPUTPCDecompression.h

Lines changed: 9 additions & 0 deletions
@@ -55,6 +55,9 @@ class GPUTPCDecompression : public GPUProcessor
   void* SetPointersTmpNativeBuffersGPU(void* mem);
   void* SetPointersTmpNativeBuffersOutput(void* mem);
   void* SetPointersTmpNativeBuffersInput(void* mem);
+  void* SetPointersTmpClusterNativeAccessForFiltering(void* mem);
+  void* SetPointersInputClusterNativeAccess(void* mem);
+  void* SetPointersNClusterPerSectorRow(void* mem);
 
 #endif
 
@@ -63,18 +66,24 @@ class GPUTPCDecompression : public GPUProcessor
   o2::tpc::CompressedClusters mInputGPU;
 
   uint32_t mMaxNativeClustersPerBuffer;
+  uint32_t mNClusterNativeBeforeFiltering;
   uint32_t* mNativeClustersIndex;
   uint32_t* mUnattachedClustersOffsets;
   uint32_t* mAttachedClustersOffsets;
+  uint32_t* mNClusterPerSectorRow;
   o2::tpc::ClusterNative* mTmpNativeClusters;
   o2::tpc::ClusterNative* mNativeClustersBuffer;
+  o2::tpc::ClusterNativeAccess* mClusterNativeAccess;
 
   template <class T>
   void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);
 
   int16_t mMemoryResInputGPU = -1;
   int16_t mResourceTmpIndexes = -1;
   int16_t mResourceTmpClustersOffsets = -1;
+  int16_t mResourceTmpBufferBeforeFiltering = -1;
+  int16_t mResourceClusterNativeAccess = -1;
+  int16_t mResourceNClusterPerSectorRow = -1;
 };
 } // namespace GPUCA_NAMESPACE::gpu
 #endif // GPUTPCDECOMPRESSION_H

GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx

Lines changed: 46 additions & 1 deletion
@@ -43,7 +43,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
   GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
   CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
   ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
-  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+  const ClusterNativeAccess* outputAccess = decompressor.mClusterNativeAccess;
   uint32_t* offsets = decompressor.mUnattachedClustersOffsets;
   for (int32_t i = get_global_id(0); i < GPUCA_ROW_COUNT * nSlices; i += get_global_size(0)) {
     uint32_t iRow = i % GPUCA_ROW_COUNT;
@@ -81,6 +81,51 @@ GPUdi() void GPUTPCDecompressionKernels::decompressorMemcpyBasic(T* GPUrestrict(
   }
 }
 
+GPUdi() bool GPUTPCDecompressionUtilKernels::isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param)
+{
+  return param.tpcCutTimeBin > 0 ? cl.getTime() < param.tpcCutTimeBin : true;
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::countFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        decompressor.mNClusterPerSectorRow[i]++;
+      }
+    }
+  }
+}
+
+template <>
+GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::storeFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
+{
+  const GPUParam& GPUrestrict() param = processors.param;
+  GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
+  ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
+  const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
+  const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
+  for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
+    uint32_t slice = i / GPUCA_ROW_COUNT;
+    uint32_t row = i % GPUCA_ROW_COUNT;
+    uint32_t count = 0;
+    for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
+      const ClusterNative cl = clusterAccess->clusters[slice][row][k];
+      if (isClusterKept(cl, param)) {
+        clusterBuffer[outputAccess->clusterOffset[slice][row] + count] = cl;
+        count++;
+      }
+    }
+  }
+}
+
 template <>
 GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::sortPerSectorRow>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
 {
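The two new kernels apply the cut in two passes so the final buffer can be sized exactly before any cluster is written: countFilteredClusters accumulates, per sector and row, how many clusters pass the timebin cut, and storeFilteredClusters later writes the surviving clusters compactly at the offsets derived from those counts. A minimal host-side analogue of that count-then-store idea, with a simplified Cluster type standing in for o2::tpc::ClusterNative and without the per-sector-row offsets of the real kernels, could look like this:

#include <cstdint>
#include <vector>

// Simplified stand-in for o2::tpc::ClusterNative in this sketch.
struct Cluster {
  uint32_t time;
};

// Same shape as the new isClusterKept(): a non-positive cut keeps every cluster.
static bool isKept(const Cluster& cl, int32_t cutTimeBin)
{
  return cutTimeBin > 0 ? cl.time < static_cast<uint32_t>(cutTimeBin) : true;
}

// Pass 1: count survivors so the output can be allocated with its exact final size.
static uint32_t countKept(const std::vector<Cluster>& in, int32_t cut)
{
  uint32_t n = 0;
  for (const Cluster& cl : in) {
    n += isKept(cl, cut);
  }
  return n;
}

// Pass 2: write the survivors compactly into the exactly-sized output.
static std::vector<Cluster> storeKept(const std::vector<Cluster>& in, int32_t cut)
{
  std::vector<Cluster> out;
  out.reserve(countKept(in, cut));
  for (const Cluster& cl : in) {
    if (isKept(cl, cut)) {
      out.push_back(cl);
    }
  }
  return out;
}

In the actual kernels both passes run in parallel over all (sector, row) bins, and the store pass writes each bin starting at its precomputed clusterOffset, as shown in the chain code further below.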

GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.h

Lines changed: 5 additions & 1 deletion
@@ -59,11 +59,15 @@ class GPUTPCDecompressionUtilKernels : public GPUKernelTemplate
 {
  public:
   enum K : int32_t {
-    sortPerSectorRow = 0,
+    countFilteredClusters = 0,
+    storeFilteredClusters = 1,
+    sortPerSectorRow = 2,
   };
 
   template <int32_t iKernel = defaultKernel>
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);
+
+  GPUdi() static bool isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param);
 };
 
 } // namespace GPUCA_NAMESPACE::gpu
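The kernel IDs in enum K are the values passed as the iKernel template argument of Thread, e.g. runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(...) in the chain code further down, so inserting the two filtering kernels in front of sortPerSectorRow renumbers it without touching any call site. A reduced, standalone sketch of that enum-plus-template-specialization dispatch pattern (with made-up kernel bodies, not the O2 kernel framework) is:

#include <cstdint>
#include <cstdio>

struct UtilKernels {
  enum K : int32_t { countFilteredClusters = 0,
                     storeFilteredClusters = 1,
                     sortPerSectorRow = 2 };
  // Each kernel variant is a full specialization of this member function template.
  template <int32_t iKernel>
  static void Thread();
};

template <>
void UtilKernels::Thread<UtilKernels::countFilteredClusters>() { std::printf("count\n"); }
template <>
void UtilKernels::Thread<UtilKernels::storeFilteredClusters>() { std::printf("store\n"); }
template <>
void UtilKernels::Thread<UtilKernels::sortPerSectorRow>() { std::printf("sort\n"); }

int main()
{
  // Call sites select a kernel by its enum value; renumbering the enum does not change them.
  UtilKernels::Thread<UtilKernels::countFilteredClusters>();
  UtilKernels::Thread<UtilKernels::sortPerSectorRow>();
  return 0;
}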

GPU/GPUTracking/Definitions/GPUDefGPUParameters.h

Lines changed: 6 additions & 0 deletions
@@ -344,6 +344,12 @@
 #endif
 #ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow
 #define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 256
+#endif
+#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters
+#define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 256
+#endif
+#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters
+#define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 256
 #endif
 #ifndef GPUCA_LB_GPUTPCCFDecodeZS
 #define GPUCA_LB_GPUTPCCFDecodeZS 128, 4

GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx

Lines changed: 74 additions & 15 deletions
@@ -246,13 +246,21 @@ int32_t GPUChainTracking::RunTPCDecompression()
   mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
   RecoStep myStep = RecoStep::TPCDecompression;
   bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
+  bool runFiltering = param().tpcCutTimeBin > 0;
   GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
   GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
   const auto& threadContext = GetThreadContext();
   CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters;
   CompressedClusters& inputGPU = Decompressor.mInputGPU;
   CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;
 
+  if (cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG) {
+    throw std::runtime_error("Configured solenoid Bz does not match value used for track model encoding");
+  }
+  if (cmprClsHost.nTracks && cmprClsHost.maxTimeBin != -1e6 && cmprClsHost.maxTimeBin != param().continuousMaxTimeBin) {
+    throw std::runtime_error("Configured max time bin does not match value used for track model encoding");
+  }
+
   int32_t inputStream = 0;
   int32_t unattachedStream = mRec->NStreams() - 1;
   inputGPU = cmprClsHost;
@@ -300,12 +308,6 @@ int32_t GPUChainTracking::RunTPCDecompression()
   GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
   GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);
 
-  mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
-  AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-  DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
-  Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
-  WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
   TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
   SynchronizeStream(inputStream);
   uint32_t offset = 0;
@@ -324,27 +326,83 @@ int32_t GPUChainTracking::RunTPCDecompression()
   if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
     GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
   }
-  if (doGPU) {
-    mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+  if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
+    Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
+    AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
+    mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
+    mClusterNativeAccess->setOffsetPtrs();
+    *Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
+  } else { // If not filtering, directly allocate the final buffers
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
+    Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
+    }
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
     mClusterNativeAccess->setOffsetPtrs();
     *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
-    processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-    WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
-    TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
   }
-  mIOPtrs.clustersNative = mClusterNativeAccess.get();
-  mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
-  mClusterNativeAccess->setOffsetPtrs();
 
   uint32_t batchSize = doGPU ? 6 : NSLICES;
   for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
     int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
     runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
     uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
-    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+    if (!runFiltering) {
+      GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
+    }
   }
   SynchronizeGPU();
 
+  if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
+    AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+    runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
+    SynchronizeStream(unattachedStream);
+    uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
+    Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
+    WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
+    for (uint32_t i = 0; i < NSLICES; i++) {
+      for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+        mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
+      }
+    }
+    if (doGPU) {
+      mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+      mClusterNativeAccess->setOffsetPtrs();
+      *mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
+      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+      WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
+      TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
+    }
+    mIOPtrs.clustersNative = mClusterNativeAccess.get();
+    mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
+    mClusterNativeAccess->setOffsetPtrs();
+    runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
+    GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
+    SynchronizeStream(unattachedStream);
+  }
   if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
     runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
     const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
@@ -357,6 +415,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
         }
       }
     }
+    SynchronizeStream(unattachedStream);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
 }
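To summarize the filtering path in the chain code above: countFilteredClusters fills mNClusterPerSectorRow on the GPU, the counts are copied to the host and accumulated into nClustersFinal, the final buffers are allocated with exactly that size, mClusterNativeAccess->nClusters[i][j] is filled from the counts and setOffsetPtrs() derives the per-row write offsets, and only then does storeFilteredClusters write each surviving cluster at clusterOffset[slice][row]. The following is a much reduced, standalone mock of that offset bookkeeping, not the real o2::tpc::ClusterNativeAccess; kSectors and kRows are shrunk from NSLICES (36) and GPUCA_ROW_COUNT (152) for brevity.

#include <array>
#include <cstdint>

constexpr uint32_t kSectors = 4; // reduced from NSLICES = 36 for the sketch
constexpr uint32_t kRows = 3;    // reduced from GPUCA_ROW_COUNT = 152

// Minimal stand-in for the offset part of o2::tpc::ClusterNativeAccess.
struct MockAccess {
  std::array<std::array<uint32_t, kRows>, kSectors> nClusters{};
  std::array<std::array<uint32_t, kRows>, kSectors> clusterOffset{};
  uint32_t nClustersTotal = 0;

  // Analogue of setOffsetPtrs(): exclusive prefix sum over all (sector, row) bins.
  void setOffsets()
  {
    uint32_t offset = 0;
    for (uint32_t i = 0; i < kSectors; i++) {
      for (uint32_t j = 0; j < kRows; j++) {
        clusterOffset[i][j] = offset;
        offset += nClusters[i][j];
      }
    }
    nClustersTotal = offset;
  }
};

int main()
{
  // Pretend these counts came back from the countFilteredClusters kernel.
  uint32_t countsPerSectorRow[kSectors * kRows] = {3, 0, 5, 2, 2, 1, 0, 4, 1, 7, 0, 2};

  MockAccess access;
  for (uint32_t i = 0; i < kSectors; i++) {
    for (uint32_t j = 0; j < kRows; j++) {
      access.nClusters[i][j] = countsPerSectorRow[i * kRows + j];
    }
  }
  access.setOffsets();
  // access.nClustersTotal is the exact size of the final buffer to allocate;
  // a store pass would then write bin (i, j) starting at access.clusterOffset[i][j].
  return 0;
}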
