21 changes: 21 additions & 0 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompression.cxx
@@ -84,13 +84,34 @@ void* GPUTPCDecompression::SetPointersTmpNativeBuffersInput(void* mem)
return mem;
}

void* GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering(void* mem)
{
computePointerWithAlignment(mem, mNativeClustersBuffer, mNClusterNativeBeforeFiltering);
return mem;
}

void* GPUTPCDecompression::SetPointersInputClusterNativeAccess(void* mem)
{
computePointerWithAlignment(mem, mClusterNativeAccess);
return mem;
}

void* GPUTPCDecompression::SetPointersNClusterPerSectorRow(void* mem)
{
computePointerWithAlignment(mem, mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT);
return mem;
}

void GPUTPCDecompression::RegisterMemoryAllocation()
{
AllocateAndInitializeLate();
mMemoryResInputGPU = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputGPU, GPUMemoryResource::MEMORY_INPUT_FLAG | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_EXTERNAL | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionInput");
mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersGPU, GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersGPU");
mResourceTmpIndexes = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersOutput, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersOutput");
mResourceTmpClustersOffsets = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpNativeBuffersInput, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBuffersInput");
mResourceTmpBufferBeforeFiltering = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersTmpClusterNativeAccessForFiltering, GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpBufferForFiltering");
mResourceClusterNativeAccess = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersInputClusterNativeAccess, GPUMemoryResource::MEMORY_INPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterAccessForFiltering");
mResourceNClusterPerSectorRow = mRec->RegisterMemoryAllocation(this, &GPUTPCDecompression::SetPointersNClusterPerSectorRow, GPUMemoryResource::MEMORY_OUTPUT | GPUMemoryResource::MEMORY_CUSTOM | GPUMemoryResource::MEMORY_SCRATCH, "TPCDecompressionTmpClusterCountForFiltering");
}

void GPUTPCDecompression::SetMaxData(const GPUTrackingInOutPointers& io)
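The three new setters follow the existing GPUProcessor idiom: each takes the raw scratch pointer, carves out an aligned, typed array with computePointerWithAlignment, and returns the advanced pointer, so the framework can invoke the same function once to measure a resource and once to assign the member pointers. For illustration, a minimal sketch of such a helper, assuming a simple 64-byte alignment policy (this is not the O2 implementation):

#include <cstddef>
#include <cstdint>

// Sketch only: advance 'mem' to the next aligned boundary, point 'ptr' there,
// then move 'mem' past 'count' elements of T. Illustrates the measure/assign
// idiom; the alignment policy is an assumption, not the O2 helper.
template <typename T>
void computePointerWithAlignmentSketch(void*& mem, T*& ptr, size_t count = 1)
{
  constexpr uintptr_t alignment = alignof(T) > 64 ? alignof(T) : 64;
  uintptr_t p = reinterpret_cast<uintptr_t>(mem);
  p = (p + alignment - 1) / alignment * alignment;
  ptr = reinterpret_cast<T*>(p);
  mem = reinterpret_cast<void*>(p + count * sizeof(T));
}

With this pattern, RegisterMemoryAllocation only records the setter and its MEMORY_* flags; the actual allocation of mResourceTmpBufferBeforeFiltering, mResourceClusterNativeAccess and mResourceNClusterPerSectorRow is deferred until the chain knows the relevant sizes.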
9 changes: 9 additions & 0 deletions GPU/GPUTracking/DataCompression/GPUTPCDecompression.h
@@ -55,6 +55,9 @@ class GPUTPCDecompression : public GPUProcessor
void* SetPointersTmpNativeBuffersGPU(void* mem);
void* SetPointersTmpNativeBuffersOutput(void* mem);
void* SetPointersTmpNativeBuffersInput(void* mem);
void* SetPointersTmpClusterNativeAccessForFiltering(void* mem);
void* SetPointersInputClusterNativeAccess(void* mem);
void* SetPointersNClusterPerSectorRow(void* mem);

#endif

@@ -63,18 +66,24 @@ class GPUTPCDecompression : public GPUProcessor
o2::tpc::CompressedClusters mInputGPU;

uint32_t mMaxNativeClustersPerBuffer;
uint32_t mNClusterNativeBeforeFiltering;
uint32_t* mNativeClustersIndex;
uint32_t* mUnattachedClustersOffsets;
uint32_t* mAttachedClustersOffsets;
uint32_t* mNClusterPerSectorRow;
o2::tpc::ClusterNative* mTmpNativeClusters;
o2::tpc::ClusterNative* mNativeClustersBuffer;
o2::tpc::ClusterNativeAccess* mClusterNativeAccess;

template <class T>
void SetPointersCompressedClusters(void*& mem, T& c, uint32_t nClA, uint32_t nTr, uint32_t nClU, bool reducedClA);

int16_t mMemoryResInputGPU = -1;
int16_t mResourceTmpIndexes = -1;
int16_t mResourceTmpClustersOffsets = -1;
int16_t mResourceTmpBufferBeforeFiltering = -1;
int16_t mResourceClusterNativeAccess = -1;
int16_t mResourceNClusterPerSectorRow = -1;
};
} // namespace GPUCA_NAMESPACE::gpu
#endif // GPUTPCDECOMPRESSION_H
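mNClusterPerSectorRow is a flat array with one counter per (sector, row) pair, indexed sector-major, which is how the new kernels address it. A tiny illustration of that layout, with stand-in constants (the real values come from the O2 headers):

#include <cstdint>

// Sketch only: sector-major flat index into mNClusterPerSectorRow.
constexpr uint32_t kSectors = 36;  // stand-in for NSLICES
constexpr uint32_t kRows = 152;    // stand-in for GPUCA_ROW_COUNT

inline uint32_t sectorRowIndex(uint32_t sector, uint32_t row)
{
  return sector * kRows + row; // same indexing as i = slice * GPUCA_ROW_COUNT + row
}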
47 changes: 46 additions & 1 deletion GPU/GPUTracking/DataCompression/GPUTPCDecompressionKernels.cxx
@@ -43,7 +43,7 @@ GPUdii() void GPUTPCDecompressionKernels::Thread<GPUTPCDecompressionKernels::ste
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
CompressedClusters& GPUrestrict() cmprClusters = decompressor.mInputGPU;
ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
const ClusterNativeAccess* outputAccess = decompressor.mClusterNativeAccess;
uint32_t* offsets = decompressor.mUnattachedClustersOffsets;
for (int32_t i = get_global_id(0); i < GPUCA_ROW_COUNT * nSlices; i += get_global_size(0)) {
uint32_t iRow = i % GPUCA_ROW_COUNT;
@@ -81,6 +81,51 @@ GPUdi() void GPUTPCDecompressionKernels::decompressorMemcpyBasic(T* GPUrestrict(
}
}

GPUdi() bool GPUTPCDecompressionUtilKernels::isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param)
{
return param.tpcCutTimeBin > 0 ? cl.getTime() < param.tpcCutTimeBin : true;
}

template <>
GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::countFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
const GPUParam& GPUrestrict() param = processors.param;
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
uint32_t slice = i / GPUCA_ROW_COUNT;
uint32_t row = i % GPUCA_ROW_COUNT;
for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
ClusterNative cl = clusterAccess->clusters[slice][row][k];
if (isClusterKept(cl, param)) {
decompressor.mNClusterPerSectorRow[i]++;
}
}
}
}

template <>
GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::storeFilteredClusters>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
const GPUParam& GPUrestrict() param = processors.param;
GPUTPCDecompression& GPUrestrict() decompressor = processors.tpcDecompressor;
ClusterNative* GPUrestrict() clusterBuffer = decompressor.mNativeClustersBuffer;
const ClusterNativeAccess* clusterAccess = decompressor.mClusterNativeAccess;
const ClusterNativeAccess* outputAccess = processors.ioPtrs.clustersNative;
for (uint32_t i = get_global_id(0); i < GPUCA_NSLICES * GPUCA_ROW_COUNT; i += get_global_size(0)) {
uint32_t slice = i / GPUCA_ROW_COUNT;
uint32_t row = i % GPUCA_ROW_COUNT;
uint32_t count = 0;
for (uint32_t k = 0; k < clusterAccess->nClusters[slice][row]; k++) {
const ClusterNative cl = clusterAccess->clusters[slice][row][k];
if (isClusterKept(cl, param)) {
clusterBuffer[outputAccess->clusterOffset[slice][row] + count] = cl;
count++;
}
}
}
}

template <>
GPUdii() void GPUTPCDecompressionUtilKernels::Thread<GPUTPCDecompressionUtilKernels::sortPerSectorRow>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& processors)
{
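Together, the two new kernels implement the time-bin filter as two passes over the decoded clusters: countFilteredClusters increments one counter per (sector, row) for every cluster passing isClusterKept — each work item owns an entire bin, so no atomics are needed — and, after the host has rebuilt the output offsets from those counts, storeFilteredClusters writes the survivors contiguously starting at outputAccess->clusterOffset[slice][row]. A single-threaded CPU sketch of the same two-pass idea, using a simplified stand-in type (illustrative names, not the O2 API):

#include <cstdint>
#include <vector>

struct ClusterSketch {
  float time; // stand-in for ClusterNative::getTime()
};

// Sketch only: pass 1 counts survivors per bin, an exclusive prefix sum gives
// each bin its output offset, and pass 2 copies survivors compactly.
// A non-positive cutTimeBin disables the cut, mirroring isClusterKept above.
std::vector<ClusterSketch> filterClustersSketch(const std::vector<std::vector<ClusterSketch>>& perBin, int32_t cutTimeBin)
{
  auto kept = [&](const ClusterSketch& cl) { return cutTimeBin > 0 ? cl.time < cutTimeBin : true; };

  std::vector<uint32_t> counts(perBin.size(), 0); // pass 1: count per (sector, row) bin
  for (size_t bin = 0; bin < perBin.size(); bin++) {
    for (const auto& cl : perBin[bin]) {
      if (kept(cl)) {
        counts[bin]++;
      }
    }
  }

  std::vector<uint32_t> offsets(perBin.size(), 0); // exclusive prefix sum of the counts
  uint32_t total = 0;
  for (size_t bin = 0; bin < perBin.size(); bin++) {
    offsets[bin] = total;
    total += counts[bin];
  }

  std::vector<ClusterSketch> out(total); // pass 2: store survivors at their bin offset
  for (size_t bin = 0; bin < perBin.size(); bin++) {
    uint32_t n = 0;
    for (const auto& cl : perBin[bin]) {
      if (kept(cl)) {
        out[offsets[bin] + n++] = cl;
      }
    }
  }
  return out;
}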
@@ -59,11 +59,15 @@ class GPUTPCDecompressionUtilKernels : public GPUKernelTemplate
{
public:
enum K : int32_t {
sortPerSectorRow = 0,
countFilteredClusters = 0,
storeFilteredClusters = 1,
sortPerSectorRow = 2,
};

template <int32_t iKernel = defaultKernel>
GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors);

GPUdi() static bool isClusterKept(const o2::tpc::ClusterNative& cl, const GPUParam& GPUrestrict() param);
};

} // namespace GPUCA_NAMESPACE::gpu
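Each value of K selects one Thread specialization in the .cxx above, so adding the two filtering kernels amounts to two new enum values here and two template <> bodies there. A minimal standalone sketch of that dispatch idiom in plain C++ (not the actual GPUKernelTemplate machinery):

#include <cstdint>
#include <cstdio>

// Sketch only: compile-time kernel selection via an integer template parameter,
// mirroring how a K enum value picks one Thread specialization.
struct UtilKernelsSketch {
  enum K : int32_t { countFilteredClusters = 0,
                     storeFilteredClusters = 1,
                     sortPerSectorRow = 2 };

  template <int32_t iKernel>
  static void Thread(); // one explicit specialization per kernel variant
};

template <>
void UtilKernelsSketch::Thread<UtilKernelsSketch::countFilteredClusters>() { std::printf("count pass\n"); }

template <>
void UtilKernelsSketch::Thread<UtilKernelsSketch::storeFilteredClusters>() { std::printf("store pass\n"); }

int main()
{
  UtilKernelsSketch::Thread<UtilKernelsSketch::countFilteredClusters>(); // selected at compile time
  UtilKernelsSketch::Thread<UtilKernelsSketch::storeFilteredClusters>();
}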
6 changes: 6 additions & 0 deletions GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -344,6 +344,12 @@
#endif
#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow
#define GPUCA_LB_GPUTPCDecompressionUtilKernels_sortPerSectorRow 256
#endif
#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters
#define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 256
#endif
#ifndef GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters
#define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 256
#endif
#ifndef GPUCA_LB_GPUTPCCFDecodeZS
#define GPUCA_LB_GPUTPCCFDecodeZS 128, 4
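The #ifndef guards make the 256-thread defaults for the two new kernels overridable by the build. A hypothetical override, assuming a build that defines the macros before this header is included (the macro names are the ones added above; the override mechanism itself is an assumption):

// Hypothetical build-time tuning, e.g. defined before GPUDefGPUParameters.h is seen;
// the #ifndef guards then keep these values instead of the 256-thread defaults.
#define GPUCA_LB_GPUTPCDecompressionUtilKernels_countFilteredClusters 128
#define GPUCA_LB_GPUTPCDecompressionUtilKernels_storeFilteredClusters 128
#include "GPUDefGPUParameters.h"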
89 changes: 74 additions & 15 deletions GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -246,13 +246,21 @@ int32_t GPUChainTracking::RunTPCDecompression()
mRec->PushNonPersistentMemory(qStr2Tag("TPCDCMPR"));
RecoStep myStep = RecoStep::TPCDecompression;
bool doGPU = GetRecoStepsGPU() & RecoStep::TPCDecompression;
bool runFiltering = param().tpcCutTimeBin > 0;
GPUTPCDecompression& Decompressor = processors()->tpcDecompressor;
GPUTPCDecompression& DecompressorShadow = doGPU ? processorsShadow()->tpcDecompressor : Decompressor;
const auto& threadContext = GetThreadContext();
CompressedClusters cmprClsHost = *mIOPtrs.tpcCompressedClusters;
CompressedClusters& inputGPU = Decompressor.mInputGPU;
CompressedClusters& inputGPUShadow = DecompressorShadow.mInputGPU;

if (cmprClsHost.nTracks && cmprClsHost.solenoidBz != -1e6f && cmprClsHost.solenoidBz != param().bzkG) {
throw std::runtime_error("Configured solenoid Bz does not match value used for track model encoding");
}
if (cmprClsHost.nTracks && cmprClsHost.maxTimeBin != -1e6 && cmprClsHost.maxTimeBin != param().continuousMaxTimeBin) {
throw std::runtime_error("Configured max time bin does not match value used for track model encoding");
}

int32_t inputStream = 0;
int32_t unattachedStream = mRec->NStreams() - 1;
inputGPU = cmprClsHost;
@@ -300,12 +308,6 @@ int32_t GPUChainTracking::RunTPCDecompression()
GPUMemCpy(myStep, inputGPUShadow.sigmaPadU, cmprClsHost.sigmaPadU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaPadU[0]), unattachedStream, toGPU);
GPUMemCpy(myStep, inputGPUShadow.sigmaTimeU, cmprClsHost.sigmaTimeU, cmprClsHost.nUnattachedClusters * sizeof(cmprClsHost.sigmaTimeU[0]), unattachedStream, toGPU);

mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceTmpIndexes, inputStream, nullptr, mEvents->stream, nStreams);
SynchronizeStream(inputStream);
uint32_t offset = 0;
@@ -324,27 +326,83 @@
if (decodedAttachedClusters != cmprClsHost.nAttachedClusters) {
GPUWarning("%u / %u clusters failed track model decoding (%f %%)", cmprClsHost.nAttachedClusters - decodedAttachedClusters, cmprClsHost.nAttachedClusters, 100.f * (float)(cmprClsHost.nAttachedClusters - decodedAttachedClusters) / (float)cmprClsHost.nAttachedClusters);
}
if (doGPU) {
mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
if (runFiltering) { // If filtering, allocate a temporary buffer and cluster native access in decompressor context
Decompressor.mNClusterNativeBeforeFiltering = DecompressorShadow.mNClusterNativeBeforeFiltering = decodedAttachedClusters + cmprClsHost.nUnattachedClusters;
AllocateRegisteredMemory(Decompressor.mResourceTmpBufferBeforeFiltering);
AllocateRegisteredMemory(Decompressor.mResourceClusterNativeAccess);
mClusterNativeAccess->clustersLinear = DecompressorShadow.mNativeClustersBuffer;
mClusterNativeAccess->setOffsetPtrs();
*Decompressor.mClusterNativeAccess = *mClusterNativeAccess;
WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, Decompressor.mResourceClusterNativeAccess, inputStream, &mEvents->single);
} else { // If not filtering, directly allocate the final buffers
mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = cmprClsHost.nAttachedClusters + cmprClsHost.nUnattachedClusters;
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
DecompressorShadow.mClusterNativeAccess = mInputsShadow->mPclusterNativeAccess;
Decompressor.mClusterNativeAccess = mInputsHost->mPclusterNativeAccess;
WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), inputStream);
if (doGPU) {
mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
mClusterNativeAccess->setOffsetPtrs();
*mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
}
mIOPtrs.clustersNative = mClusterNativeAccess.get();
mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
mClusterNativeAccess->setOffsetPtrs();
*mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), inputStream);
TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, inputStream, &mEvents->single);
}
mIOPtrs.clustersNative = mClusterNativeAccess.get();
mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
mClusterNativeAccess->setOffsetPtrs();

uint32_t batchSize = doGPU ? 6 : NSLICES;
for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice = iSlice + batchSize) {
int32_t iStream = (iSlice / batchSize) % mRec->NStreams();
runKernel<GPUTPCDecompressionKernels, GPUTPCDecompressionKernels::step1unattached>({GetGridAuto(iStream), krnlRunRangeNone, {nullptr, &mEvents->single}}, iSlice, batchSize);
uint32_t copySize = std::accumulate(mClusterNativeAccess->nClustersSector + iSlice, mClusterNativeAccess->nClustersSector + iSlice + batchSize, 0u);
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
if (!runFiltering) {
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput + mClusterNativeAccess->clusterOffset[iSlice][0], DecompressorShadow.mNativeClustersBuffer + mClusterNativeAccess->clusterOffset[iSlice][0], sizeof(Decompressor.mNativeClustersBuffer[0]) * copySize, iStream, false);
}
}
SynchronizeGPU();

if (runFiltering) { // If filtering is applied, count how many clusters will remain after filtering and allocate final buffers accordingly
AllocateRegisteredMemory(Decompressor.mResourceNClusterPerSectorRow);
WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
runKernel<GPUMemClean16>({GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression), krnlRunRangeNone}, DecompressorShadow.mNClusterPerSectorRow, NSLICES * GPUCA_ROW_COUNT * sizeof(DecompressorShadow.mNClusterPerSectorRow[0]));
runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::countFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
TransferMemoryResourceLinkToHost(RecoStep::TPCDecompression, Decompressor.mResourceNClusterPerSectorRow, unattachedStream);
SynchronizeStream(unattachedStream);
uint32_t nClustersFinal = std::accumulate(Decompressor.mNClusterPerSectorRow, Decompressor.mNClusterPerSectorRow + inputGPU.nSliceRows, 0u);
mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = nClustersFinal;
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]);
AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
DecompressorShadow.mNativeClustersBuffer = mInputsShadow->mPclusterNativeBuffer;
Decompressor.mNativeClustersBuffer = mInputsHost->mPclusterNativeOutput;
WriteToConstantMemory(myStep, (char*)&processors()->tpcDecompressor - (char*)processors(), &DecompressorShadow, sizeof(DecompressorShadow), unattachedStream);
for (uint32_t i = 0; i < NSLICES; i++) {
for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
mClusterNativeAccess->nClusters[i][j] = Decompressor.mNClusterPerSectorRow[i * GPUCA_ROW_COUNT + j];
}
}
if (doGPU) {
mClusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
mClusterNativeAccess->setOffsetPtrs();
*mInputsHost->mPclusterNativeAccess = *mClusterNativeAccess;
processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
WriteToConstantMemory(RecoStep::TPCDecompression, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), unattachedStream);
TransferMemoryResourceLinkToGPU(RecoStep::TPCDecompression, mInputsHost->mResourceClusterNativeAccess, unattachedStream);
}
mIOPtrs.clustersNative = mClusterNativeAccess.get();
mClusterNativeAccess->clustersLinear = mInputsHost->mPclusterNativeOutput;
mClusterNativeAccess->setOffsetPtrs();
runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::storeFilteredClusters>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
GPUMemCpy(RecoStep::TPCDecompression, mInputsHost->mPclusterNativeOutput, DecompressorShadow.mNativeClustersBuffer, sizeof(Decompressor.mNativeClustersBuffer[0]) * nClustersFinal, unattachedStream, false);
SynchronizeStream(unattachedStream);
}
if (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4) {
runKernel<GPUTPCDecompressionUtilKernels, GPUTPCDecompressionUtilKernels::sortPerSectorRow>(GetGridAutoStep(unattachedStream, RecoStep::TPCDecompression));
const ClusterNativeAccess* decoded = mIOPtrs.clustersNative;
Expand All @@ -357,6 +415,7 @@ int32_t GPUChainTracking::RunTPCDecompression()
}
}
}
SynchronizeStream(unattachedStream);
}
mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
}
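With filtering enabled, the chain follows a fixed ordering: decode into the temporary buffer, count survivors per (sector, row) on the device, copy the counters back, size the final cluster-native buffers from their sum, rebuild the ClusterNativeAccess offsets, then run the store kernel and copy the result to the host output. A compact host-side mock of that ordering with the device work stubbed out (illustrative names and sizes, not the O2 API):

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

namespace sketch {

// Stand-in for countFilteredClusters plus the counter readback.
std::vector<uint32_t> runCountKernel(uint32_t nBins) { return std::vector<uint32_t>(nBins, 1); }

// Stand-in for storeFilteredClusters plus the final device-to-host copy.
void runStoreKernelAndCopy(uint32_t nClustersFinal) { std::printf("storing %u filtered clusters\n", nClustersFinal); }

void runFilteringPath(uint32_t nSectors, uint32_t rowCount)
{
  const uint32_t nBins = nSectors * rowCount;
  std::vector<uint32_t> counts = runCountKernel(nBins);                        // 1) count survivors per bin
  uint32_t nClustersFinal = std::accumulate(counts.begin(), counts.end(), 0u); // 2) size the final buffers
  std::vector<uint32_t> offsets(nBins, 0);                                     // 3) rebuild access offsets
  for (uint32_t i = 1; i < nBins; i++) {
    offsets[i] = offsets[i - 1] + counts[i - 1];
  }
  std::printf("last bin starts at offset %u\n", offsets[nBins - 1]);
  runStoreKernelAndCopy(nClustersFinal);                                       // 4) compact store + copy out
}

} // namespace sketch

int main()
{
  sketch::runFilteringPath(36, 152); // illustrative sector/row counts
}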