GPU: Make TPC CF CF_SCAN_WORKGROUP_SIZE configurable

davidrohr · davidrohr · commit 8ebcfcf78561 · 2025-04-24T22:45:24.000+02:00
diff --git a/GPU/GPUTracking/Base/GPUProcessor.h b/GPU/GPUTracking/Base/GPUProcessor.h
@@ -63,7 +63,7 @@ class GPUProcessor
   }
 
   template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
-  static inline size_t getAlignmentMod(size_t addr)
+  static constexpr inline size_t getAlignmentMod(size_t addr)
   {
     static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2");
     if (alignment <= 1) {
@@ -72,7 +72,7 @@ class GPUProcessor
     return addr & (alignment - 1);
   }
   template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
-  static inline size_t getAlignment(size_t addr)
+  static constexpr inline size_t getAlignment(size_t addr)
   {
     size_t mod = getAlignmentMod<alignment>(addr);
     if (mod == 0) {
@@ -81,10 +81,22 @@ class GPUProcessor
     return (alignment - mod);
   }
   template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
-  static inline size_t nextMultipleOf(size_t size)
+  static constexpr inline size_t nextMultipleOf(size_t size)
   {
     return size + getAlignment<alignment>(size);
   }
+  static constexpr inline size_t nextMultipleOf(size_t size, size_t alignment) // Runtime-alignment overload: returns size rounded up to a multiple of alignment (unchanged if already a multiple)
+  {
+    if (alignment & (alignment - 1)) { // More than one bit set: alignment is not a power of two, so use the modulo
+      size_t tmp = size % alignment;
+      if (tmp) {
+        size += alignment - tmp; // Pad up to the next multiple
+      }
+      return size;
+    } else { // Power of two; alignment == 0 also takes this branch
+      return (size + alignment - 1) & ~(alignment - 1); // NOTE(review): evaluates to 0 for alignment == 0 - presumably callers always pass >= 1, confirm
+    }
+  }
   template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>
   static inline void* alignPointer(void* ptr)
   {
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h b/GPU/GPUTracking/Definitions/GPUDefParametersConstants.h
@@ -18,8 +18,6 @@
 #define GPUDEFPARAMETERSCONSTANTS_H
 // clang-format off
 
-#define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix!
-
 #if defined(__CUDACC__) || defined(__HIPCC__)
   #define GPUCA_SPECIALIZE_THRUST_SORTS // Not compiled with RTC, so must be compile-time constant
 #endif
diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -22,7 +22,6 @@
 
 // GPU Run Configuration
 #if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS) // Avoid including for RTC generation besides normal include protection.
-  #define GPUCA_LB_SCAN 512
   // GPU-architecture-dependent default settings
   #if defined(GPUCA_GPUTYPE_MI2xx)
     #define GPUCA_WARP_SIZE 64
@@ -499,11 +498,11 @@
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels
 
-  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_LB_SCAN
-  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_LB_SCAN
-  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_LB_SCAN
-  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanDown GPUCA_LB_SCAN
-  #define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_LB_SCAN
+  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+  #define GPUCA_LB_GPUTPCCFStreamCompaction_scanDown GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+  #define GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_unbuffered GPUCA_LB_COMPRESSION_GATHER
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered32 GPUCA_LB_COMPRESSION_GATHER
   #define GPUCA_LB_GPUTPCCompressionGatherKernels_buffered64 GPUCA_LB_COMPRESSION_GATHER
@@ -541,6 +540,9 @@
   #ifndef GPUCA_PAR_COMP_GATHER_MODE
     #define GPUCA_PAR_COMP_GATHER_MODE 2
   #endif
+  #ifndef GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+    #define GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE 512
+  #endif
 #endif // defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_GENRTC) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
 
 #ifndef GPUCA_GPUCODE_GENRTC
@@ -578,6 +580,9 @@
   #ifndef GPUCA_PAR_NO_ATOMIC_PRECHECK
     #define GPUCA_PAR_NO_ATOMIC_PRECHECK 0
   #endif
+  #ifndef GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
+    #define GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE 0
+  #endif
   #ifndef GPUCA_PAR_DEDX_STORAGE_TYPE
     #define GPUCA_PAR_DEDX_STORAGE_TYPE float
   #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -23,6 +23,7 @@
 #include "CfChargePos.h"
 #include "CfArray2D.h"
 #include "GPUGeneralKernels.h"
+#include "GPUDefParametersRuntime.h"
 #include "GPUTPCCFStreamCompaction.h"
 #include "GPUTPCCFChargeMapFiller.h"
 #include "GPUTPCCFDecodeZS.h"
@@ -402,27 +403,28 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust
       exit(1);
     }
 
+    int32_t scanWorkgroupSize = mRec->getGPUParameters(doGPU).par_CF_SCAN_WORKGROUP_SIZE;
     size_t tmpCount = count;
     if (nSteps > 1) {
       for (uint32_t i = 1; i < nSteps; i++) {
         counts.push_back(tmpCount);
         if (i == 1) {
-          runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage);
+          runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanStart>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, stage);
         } else {
-          runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount);
+          runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanUp>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, i, tmpCount);
         }
-        tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize;
+        tmpCount = (tmpCount + scanWorkgroupSize - 1) / scanWorkgroupSize;
       }
 
-      runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount);
+      runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanTop>({GetGrid(tmpCount, scanWorkgroupSize, lane), {iSector}}, nSteps, tmpCount);
 
       for (uint32_t i = nSteps - 1; i > 1; i--) {
         tmpCount = counts[i - 1];
-        runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount);
+        runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::scanDown>({GetGrid(tmpCount - scanWorkgroupSize, scanWorkgroupSize, lane), {iSector}}, i, scanWorkgroupSize, tmpCount);
       }
     }
 
-    runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out);
+    runKernel<GPUTPCCFStreamCompaction, GPUTPCCFStreamCompaction::compactDigits>({GetGrid(count, scanWorkgroupSize, lane), {iSector}}, 1, stage, in, out);
   } else {
     auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks;
     auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.cxx
@@ -24,6 +24,7 @@ using namespace o2::gpu::tpccf;
 template <>
 GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanStart>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage)
 {
+#ifdef GPUCA_GPUCODE
   int32_t nElems = CompactionElems(clusterer, stage);
 
   const auto* predicate = clusterer.mPisPeak;
@@ -35,17 +36,19 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanSta
     pred = predicate[iThreadGlobal];
   }
 
-  int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_THREAD_COUNT_SCAN>(smem, pred);
+  int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
 
   int32_t lastThread = nThreads - 1;
   if (iThread == lastThread) {
     scanOffset[iBlock] = nElemsInBlock;
   }
+#endif
 }
 
 template <>
 GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanUp>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems)
 {
+#ifdef GPUCA_GPUCODE
   auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
   auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf);
 
@@ -59,11 +62,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanUp>
   if (iThread == lastThread) {
     scanOffsetNext[iBlock] = offsetInBlock;
   }
+#endif
 }
 
 template <>
 GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanTop>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems)
 {
+#ifdef GPUCA_GPUCODE
   int32_t iThreadGlobal = get_global_id(0);
   int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1);
 
@@ -74,11 +79,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanTop
   if (inBounds) {
     scanOffset[iThreadGlobal] = offsetInBlock;
   }
+#endif
 }
 
 template <>
 GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanDown>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& /*smem*/, processorType& clusterer, int32_t iBuf, uint32_t offset, int32_t nElems)
 {
+#ifdef GPUCA_GPUCODE
   int32_t iThreadGlobal = get_global_id(0) + offset;
 
   int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1);
@@ -89,11 +96,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanDow
   if (iThreadGlobal < nElems) {
     scanOffsetPrev[iThreadGlobal] += shift;
   }
+#endif
 }
 
 template <>
 GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compactDigits>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, CfChargePos* in, CfChargePos* out)
 {
+#ifdef GPUCA_GPUCODE
   uint32_t nElems = CompactionElems(clusterer, stage);
   SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks;
 
@@ -105,7 +114,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compact
   bool iAmDummy = (iThreadGlobal >= nElems);
 
   int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal];
-  int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_THREAD_COUNT_SCAN>(smem, pred);
+  int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);
 
   SizeT globalOffsetOut = offsetInBlock;
   if (iBlock > 0) {
@@ -129,6 +138,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compact
       clusterer.mPmemory->counters.nPeaks = nFinal;
     }
   }
+#endif
 }
 
 GPUdii() int32_t GPUTPCCFStreamCompaction::CompactionElems(processorType& clusterer, int32_t stage)
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFStreamCompaction.h
@@ -35,14 +35,14 @@ class GPUTPCCFStreamCompaction : public GPUKernelTemplate
     compactDigits = 4,
   };
 
-  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<int32_t, GPUCA_THREAD_COUNT_SCAN> {
-  };
 #if defined(GPUCA_GPUCODE) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
-  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart));
-  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp));
-  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop));
-  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown));
-  static_assert(GPUCA_THREAD_COUNT_SCAN == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits));
+  struct GPUSharedMemory : public GPUKernelTemplate::GPUSharedMemoryScan64<int32_t, GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE> {
+  };
+  static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanStart));
+  static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanUp));
+  static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanTop));
+  static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_scanDown));
+  static_assert(GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFStreamCompaction_compactDigits));
 #endif
 
   typedef GPUTPCClusterFinder processorType;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.cxx
@@ -17,6 +17,7 @@
 #include "GPUMemorySizeScalers.h"
 #include "GPUHostDataTypes.h"
 #include "GPUSettings.h"
+#include "GPUDefParametersRuntime.h"
 
 #include "DataFormatsTPC/ClusterNative.h"
 #include "DataFormatsTPC/ZeroSuppression.h"
@@ -90,9 +91,10 @@ void* GPUTPCClusterFinder::SetPointersScratch(void* mem)
   computePointerWithAlignment(mem, mPisPeak, mNMaxDigitsFragment);
   computePointerWithAlignment(mem, mPchargeMap, TPCMapMemoryLayout<decltype(*mPchargeMap)>::items(mRec->GetProcessingSettings().overrideClusterizerFragmentLen));
   computePointerWithAlignment(mem, mPpeakMap, TPCMapMemoryLayout<decltype(*mPpeakMap)>::items(mRec->GetProcessingSettings().overrideClusterizerFragmentLen));
-  computePointerWithAlignment(mem, mPbuf, mBufSize * mNBufs);
   computePointerWithAlignment(mem, mPclusterByRow, GPUCA_ROW_COUNT * mNMaxClusterPerRow);
-
+  if ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) {
+    computePointerWithAlignment(mem, mPscanBuf, mBufSize * mNBufs);
+  }
   return mem;
 }
 
@@ -129,14 +131,15 @@ void GPUTPCClusterFinder::SetMaxData(const GPUTrackingInOutPointers& io)
   if (mRec->GetProcessingSettings().tpcIncreasedMinClustersPerRow) {
     mNMaxClusterPerRow = std::max<uint32_t>(mNMaxClusterPerRow, mRec->GetProcessingSettings().tpcIncreasedMinClustersPerRow);
   }
-
-  mBufSize = nextMultipleOf<std::max<int32_t>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(mNMaxDigitsFragment);
-  mNBufs = getNSteps(mBufSize);
+  if ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) {
+    mBufSize = nextMultipleOf(mNMaxDigitsFragment, std::max<int32_t>(GPUCA_MEMALIGN, mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE));
+    mNBufs = getNSteps(mBufSize);
+  }
 }
 
 void GPUTPCClusterFinder::SetNMaxDigits(size_t nDigits, size_t nPages, size_t nDigitsFragment, size_t nDigitsEndpointMax)
 {
-  mNMaxDigits = nextMultipleOf<std::max<int32_t>(GPUCA_MEMALIGN, mScanWorkGroupSize)>(nDigits);
+  mNMaxDigits = nextMultipleOf(nDigits, std::max<int32_t>(GPUCA_MEMALIGN, mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE));
   mNMaxPages = nPages;
   mNMaxDigitsFragment = nDigitsFragment;
   mNMaxDigitsEndpoint = nDigitsEndpointMax;
@@ -148,9 +151,10 @@ uint32_t GPUTPCClusterFinder::getNSteps(size_t items) const
     return 0;
   }
   uint32_t c = 1;
-  size_t capacity = mScanWorkGroupSize;
+  const size_t scanWorkgroupSize = mRec->getGPUParameters(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding).par_CF_SCAN_WORKGROUP_SIZE;
+  size_t capacity = scanWorkgroupSize;
   while (items > capacity) {
-    capacity *= mScanWorkGroupSize;
+    capacity *= scanWorkgroupSize;
     c++;
   }
   return c;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -111,18 +111,17 @@ class GPUTPCClusterFinder : public GPUProcessor
   uint32_t* mPclusterInRow = nullptr;
   tpc::ClusterNative* mPclusterByRow = nullptr;
   GPUTPCClusterMCInterimArray* mPlabelsByRow = nullptr;
-  int32_t* mPbuf = nullptr;
+  int32_t* mPscanBuf = nullptr;
   Memory* mPmemory = nullptr;
 
-  GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPbuf + iBuf * mBufSize; }
+  GPUdi() int32_t* GetScanBuffer(int32_t iBuf) const { return mPscanBuf + iBuf * mBufSize; }
 
   o2::dataformats::ConstMCTruthContainerView<o2::MCCompLabel> const* mPinputLabels = nullptr;
   uint32_t* mPlabelsInRow = nullptr;
   uint32_t mPlabelsHeaderGlobalOffset = 0;
   uint32_t mPlabelsDataGlobalOffset = 0;
 
   int32_t mISector = 0;
-  constexpr static int32_t mScanWorkGroupSize = GPUCA_THREAD_COUNT_SCAN;
   uint32_t mNMaxClusterPerRow = 0;
   uint32_t mNMaxClusters = 0;
   uint32_t mNMaxPages = 0;
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
@@ -146,7 +146,8 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP
                             NO_ATOMIC_PRECHECK
                             COMP_GATHER_KERNEL
                             COMP_GATHER_MODE
-                            SORT_STARTHITS)
+                            SORT_STARTHITS
+                            CF_SCAN_WORKGROUP_SIZE)
 
 o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE
                                    MERGER_INTERPOLATION_ERROR_TYPE)

Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ class GPUProcessor`
`63`	`63`	`}`
`64`	`64`
`65`	`65`	`template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`66`		`- static inline size_t getAlignmentMod(size_t addr)`
	`66`	`+ static constexpr inline size_t getAlignmentMod(size_t addr)`
`67`	`67`	`{`
`68`	`68`	`static_assert((alignment & (alignment - 1)) == 0, "Invalid alignment, not power of 2");`
`69`	`69`	`if (alignment <= 1) {`
`@@ -72,7 +72,7 @@ class GPUProcessor`
`72`	`72`	`return addr & (alignment - 1);`
`73`	`73`	`}`
`74`	`74`	`template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`75`		`- static inline size_t getAlignment(size_t addr)`
	`75`	`+ static constexpr inline size_t getAlignment(size_t addr)`
`76`	`76`	`{`
`77`	`77`	`size_t mod = getAlignmentMod<alignment>(addr);`
`78`	`78`	`if (mod == 0) {`
`@@ -81,10 +81,22 @@ class GPUProcessor`
`81`	`81`	`return (alignment - mod);`
`82`	`82`	`}`
`83`	`83`	`template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`84`		`- static inline size_t nextMultipleOf(size_t size)`
	`84`	`+ static constexpr inline size_t nextMultipleOf(size_t size)`
`85`	`85`	`{`
`86`	`86`	`return size + getAlignment<alignment>(size);`
`87`	`87`	`}`
	`88`	`+ static constexpr inline size_t nextMultipleOf(size_t size, size_t alignment)`
	`89`	`+ {`
	`90`	`+ if (alignment & (alignment - 1)) {`
	`91`	`+ size_t tmp = size % alignment;`
	`92`	`+ if (tmp) {`
	`93`	`+ size += alignment - tmp;`
	`94`	`+ }`
	`95`	`+ return size;`
	`96`	`+ } else {`
	`97`	`+ return (size + alignment - 1) & ~(alignment - 1);`
	`98`	`+ }`
	`99`	`+ }`
`88`	`100`	`template <size_t alignment = GPUCA_BUFFER_ALIGNMENT>`
`89`	`101`	`static inline void* alignPointer(void* ptr)`
`90`	`102`	`{`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@ using namespace o2::gpu::tpccf;`
`24`	`24`	`template <>`
`25`	`25`	`GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanStart>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage)`
`26`	`26`	`{`
	`27`	`+#ifdef GPUCA_GPUCODE`
`27`	`28`	`int32_t nElems = CompactionElems(clusterer, stage);`
`28`	`29`
`29`	`30`	`const auto* predicate = clusterer.mPisPeak;`
`@@ -35,17 +36,19 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanSta`
`35`	`36`	`pred = predicate[iThreadGlobal];`
`36`	`37`	`}`
`37`	`38`
`38`		`- int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_THREAD_COUNT_SCAN>(smem, pred);`
	`39`	`+ int32_t nElemsInBlock = CfUtils::blockPredicateSum<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);`
`39`	`40`
`40`	`41`	`int32_t lastThread = nThreads - 1;`
`41`	`42`	`if (iThread == lastThread) {`
`42`	`43`	`scanOffset[iBlock] = nElemsInBlock;`
`43`	`44`	`}`
	`45`	`+#endif`
`44`	`46`	`}`
`45`	`47`
`46`	`48`	`template <>`
`47`	`49`	`GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanUp>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems)`
`48`	`50`	`{`
	`51`	`+#ifdef GPUCA_GPUCODE`
`49`	`52`	`auto* scanOffset = clusterer.GetScanBuffer(iBuf - 1);`
`50`	`53`	`auto* scanOffsetNext = clusterer.GetScanBuffer(iBuf);`
`51`	`54`
`@@ -59,11 +62,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanUp>`
`59`	`62`	`if (iThread == lastThread) {`
`60`	`63`	`scanOffsetNext[iBlock] = offsetInBlock;`
`61`	`64`	`}`
	`65`	`+#endif`
`62`	`66`	`}`
`63`	`67`
`64`	`68`	`template <>`
`65`	`69`	`GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanTop>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t nElems)`
`66`	`70`	`{`
	`71`	`+#ifdef GPUCA_GPUCODE`
`67`	`72`	`int32_t iThreadGlobal = get_global_id(0);`
`68`	`73`	`int32_t* scanOffset = clusterer.GetScanBuffer(iBuf - 1);`
`69`	`74`
`@@ -74,11 +79,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanTop`
`74`	`79`	`if (inBounds) {`
`75`	`80`	`scanOffset[iThreadGlobal] = offsetInBlock;`
`76`	`81`	`}`
	`82`	`+#endif`
`77`	`83`	`}`
`78`	`84`
`79`	`85`	`template <>`
`80`	`86`	`GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanDown>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& /*smem*/, processorType& clusterer, int32_t iBuf, uint32_t offset, int32_t nElems)`
`81`	`87`	`{`
	`88`	`+#ifdef GPUCA_GPUCODE`
`82`	`89`	`int32_t iThreadGlobal = get_global_id(0) + offset;`
`83`	`90`
`84`	`91`	`int32_t* scanOffsetPrev = clusterer.GetScanBuffer(iBuf - 1);`
`@@ -89,11 +96,13 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::scanDow`
`89`	`96`	`if (iThreadGlobal < nElems) {`
`90`	`97`	`scanOffsetPrev[iThreadGlobal] += shift;`
`91`	`98`	`}`
	`99`	`+#endif`
`92`	`100`	`}`
`93`	`101`
`94`	`102`	`template <>`
`95`	`103`	`GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compactDigits>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int32_t iBuf, int32_t stage, CfChargePos* in, CfChargePos* out)`
`96`	`104`	`{`
	`105`	`+#ifdef GPUCA_GPUCODE`
`97`	`106`	`uint32_t nElems = CompactionElems(clusterer, stage);`
`98`	`107`	`SizeT bufferSize = (stage) ? clusterer.mNMaxClusters : clusterer.mNMaxPeaks;`
`99`	`108`
`@@ -105,7 +114,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compact`
`105`	`114`	`bool iAmDummy = (iThreadGlobal >= nElems);`
`106`	`115`
`107`	`116`	`int32_t pred = (iAmDummy) ? 0 : predicate[iThreadGlobal];`
`108`		`- int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_THREAD_COUNT_SCAN>(smem, pred);`
	`117`	`+ int32_t offsetInBlock = CfUtils::blockPredicateScan<GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE>(smem, pred);`
`109`	`118`
`110`	`119`	`SizeT globalOffsetOut = offsetInBlock;`
`111`	`120`	`if (iBlock > 0) {`
`@@ -129,6 +138,7 @@ GPUdii() void GPUTPCCFStreamCompaction::Thread<GPUTPCCFStreamCompaction::compact`
`129`	`138`	`clusterer.mPmemory->counters.nPeaks = nFinal;`
`130`	`139`	`}`
`131`	`140`	`}`
	`141`	`+#endif`
`132`	`142`	`}`
`133`	`143`
`134`	`144`	`GPUdii() int32_t GPUTPCCFStreamCompaction::CompactionElems(processorType& clusterer, int32_t stage)`