GPU: Use warp barriers in ZS decoder.

fweig · davidrohr · commit 2f7ce66c03f7 · 2025-07-11T10:50:05.000+02:00
diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
@@ -331,28 +331,28 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable
 
 template <class T>
-GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
+GPUdi() T warp_scan_inclusive_add_FUNC(T v)
 {
   return sub_group_scan_inclusive_add(v);
 }
 template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
-GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
+GPUdi() uint8_t warp_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
 {
   return sub_group_scan_inclusive_add((uint32_t)v);
 }
 template <class T>
-GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
+GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
 {
   return sub_group_broadcast(v, i);
 }
 template <>
-GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
+GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
 {
   return sub_group_broadcast((uint32_t)v, i);
 }
 
-#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
-#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
+#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
+#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
 
 #elif (defined(__CUDACC__) || defined(__HIPCC__))
 // CUDA and HIP work the same way using cub, need just different header
diff --git a/GPU/Common/GPUCommonDefAPI.h b/GPU/Common/GPUCommonDefAPI.h
@@ -96,13 +96,13 @@
   #define GPUgeneric() __generic
   #define GPUconstexprref() GPUconstexpr()
   #if defined(__OPENCL__) && !defined(__clang__)
-    #define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local);
-    #define GPUbarrierWarp()
+    #define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)
+    #define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)
     #define GPUAtomic(type) atomic<type>
     static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");
   #else
     #define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
-    #define GPUbarrierWarp()
+    #define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
     #if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)
       namespace o2 { namespace gpu {
       template <class T> struct oclAtomic;
diff --git a/GPU/GPUTracking/TPCClusterFinder/CfUtils.h b/GPU/GPUTracking/TPCClusterFinder/CfUtils.h
@@ -58,10 +58,9 @@ class CfUtils
     *sum = __popc(waveMask);
     return myOffset;
 #else // CPU / OpenCL fallback
-    int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
+    int32_t myOffset = warp_scan_inclusive_add(!!pred);
     *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
-    myOffset--;
-    return myOffset;
+    return myOffset - !!pred;
 #endif
   }
 
@@ -111,8 +110,7 @@ class CfUtils
     if (sum != nullptr) {
       *sum = work_group_broadcast(lpos, BlockSize - 1);
     }
-    lpos--;
-    return lpos;
+    return lpos - !!pred;
 #endif
   }
 
@@ -149,7 +147,7 @@ class CfUtils
 
     return sum;
 #else // CPU / OpenCL fallback
-    return work_group_reduce_add(pred ? 1 : 0);
+    return work_group_reduce_add(!!pred);
 #endif
   }
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
@@ -224,7 +224,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
     return pageDigitOffset;
   }
 
-  int32_t nDecoded = 0;
+  [[maybe_unused]] int32_t nDecoded = 0;
   const auto* decHdr = ConsumeHeader<TPCZSHDRV2>(page);
   ConsumeBytes(page, decHdr->firstZSDataOffset * 16);
 
@@ -275,7 +275,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
 #endif
     pageDigitOffset += nAdc;
   } // for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++)
-  (void)nDecoded;
+
 #ifdef GPUCA_CHECK_TPCZS_CORRUPTION
   if (iThread == 0 && nDecoded != decHdr->nADCsamples) {
     clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHdr->cruID, decHdr->nADCsamples, nDecoded);
@@ -566,6 +566,7 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
   positions[positionOffset] = pos;
 
   charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+
   chargeMap[pos] = PackedCharge(charge);
 }
 
@@ -615,6 +616,7 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro
   ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));
 
   for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) {
+
     [[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page;
     assert(sizeLeftInPage > 0);
 
@@ -728,8 +730,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
 
   uint16_t nSamplesInTB = 0;
 
-  GPUbarrier();
-
   // Read timebin link headers
   for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
     uint8_t timebinLinkHeaderStart = ConsumeByte(page);
@@ -777,15 +777,15 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
 
   } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)
 
+  GPUbarrierWarp(); // Ensure all writes to shared memory are finished before reading it
+
   const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
   MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check?
 
   if (not fragment.contains(timeBin)) {
     return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB);
   }
 
-  GPUbarrier();
-
   // Unpack ADC
   int32_t iLink = 0;
   for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) {
@@ -819,6 +819,8 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
 
   } // for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS)
 
+  GPUbarrierWarp(); // Ensure all reads from shared memory are finished before decoding the next header into shared memory
+
   assert(PayloadExtendsToNextPage || adcData <= page);
   assert(PayloadExtendsToNextPage || page <= payloadEnd);
 

Original file line number	Diff line number	Diff line change
`@@ -331,28 +331,28 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)`
`331`	`331`	`#pragma OPENCL EXTENSION cl_khr_subgroups : enable`
`332`	`332`
`333`	`333`	`template <class T>`
`334`		`-GPUdi() T work_group_scan_inclusive_add_FUNC(T v)`
	`334`	`+GPUdi() T warp_scan_inclusive_add_FUNC(T v)`
`335`	`335`	`{`
`336`	`336`	`return sub_group_scan_inclusive_add(v);`
`337`	`337`	`}`
`338`	`338`	`template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations`
`339`		`-GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)`
	`339`	`+GPUdi() uint8_t warp_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)`
`340`	`340`	`{`
`341`	`341`	`return sub_group_scan_inclusive_add((uint32_t)v);`
`342`	`342`	`}`
`343`	`343`	`template <class T>`
`344`		`-GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)`
	`344`	`+GPUdi() T warp_broadcast_FUNC(T v, int32_t i)`
`345`	`345`	`{`
`346`	`346`	`return sub_group_broadcast(v, i);`
`347`	`347`	`}`
`348`	`348`	`template <>`
`349`		`-GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)`
	`349`	`+GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)`
`350`	`350`	`{`
`351`	`351`	`return sub_group_broadcast((uint32_t)v, i);`
`352`	`352`	`}`
`353`	`353`
`354`		`-#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)`
`355`		`-#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)`
	`354`	`+#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)`
	`355`	`+#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)`
`356`	`356`
`357`	`357`	`#elif (defined(__CUDACC__) \|\| defined(__HIPCC__))`
`358`	`358`	`// CUDA and HIP work the same way using cub, need just different header`