Skip to content

Commit 2f7ce66

Browse files
fweig authored and davidrohr committed
GPU: Use warp barriers in ZS decoder.
1 parent aee6833 commit 2f7ce66

File tree

4 files changed

+21
-21
lines changed

4 files changed

+21
-21
lines changed

GPU/Common/GPUCommonAlgorithm.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,28 +331,28 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
331331
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
332332

333333
template <class T>
334-
GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
334+
GPUdi() T warp_scan_inclusive_add_FUNC(T v)
335335
{
336336
return sub_group_scan_inclusive_add(v);
337337
}
338338
template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
339-
GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
339+
GPUdi() uint8_t warp_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
340340
{
341341
return sub_group_scan_inclusive_add((uint32_t)v);
342342
}
343343
template <class T>
344-
GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
344+
GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
345345
{
346346
return sub_group_broadcast(v, i);
347347
}
348348
template <>
349-
GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
349+
GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
350350
{
351351
return sub_group_broadcast((uint32_t)v, i);
352352
}
353353

354-
#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
355-
#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
354+
#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
355+
#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)
356356

357357
#elif (defined(__CUDACC__) || defined(__HIPCC__))
358358
// CUDA and HIP work the same way using cub, need just different header

GPU/Common/GPUCommonDefAPI.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,13 +96,13 @@
9696
#define GPUgeneric() __generic
9797
#define GPUconstexprref() GPUconstexpr()
9898
#if defined(__OPENCL__) && !defined(__clang__)
99-
#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local);
100-
#define GPUbarrierWarp()
99+
#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)
100+
#define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)
101101
#define GPUAtomic(type) atomic<type>
102102
static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");
103103
#else
104104
#define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
105-
#define GPUbarrierWarp()
105+
#define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
106106
#if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)
107107
namespace o2 { namespace gpu {
108108
template <class T> struct oclAtomic;

GPU/GPUTracking/TPCClusterFinder/CfUtils.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,9 @@ class CfUtils
5858
*sum = __popc(waveMask);
5959
return myOffset;
6060
#else // CPU / OpenCL fallback
61-
int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
61+
int32_t myOffset = warp_scan_inclusive_add(!!pred);
6262
*sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
63-
myOffset--;
64-
return myOffset;
63+
return myOffset - !!pred;
6564
#endif
6665
}
6766

@@ -111,8 +110,7 @@ class CfUtils
111110
if (sum != nullptr) {
112111
*sum = work_group_broadcast(lpos, BlockSize - 1);
113112
}
114-
lpos--;
115-
return lpos;
113+
return lpos - !!pred;
116114
#endif
117115
}
118116

@@ -149,7 +147,7 @@ class CfUtils
149147

150148
return sum;
151149
#else // CPU / OpenCL fallback
152-
return work_group_reduce_add(pred ? 1 : 0);
150+
return work_group_reduce_add(!!pred);
153151
#endif
154152
}
155153

GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
224224
return pageDigitOffset;
225225
}
226226

227-
int32_t nDecoded = 0;
227+
[[maybe_unused]] int32_t nDecoded = 0;
228228
const auto* decHdr = ConsumeHeader<TPCZSHDRV2>(page);
229229
ConsumeBytes(page, decHdr->firstZSDataOffset * 16);
230230

@@ -275,7 +275,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
275275
#endif
276276
pageDigitOffset += nAdc;
277277
} // for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++)
278-
(void)nDecoded;
278+
279279
#ifdef GPUCA_CHECK_TPCZS_CORRUPTION
280280
if (iThread == 0 && nDecoded != decHdr->nADCsamples) {
281281
clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHdr->cruID, decHdr->nADCsamples, nDecoded);
@@ -566,6 +566,7 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
566566
positions[positionOffset] = pos;
567567

568568
charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
569+
569570
chargeMap[pos] = PackedCharge(charge);
570571
}
571572

@@ -615,6 +616,7 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro
615616
ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));
616617

617618
for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) {
619+
618620
[[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page;
619621
assert(sizeLeftInPage > 0);
620622

@@ -728,8 +730,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
728730

729731
uint16_t nSamplesInTB = 0;
730732

731-
GPUbarrier();
732-
733733
// Read timebin link headers
734734
for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
735735
uint8_t timebinLinkHeaderStart = ConsumeByte(page);
@@ -777,15 +777,15 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
777777

778778
} // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)
779779

780+
GPUbarrierWarp(); // Ensure all writes to shared memory are finished, before reading it
781+
780782
const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
781783
MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check?
782784

783785
if (not fragment.contains(timeBin)) {
784786
return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB);
785787
}
786788

787-
GPUbarrier();
788-
789789
// Unpack ADC
790790
int32_t iLink = 0;
791791
for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) {
@@ -819,6 +819,8 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(
819819

820820
} // for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS)
821821

822+
GPUbarrierWarp(); // Ensure all reads to shared memory are finished, before decoding next header into shmem
823+
822824
assert(PayloadExtendsToNextPage || adcData <= page);
823825
assert(PayloadExtendsToNextPage || page <= payloadEnd);
824826

0 commit comments

Comments
 (0)