12 changes: 6 additions & 6 deletions GPU/Common/GPUCommonAlgorithm.h
@@ -331,28 +331,28 @@ GPUdi() void GPUCommonAlgorithm::swap(T& a, T& b)
 #pragma OPENCL EXTENSION cl_khr_subgroups : enable

 template <class T>
-GPUdi() T work_group_scan_inclusive_add_FUNC(T v)
+GPUdi() T warp_scan_inclusive_add_FUNC(T v)
 {
 return sub_group_scan_inclusive_add(v);
 }
 template <> // FIXME: It seems OpenCL does not support 8 and 16 bit subgroup operations
-GPUdi() uint8_t work_group_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
+GPUdi() uint8_t warp_scan_inclusive_add_FUNC<uint8_t>(uint8_t v)
 {
 return sub_group_scan_inclusive_add((uint32_t)v);
 }
 template <class T>
-GPUdi() T work_group_broadcast_FUNC(T v, int32_t i)
+GPUdi() T warp_broadcast_FUNC(T v, int32_t i)
 {
 return sub_group_broadcast(v, i);
 }
 template <>
-GPUdi() uint8_t work_group_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
+GPUdi() uint8_t warp_broadcast_FUNC<uint8_t>(uint8_t v, int32_t i)
 {
 return sub_group_broadcast((uint32_t)v, i);
 }

-#define warp_scan_inclusive_add(v) work_group_scan_inclusive_add_FUNC(v)
-#define warp_broadcast(v, i) work_group_broadcast_FUNC(v, i)
+#define warp_scan_inclusive_add(v) warp_scan_inclusive_add_FUNC(v)
+#define warp_broadcast(v, i) warp_broadcast_FUNC(v, i)

 #elif (defined(__CUDACC__) || defined(__HIPCC__))
 // CUDA and HIP work the same way using cub, need just different header
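Note on the wrappers above, added as context for readers without an OpenCL background: warp_scan_inclusive_add gives each lane the sum of its own argument plus that of all lower lanes in the subgroup, and warp_broadcast(x, GPUCA_WARP_SIZE - 1) then hands the last lane's value, i.e. the warp-wide total, to every lane. The following host-side C++ sketch emulates that behaviour serially for a single warp; the warp size and predicate values are made up for illustration and this is not the O2 GPU code.

// Illustrative only: serial emulation of one warp's inclusive scan + broadcast.
#include <array>
#include <cstdint>
#include <cstdio>

int main()
{
  constexpr int kWarpSize = 8;                                    // assumed value; GPUCA_WARP_SIZE is backend-dependent
  std::array<uint8_t, kWarpSize> pred = {1, 0, 1, 1, 0, 0, 1, 0}; // made-up per-lane predicates

  // warp_scan_inclusive_add(!!pred): lane i receives pred[0] + ... + pred[i].
  std::array<int32_t, kWarpSize> inclusive{};
  int32_t running = 0;
  for (int lane = 0; lane < kWarpSize; lane++) {
    running += !!pred[lane];
    inclusive[lane] = running;
  }

  // warp_broadcast(inclusive, kWarpSize - 1): every lane gets the last lane's
  // value, i.e. the total number of lanes with a set predicate.
  const int32_t sum = inclusive[kWarpSize - 1];

  for (int lane = 0; lane < kWarpSize; lane++) {
    std::printf("lane %d: inclusive=%d sum=%d\n", lane, (int)inclusive[lane], (int)sum);
  }
  return 0;
}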
6 changes: 3 additions & 3 deletions GPU/Common/GPUCommonDefAPI.h
@@ -96,13 +96,13 @@
 #define GPUgeneric() __generic
 #define GPUconstexprref() GPUconstexpr()
 #if defined(__OPENCL__) && !defined(__clang__)
-#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local);
-#define GPUbarrierWarp()
+#define GPUbarrier() work_group_barrier(mem_fence::global | mem_fence::local)
+#define GPUbarrierWarp() sub_group_barrier(mem_fence::global | mem_fence::local)
 #define GPUAtomic(type) atomic<type>
 static_assert(sizeof(atomic<uint32_t>) == sizeof(uint32_t), "Invalid size of atomic type");
 #else
 #define GPUbarrier() barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
-#define GPUbarrierWarp()
+#define GPUbarrierWarp() sub_group_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE)
 #if defined(__OPENCL__) && defined(GPUCA_OPENCL_CLANG_C11_ATOMICS)
 namespace o2 { namespace gpu {
 template <class T> struct oclAtomic;
10 changes: 4 additions & 6 deletions GPU/GPUTracking/TPCClusterFinder/CfUtils.h
@@ -58,10 +58,9 @@ class CfUtils
 *sum = __popc(waveMask);
 return myOffset;
 #else // CPU / OpenCL fallback
-int32_t myOffset = warp_scan_inclusive_add(pred ? 1 : 0);
+int32_t myOffset = warp_scan_inclusive_add(!!pred);
 *sum = warp_broadcast(myOffset, GPUCA_WARP_SIZE - 1);
-myOffset--;
-return myOffset;
+return myOffset - !!pred;
 #endif
 }

@@ -111,8 +110,7 @@ class CfUtils
 if (sum != nullptr) {
 *sum = work_group_broadcast(lpos, BlockSize - 1);
 }
-lpos--;
-return lpos;
+return lpos - !!pred;
 #endif
 }

@@ -149,7 +147,7 @@

 return sum;
 #else // CPU / OpenCL fallback
-return work_group_reduce_add(pred ? 1 : 0);
+return work_group_reduce_add(!!pred);
 #endif
 }

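The change from a post-decrement to subtracting !!pred converts the inclusive scan result into an exclusive offset only for lanes that actually contributed: a lane whose predicate is true gets the same compaction offset as before, while a lane whose predicate is false keeps the plain inclusive count instead of an off-by-one value (for lane 0 the old code could even return -1); callers presumably ignore the offset for non-contributing lanes either way. A purely illustrative host-side sketch of the arithmetic, with a hypothetical serial stand-in for the warp scan:

// Illustrative only: old vs. new offset computation in the CPU/OpenCL fallback.
#include <cstdio>

// Hypothetical serial stand-in for warp_scan_inclusive_add, for this sketch only.
static int inclusiveScanUpTo(const int* pred, int lane)
{
  int s = 0;
  for (int i = 0; i <= lane; i++) {
    s += !!pred[i];
  }
  return s;
}

int main()
{
  const int pred[4] = {0, 1, 0, 1}; // made-up predicates for a 4-lane "warp"
  for (int lane = 0; lane < 4; lane++) {
    const int inclusive = inclusiveScanUpTo(pred, lane);
    const int oldOffset = inclusive - 1;            // former "myOffset--; return myOffset;"
    const int newOffset = inclusive - !!pred[lane]; // current "return myOffset - !!pred;"
    std::printf("lane %d (pred=%d): old=%d new=%d\n", lane, pred[lane], oldOffset, newOffset);
  }
  // Lanes with pred==1 get the same exclusive offset as before; lanes with
  // pred==0 (which do not store anything) no longer get an off-by-one value.
  return 0;
}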
14 changes: 8 additions & 6 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDecodeZS.cxx
@@ -224,7 +224,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
 return pageDigitOffset;
 }

-int32_t nDecoded = 0;
+[[maybe_unused]] int32_t nDecoded = 0;
 const auto* decHdr = ConsumeHeader<TPCZSHDRV2>(page);
 ConsumeBytes(page, decHdr->firstZSDataOffset * 16);

@@ -275,7 +275,7 @@ GPUd() size_t GPUTPCCFDecodeZSLink::DecodePage(GPUSharedMemory& smem, processorT
 #endif
 pageDigitOffset += nAdc;
 } // for (uint32_t t = 0; t < decHdr->nTimebinHeaders; t++)
-(void)nDecoded;
+
 #ifdef GPUCA_CHECK_TPCZS_CORRUPTION
 if (iThread == 0 && nDecoded != decHdr->nADCsamples) {
 clusterer.raiseError(GPUErrors::ERROR_TPCZS_INVALID_NADC, clusterer.mISector * 1000 + decHdr->cruID, decHdr->nADCsamples, nDecoded);
@@ -566,6 +566,7 @@ GPUd() void GPUTPCCFDecodeZSLinkBase::WriteCharge(processorType& clusterer, floa
 positions[positionOffset] = pos;

 charge *= clusterer.GetConstantMem()->calibObjects.tpcPadGain->getGainCorrection(sector, padAndRow.getRow(), padAndRow.getPad());
+
 chargeMap[pos] = PackedCharge(charge);
 }

@@ -615,6 +616,7 @@ GPUd() uint32_t GPUTPCCFDecodeZSDenseLink::DecodePage(GPUSharedMemory& smem, pro
 ConsumeBytes(page, decHeader->firstZSDataOffset - sizeof(o2::header::RAWDataHeader));

 for (uint16_t i = 0; i < decHeader->nTimebinHeaders; i++) {
+
 [[maybe_unused]] ptrdiff_t sizeLeftInPage = payloadEnd - page;
 assert(sizeLeftInPage > 0);

@@ -728,8 +730,6 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(

 uint16_t nSamplesInTB = 0;

-GPUbarrier();
-
 // Read timebin link headers
 for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++) {
 uint8_t timebinLinkHeaderStart = ConsumeByte(page);
@@ -777,15 +777,15 @@ GPUd() uint16_t GPUTPCCFDecodeZSDenseLink::DecodeTBMultiThread(

 } // for (uint8_t iLink = 0; iLink < nLinksInTimebin; iLink++)

+GPUbarrierWarp(); // Ensure all writes to shared memory are finished, before reading it
+
 const uint8_t* adcData = ConsumeBytes(page, (nSamplesInTB * DECODE_BITS + 7) / 8);
 MAYBE_PAGE_OVERFLOW(page); // TODO: We don't need this check?

 if (not fragment.contains(timeBin)) {
 return FillWithInvalid(clusterer, iThread, NTHREADS, pageDigitOffset, nSamplesInTB);
 }

-GPUbarrier();
-
 // Unpack ADC
 int32_t iLink = 0;
 for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS) {
@@ -819,6 +819,8 @@

 } // for (uint16_t sample = iThread; sample < nSamplesInTB; sample += NTHREADS)

+GPUbarrierWarp(); // Ensure all reads from shared memory are finished, before decoding the next header into shmem
+
 assert(PayloadExtendsToNextPage || adcData <= page);
 assert(PayloadExtendsToNextPage || page <= payloadEnd);

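The DecodeTBMultiThread changes above reposition the synchronization so that a warp-level barrier sits exactly between the phase that writes the timebin link headers into shared memory and the phase that reads them back, and again before the buffer is reused for the next header; together with the GPUCommonDefAPI.h change, GPUbarrierWarp() now emits a real sub_group_barrier on OpenCL instead of being a no-op. The snippet below is only a rough host-side analogy of that write / barrier / read / barrier / reuse pattern, built on std::barrier; the thread count, shared array, and worker logic are invented for illustration and are not the GPU code.

// Illustrative analogy only: write -> barrier -> read -> barrier -> reuse,
// mirroring where GPUbarrierWarp() now sits in DecodeTBMultiThread.
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
  constexpr int kThreads = 4;           // made-up "warp" size
  std::vector<int> shared(kThreads, 0); // stand-in for GPU shared memory
  std::vector<int> totals(kThreads, 0);
  std::barrier<> sync(kThreads);        // stand-in for GPUbarrierWarp()

  auto worker = [&](int tid) {
    shared[tid] = tid + 1;    // phase 1: each thread writes its slot ("decode header")
    sync.arrive_and_wait();   // all writes finished before anyone reads
    int total = 0;
    for (int v : shared) {    // phase 2: every thread reads all slots ("unpack ADC")
      total += v;
    }
    sync.arrive_and_wait();   // all reads finished before the buffer is reused
    shared[tid] = 0;          // phase 3: safe to overwrite for the next timebin
    totals[tid] = total;
  };

  std::vector<std::thread> pool;
  for (int t = 0; t < kThreads; t++) {
    pool.emplace_back(worker, t);
  }
  for (auto& th : pool) {
    th.join();
  }
  std::printf("each thread saw total %d\n", totals[0]); // 1+2+3+4 = 10
  return 0;
}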