Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 3 additions & 232 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
using namespace o2::gpu;
using namespace o2::gpu::tpccf;

#include "GPUTPCCFClusterizer.inc"

template <>
GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC)
{
Expand All @@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;

GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
}

// Computes one TPC cluster per filtered peak position. Each work item loads
// its peak, accumulates the neighbouring charges via buildCluster(), converts
// the accumulator to a tpc::ClusterNative and, if an output buffer is given,
// sorts it into the per-row buckets.
//
// clusternum        number of valid peaks in filteredPeakPositions
// maxClusterPerRow  bucket capacity per row; also written into
//                   clusterPosInRow as an "invalid/rejected" marker
// clusterByRow      per-row output buckets; may be nullptr (then an already
//                   stored index is read back from clusterPosInRow instead)
// clusterPosInRow   optional per-peak slot recording the in-row index
GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread,
processorType& clusterer,
const CfFragment& fragment,
GPUSharedMemory& smem,
const Array2D<PackedCharge>& chargeMap,
const ChargePos* filteredPeakPositions,
const GPUSettingsRec& calib,
MCLabelAccumulator* labelAcc,
uint32_t clusternum,
uint32_t maxClusterPerRow,
uint32_t* clusterInRow,
tpc::ClusterNative* clusterByRow,
uint32_t* clusterPosInRow)
{
uint32_t idx = get_global_id(0);

// For certain configurations dummy work items are added, so the total
// number of work items is dividable by 64.
// These dummy items also compute the last cluster but discard the result.
ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)];
Charge charge = chargeMap[pos].unpack();

ClusterAccumulator pc;
CPU_ONLY(labelAcc->collect(pos, charge));

// Group-cooperative accumulation: buildCluster uses shared memory and
// barriers, so EVERY work item (including dummies) must execute it before
// any early return below.
buildCluster(
calib,
chargeMap,
pos,
smem.posBcast,
smem.buf,
smem.innerAboveThreshold,
&pc,
labelAcc);

// Dummy items beyond the real peak count stop here, after the group-wide
// work is done.
if (idx >= clusternum) {
return;
}
// Peaks inside the time-fragment overlap region are dropped; mark the slot
// with the out-of-range value maxClusterPerRow so later stages can skip it.
if (fragment.isOverlap(pos.time())) {
if (clusterPosInRow) {
clusterPosInRow[idx] = maxClusterPerRow;
}
return;
}
// Finish the accumulation for this peak using its position and charge.
pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry);

tpc::ClusterNative myCluster;
// toNative() returns false when the cluster cannot be converted/kept;
// such clusters are rejected and marked with the invalid index.
bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param());

if (rejectCluster) {
if (clusterPosInRow) {
clusterPosInRow[idx] = maxClusterPerRow;
}
return;
}

// Store the cluster into its row bucket and remember the in-row index;
// without an output buffer, reuse a previously recorded index if present.
uint32_t rowIndex = 0;
if (clusterByRow != nullptr) {
rowIndex = sortIntoBuckets(
clusterer,
myCluster,
pos.row(),
maxClusterPerRow,
clusterInRow,
clusterByRow);
if (clusterPosInRow != nullptr) {
clusterPosInRow[idx] = rowIndex;
}
} else if (clusterPosInRow) {
rowIndex = clusterPosInRow[idx];
}

// Commit the collected MC labels for this cluster (CPU builds only).
CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow));
}

// Folds the N inner-neighbour charges of this work item's peak (staged in the
// shared buffer buf by a preceding block load) into the cluster accumulator,
// and records a per-item bitmask of which inner neighbours exceed the
// configured inner threshold (bit i <-> cfconsts::InnerNeighbors[i]).
//
// lid  local work-item id (index into buf and innerAboveThreshold)
// N    number of inner neighbours staged per work item
GPUdii() void GPUTPCCFClusterizer::updateClusterInner(
const GPUSettingsRec& calib,
uint16_t lid,
uint16_t N,
const PackedCharge* buf,
const ChargePos& pos,
ClusterAccumulator* cluster,
MCLabelAccumulator* labelAcc,
uint8_t* innerAboveThreshold)
{
uint8_t aboveThreshold = 0;

GPUCA_UNROLL(U(), U())
for (uint16_t i = 0; i < N; i++) {
Delta2 d = cfconsts::InnerNeighbors[i];

// Charges are stored row-major per work item: N entries starting at N*lid.
PackedCharge p = buf[N * lid + i];

Charge q = cluster->updateInner(p, d);

CPU_ONLY(labelAcc->collect(pos.delta(d), q));

// Set bit i when this inner neighbour is above the threshold; the mask
// later gates the conditional outer-neighbour load.
aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i);
}

innerAboveThreshold[lid] = aboveThreshold;

// Make all items' threshold masks visible to the whole work group before
// they are consumed by the following conditional load.
GPUbarrier();
}

// Folds M outer-neighbour charges (staged in the shared buffer buf) into the
// cluster accumulator, starting at neighbour index `offset`.
//
// lid     local id within the current half-group (index into buf)
// N       stride between consecutive work items' entries in buf
// M       number of outer neighbours to process
// offset  first index into cfconsts::OuterNeighbors to use
GPUdii() void GPUTPCCFClusterizer::updateClusterOuter(
uint16_t lid,
uint16_t N,
uint16_t M,
uint16_t offset,
const PackedCharge* buf,
const ChargePos& pos,
ClusterAccumulator* cluster,
MCLabelAccumulator* labelAcc)
{
GPUCA_UNROLL(U(), U())
for (uint16_t i = offset; i < M + offset; i++) {
PackedCharge p = buf[N * lid + i];

Delta2 d = cfconsts::OuterNeighbors[i];

Charge q = cluster->updateOuter(p, d);
static_cast<void>(q); // Avoid unused variable warning on GPU.

// q is only consumed by the CPU-side MC label accumulation.
CPU_ONLY(labelAcc->collect(pos.delta(d), q));
}
}

// Accumulates the charges around one peak into *myCluster.
// The shared-memory staging areas (posBcast, buf, innerAboveThreshold) are
// filled cooperatively by the whole work group, so every work item must call
// this function — including dummy items whose result is discarded.
GPUdii() void GPUTPCCFClusterizer::buildCluster(
const GPUSettingsRec& calib,
const Array2D<PackedCharge>& chargeMap,
ChargePos pos,
ChargePos* posBcast,
PackedCharge* buf,
uint8_t* innerAboveThreshold,
ClusterAccumulator* myCluster,
MCLabelAccumulator* labelAcc)
{
uint16_t ll = get_local_id(0);

// Broadcast every item's peak position to shared memory so the block-wide
// loaders below can read any lane's position.
posBcast[ll] = pos;
GPUbarrier();

// Stage the 8 inner neighbours of all peaks into the shared buffer, then
// fold them into the accumulator and record the above-threshold mask.
CfUtils::blockLoad<PackedCharge>(
chargeMap,
SCRATCH_PAD_WORK_GROUP_SIZE,
SCRATCH_PAD_WORK_GROUP_SIZE,
ll,
0,
8,
cfconsts::InnerNeighbors,
posBcast,
buf);
updateClusterInner(
calib,
ll,
8,
buf,
pos,
myCluster,
labelAcc,
innerAboveThreshold);

// The 16 outer neighbours are handled in two half-group passes (rounded-up
// half) — presumably so the shared buffer only needs half-group capacity
// for the wider outer load; confirm against GPUSharedMemory sizing.
uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2;

bool inGroup1 = ll < wgSizeHalf;

// Index of this item within its half of the group.
uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf);

// First half: load outer charges conditionally, gated by the
// innerAboveThreshold masks recorded above.
CfUtils::condBlockLoad(
chargeMap,
wgSizeHalf,
SCRATCH_PAD_WORK_GROUP_SIZE,
ll,
0,
16,
cfconsts::OuterNeighbors,
posBcast,
innerAboveThreshold,
buf);

if (inGroup1) {
updateClusterOuter(
llhalf,
16,
16,
0,
buf,
pos,
myCluster,
labelAcc);
}

// Second half-group pass, compiled for GPU builds only.
// NOTE(review): on non-GPU builds only the first-half path above runs here —
// presumably the CPU variant of CfUtils::condBlockLoad covers both halves;
// confirm against CfUtils.
#if defined(GPUCA_GPUCODE)
CfUtils::condBlockLoad(
chargeMap,
wgSizeHalf,
SCRATCH_PAD_WORK_GROUP_SIZE,
ll,
0,
16,
cfconsts::OuterNeighbors,
posBcast + wgSizeHalf,
innerAboveThreshold + wgSizeHalf,
buf);
if (!inGroup1) {
updateClusterOuter(
llhalf,
16,
16,
0,
buf,
pos,
myCluster,
labelAcc);
}
#endif
}

// Atomically appends `cluster` to the per-row bucket `row` and returns the
// slot index it was assigned (which may be >= maxElemsPerBucket on overflow).
// On overflow the cluster is dropped, an error is raised, and the row's
// element counter is clamped back to the bucket capacity.
GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets)
{
// Reserve a slot in this row's bucket.
const uint32_t slot = CAMath::AtomicAdd(&elemsInBucket[row], 1u);
if (slot >= maxElemsPerBucket) {
// Bucket full: report the overflow and clamp the counter so it does not
// keep growing past the capacity.
clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, slot, maxElemsPerBucket);
CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket);
} else {
buckets[maxElemsPerBucket * row + slot] = cluster;
}
return slot;
}
}
8 changes: 4 additions & 4 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate

static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D<PackedCharge>&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*);

static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);

static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);

private:
static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*);

static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*);

static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D<PackedCharge>&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*);

static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*);
};

} // namespace o2::gpu
Expand Down
Loading