Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
}
if (mProcessingSettings.deterministicGPUReconstruction) {
#ifndef GPUCA_DETERMINISTIC_MODE
GPUError("Warning, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
#endif
mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
param().rec.tpc.nWaysOuter = true;
Expand All @@ -274,6 +274,10 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
mProcessingSettings.createO2Output = 1;
}
mProcessingSettings.rtc.deterministic = 1;
} else {
#ifdef GPUCA_DETERMINISTIC_MODE
GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
#endif
}
if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
mProcessingSettings.nTPCClustererLanes = 1;
Expand Down
12 changes: 12 additions & 0 deletions GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,12 @@
// Number of upper-row neighbour candidates per thread that GPUTPCNeighboursFinder keeps in its
// mA1 / mA2 / mB arrays (this value is their first dimension). Candidates found beyond this
// count are stored in the kernel's local neighUp / yzUp arrays instead.
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6
#endif
// Unroll factor for the GPUTPCNeighboursFinder loop over the neighUp / yzUp candidates.
// The kernel clamps values <= 1 to 1, static_asserts that GPUCA_MAXN is divisible by the
// factor, and pads the candidate list with sentinel entries up to a multiple of it.
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 4
#endif
// Non-zero: unused mA1 / mA2 / mB slots are filled with sentinel values and the search loop
// always iterates over all GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP slots (fixed trip count).
// Zero: no sentinel fill; the loop stops at min(number of candidates found, MAX_NNEIGHUP).
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 1
#endif
#ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 12
#endif
Expand Down Expand Up @@ -544,6 +550,12 @@
// Zero-valued defaults, applied only when the parameter has not been defined earlier.
// NOTE(review): presumably the fallback parameter set for builds without tuned values - confirm.
//
// MAX_NNEIGHUP == 0: GPUTPCNeighboursFinder stores every neighbour candidate in its local
// neighUp / yzUp arrays and none in mA1 / mA2 / mB.
// NOTE(review): mA1 / mA2 / mB use this value as their first array dimension; with 0 that is a
// zero-length array - confirm every target compiler accepts it.
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0
#endif
// 0 is clamped to an unroll factor of 1 by GPUTPCNeighboursFinder (no sentinel padding needed).
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 0
#endif
// 0: no sentinel fill of unused mA1 / mA2 / mB slots; the search loop over them is bounded by
// min(number of candidates found, MAX_NNEIGHUP).
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 0
#endif
#ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 0
#endif
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for m
AddOption(debugMask, int32_t, 262143, "", 0, "Mask for debug output dumps to file")
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6")
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics")
AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression")
AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event")
Expand Down
155 changes: 75 additions & 80 deletions GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
return;
}

#define UnrollGlobal 4
#define MaxShared GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
#if MaxShared < GPUCA_MAXN
#define MaxGlobal ((GPUCA_MAXN - MaxShared - 1) / UnrollGlobal + 1) * UnrollGlobal
#else
#define MaxGlobal 0
#endif
#define MaxTotal MaxShared + MaxGlobal
static constexpr uint32_t UNROLL_GLOBAL = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL > 1 ? GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL : 1;
static_assert(GPUCA_MAXN % UNROLL_GLOBAL == 0);
static constexpr uint32_t MAX_SHARED = GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP;
static constexpr uint32_t MAX_GLOBAL = (MAX_SHARED < GPUCA_MAXN) ? (((GPUCA_MAXN - MAX_SHARED - 1) / UNROLL_GLOBAL + 1) * UNROLL_GLOBAL) : 0;
static constexpr uint32_t MAX_TOTAL = MAX_SHARED + MAX_GLOBAL;

const float chi2Cut = 3.f * 3.f * 4 * (s.mUpDx * s.mUpDx + s.mDnDx * s.mDnDx);
// float chi2Cut = 3.f*3.f*(s.mUpDx*s.mUpDx + s.mDnDx*s.mDnDx ); //SG
Expand Down Expand Up @@ -117,18 +114,16 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
const float kAreaSlopeZUp = kAngularMultiplier != 0.f ? 1.f : s.mUpTx;
const float kAreaSlopeZDn = kAngularMultiplier != 0.f ? 1.f : s.mDnTx;

#if MaxGlobal > 0
calink neighUp[MaxGlobal];
float yzUp[2 * MaxGlobal];
#endif
calink neighUp[MAX_GLOBAL];
float yzUp[2 * MAX_GLOBAL];

for (int32_t ih = iThread; ih < s.mNHits; ih += nThreads) {

const GPUglobalref() cahit2& hitData = pHitData[lHitNumberOffset + ih];
const float y = y0 + hitData.x * stepY;
const float z = z0 + hitData.y * stepZ;

int32_t nNeighUp = 0;
uint32_t nNeighUp = 0;
float minZ, maxZ, minY, maxY;
int32_t binYmin, binYmax, binZmin, binZmax;
int32_t nY;
Expand All @@ -145,11 +140,11 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
nY = rowUp.Grid().Ny();
}

for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MaxTotal); k1++) {
for (int32_t k1 = binZmin; k1 <= binZmax && (nNeighUp < MAX_TOTAL); k1++) {
int32_t iMin = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmin];
int32_t iMax = lFirstHitInBin[lFirstHitInBinOffsetUp + k1 * nY + binYmax + 1];
GPUCA_UNROLL(U(4), U(2))
for (int32_t i = iMin; i < iMax && (nNeighUp < MaxTotal); i++) {
for (int32_t i = iMin; i < iMax && (nNeighUp < MAX_TOTAL); i++) {
const GPUglobalref() cahit2& hitDataUp = pHitData[lHitNumberOffsetUp + i];
GPUTPCHit h;
h.mY = y0Up + (hitDataUp.x) * stepYUp;
Expand All @@ -159,51 +154,48 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
continue;
}

#if MaxGlobal > 0
#if MaxShared == 0
if (true) {
#else
if (nNeighUp >= MaxShared) {
#endif
neighUp[nNeighUp - MaxShared] = (calink)i;
yzUp[2 * (nNeighUp - MaxShared)] = s.mDnDx * (h.Y() - y);
yzUp[2 * (nNeighUp - MaxShared) + 1] = s.mDnDx * (h.Z() - z);
} else
#endif
{
#if MaxShared > 0
s.mB[nNeighUp][iThread] = (calink)i;
s.mA1[nNeighUp][iThread] = s.mDnDx * (h.Y() - y);
s.mA2[nNeighUp][iThread] = s.mDnDx * (h.Z() - z);
#endif
const bool inGlobal = nNeighUp >= MAX_SHARED;
if constexpr (MAX_GLOBAL > 0) {
if (inGlobal) {
neighUp[nNeighUp - MAX_SHARED] = (calink)i;
yzUp[2 * (nNeighUp - MAX_SHARED)] = s.mDnDx * (h.Y() - y);
yzUp[2 * (nNeighUp - MAX_SHARED) + 1] = s.mDnDx * (h.Z() - z);
}
}
if constexpr (MAX_SHARED > 0) {
if (!inGlobal) {
s.mB[nNeighUp][iThread] = (calink)i;
s.mA1[nNeighUp][iThread] = s.mDnDx * (h.Y() - y);
s.mA2[nNeighUp][iThread] = s.mDnDx * (h.Z() - z);
}
}
nNeighUp++;
}
}

#if MaxShared > 0 // init a rest of the shared array
for (int32_t iUp = nNeighUp; iUp < MaxShared; iUp++) {
s.mA1[iUp][iThread] = -1.e10f;
s.mA2[iUp][iThread] = -1.e10f;
s.mB[iUp][iThread] = (calink)-1;
if constexpr (MAX_SHARED > 0 && GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED) { // init the rest of the shared array
for (uint32_t iUp = nNeighUp; iUp < MAX_SHARED; iUp++) {
s.mA1[iUp][iThread] = -1.e10f;
s.mA2[iUp][iThread] = -1.e10f;
s.mB[iUp][iThread] = (calink)-1;
}
}
#endif

#if MaxGlobal > 0 // init a rest of the UnrollGlobal chunk of the global array
int32_t Nrest = nNeighUp - MaxShared;
int32_t N4 = (Nrest / UnrollGlobal) * UnrollGlobal;
if (N4 < Nrest) {
N4 += UnrollGlobal;
GPUCA_UNROLL(U(UnrollGlobal - 1), U(UnrollGlobal - 1))
for (int32_t k = 0; k < UnrollGlobal - 1; k++) {
if (Nrest + k < N4) {
yzUp[2 * (Nrest + k)] = -1.e10f;
yzUp[2 * (Nrest + k) + 1] = -1.e10f;
neighUp[Nrest + k] = (calink)-1;
const uint32_t nRest = nNeighUp - MAX_SHARED;
uint32_t nRestUnrolled = (nRest / UNROLL_GLOBAL) * UNROLL_GLOBAL;
if constexpr (MAX_GLOBAL > 1) { // init the rest of the UNROLL_GLOBAL chunk of the global array
if (nNeighUp > MAX_SHARED && nRestUnrolled < nRest) {
nRestUnrolled += UNROLL_GLOBAL;
GPUCA_UNROLL(U(UNROLL_GLOBAL - 1), U(UNROLL_GLOBAL - 1))
for (uint32_t k = 0; k + 1 < UNROLL_GLOBAL; k++) {
if (nRest + k < nRestUnrolled) {
yzUp[2 * (nRest + k)] = -1.e10f;
yzUp[2 * (nRest + k) + 1] = -1.e10f;
neighUp[nRest + k] = (calink)-1;
}
}
}
}
#endif

{ // area in the lower row
const float yy = y * s.mDnTx;
Expand Down Expand Up @@ -236,47 +228,50 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
float yDnProjUp = s.mUpDx * (yDn - y);
float zDnProjUp = s.mUpDx * (zDn - z);

#if MaxShared > 0
GPUCA_UNROLL(U(MaxShared), U(MaxShared))
for (int32_t iUp = 0; iUp < MaxShared; iUp++) {
const float dy = yDnProjUp - s.mA1[iUp][iThread];
const float dz = zDnProjUp - s.mA2[iUp][iThread];
const float d = dy * dy + dz * dz;
if (d < bestD) {
bestD = d;
linkDn = i;
linkUp = iUp;
}
}
#endif

#if MaxGlobal > 0
for (int32_t iUp = 0; iUp < N4; iUp += UnrollGlobal) {
GPUCA_UNROLL(U(UnrollGlobal), U(UnrollGlobal))
for (int32_t k = 0; k < UnrollGlobal; k++) {
int32_t jUp = iUp + k;
const float dy = yDnProjUp - yzUp[2 * jUp];
const float dz = zDnProjUp - yzUp[2 * jUp + 1];
if constexpr (MAX_SHARED > 0) {
const uint32_t maxSharedUp = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED ? MAX_SHARED : CAMath::Min(nNeighUp, MAX_SHARED);
GPUCA_UNROLL(U(MAX_SHARED), U(MAX_SHARED))
for (uint32_t iUp = 0; iUp < maxSharedUp; iUp++) {
const float dy = yDnProjUp - s.mA1[iUp][iThread];
const float dz = zDnProjUp - s.mA2[iUp][iThread];
const float d = dy * dy + dz * dz;
if (d < bestD) {
bestD = d;
linkDn = i;
linkUp = MaxShared + jUp;
linkUp = iUp;
}
}
}

if constexpr (MAX_GLOBAL > 0) {
if (nNeighUp > MAX_SHARED) {
for (uint32_t iUp = 0; iUp < nRestUnrolled; iUp += UNROLL_GLOBAL) {
GPUCA_UNROLL(U(UNROLL_GLOBAL), U(UNROLL_GLOBAL))
for (uint32_t k = 0; k < UNROLL_GLOBAL; k++) {
const uint32_t jUp = iUp + k;
const float dy = yDnProjUp - yzUp[2 * jUp];
const float dz = zDnProjUp - yzUp[2 * jUp + 1];
const float d = dy * dy + dz * dz;
if (d < bestD) {
bestD = d;
linkDn = i;
linkUp = MAX_SHARED + jUp;
}
}
}
}
}
#endif
}
}

if (linkUp >= 0) {
#if MaxShared > 0 && MaxGlobal > 0
linkUp = (linkUp >= MaxShared) ? neighUp[linkUp - MaxShared] : s.mB[linkUp][iThread];
#elif MaxShared > 0
linkUp = s.mB[linkUp][iThread];
#else
linkUp = neighUp[linkUp];
#endif
if constexpr (MAX_SHARED > 0 && MAX_GLOBAL > 0) {
linkUp = ((uint32_t)linkUp >= MAX_SHARED) ? neighUp[linkUp - MAX_SHARED] : s.mB[linkUp][iThread];
} else if constexpr (MAX_SHARED > 0) {
linkUp = s.mB[linkUp][iThread];
} else {
linkUp = neighUp[linkUp];
}
}

tracker.mData.mLinkUpData[lHitNumberOffset + ih] = linkUp;
Expand Down
2 changes: 0 additions & 2 deletions GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,10 @@ class GPUTPCNeighboursFinder : public GPUKernelTemplate
int32_t mIRow; // row number
int32_t mIRowUp; // next row number
int32_t mIRowDn; // previous row number
#if GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP > 0
static_assert(GPUCA_MAXN >= GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP);
float mA1[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
float mA2[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
calink mB[GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP][GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNeighboursFinder)];
#endif
GPUTPCRow mRow, mRowUp, mRowDown;
};

Expand Down
50 changes: 26 additions & 24 deletions GPU/GPUTracking/SectorTracker/GPUTPCStartHitsFinder.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -39,36 +39,38 @@ GPUdii() void GPUTPCStartHitsFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nThr
uint32_t linkUpData = tracker.mData.mLinkUpData[lHitNumberOffset + ih];

if (tracker.mData.mLinkDownData[lHitNumberOffset + ih] == CALINK_INVAL && linkUpData != CALINK_INVAL && tracker.mData.mLinkUpData[rowUp.mHitNumberOffset + linkUpData] != CALINK_INVAL) {
#if GPUCA_PAR_SORT_STARTHITS > 0
GPUglobalref() GPUTPCHitId* const GPUrestrict() startHits = tracker.mTrackletTmpStartHits + s.mIRow * tracker.mNMaxRowStartHits;
uint32_t nextRowStartHits = CAMath::AtomicAddShared(&s.mNRowStartHits, 1u);
if (nextRowStartHits >= tracker.mNMaxRowStartHits) {
tracker.raiseError(GPUErrors::ERROR_ROWSTARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxRowStartHits);
CAMath::AtomicExchShared(&s.mNRowStartHits, tracker.mNMaxRowStartHits);
break;
GPUglobalref() GPUTPCHitId* GPUrestrict() startHits;
uint32_t nextRowStartHits;
if constexpr (GPUCA_PAR_SORT_STARTHITS > 0) {
startHits = tracker.mTrackletTmpStartHits + s.mIRow * tracker.mNMaxRowStartHits;
nextRowStartHits = CAMath::AtomicAddShared(&s.mNRowStartHits, 1u);
if (nextRowStartHits >= tracker.mNMaxRowStartHits) {
tracker.raiseError(GPUErrors::ERROR_ROWSTARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxRowStartHits);
CAMath::AtomicExchShared(&s.mNRowStartHits, tracker.mNMaxRowStartHits);
break;
}
} else {
startHits = tracker.mTrackletStartHits;
nextRowStartHits = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, 1u);
if (nextRowStartHits >= tracker.mNMaxStartHits) {
tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxStartHits);
CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
break;
}
}
#else
GPUglobalref() GPUTPCHitId* const GPUrestrict() startHits = tracker.mTrackletStartHits;
uint32_t nextRowStartHits = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, 1u);
if (nextRowStartHits >= tracker.mNMaxStartHits) {
tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nextRowStartHits, tracker.mNMaxStartHits);
CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
break;
}
#endif
startHits[nextRowStartHits].Set(s.mIRow, ih);
}
}
GPUbarrier();

#if GPUCA_PAR_SORT_STARTHITS > 0
if (iThread == 0) {
uint32_t nOffset = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, s.mNRowStartHits);
tracker.mRowStartHitCountOffset[s.mIRow] = s.mNRowStartHits;
if (nOffset + s.mNRowStartHits > tracker.mNMaxStartHits) {
tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nOffset + s.mNRowStartHits, tracker.mNMaxStartHits);
CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
if constexpr (GPUCA_PAR_SORT_STARTHITS > 0) {
if (iThread == 0) {
uint32_t nOffset = CAMath::AtomicAdd(&tracker.mCommonMem->nStartHits, s.mNRowStartHits);
tracker.mRowStartHitCountOffset[s.mIRow] = s.mNRowStartHits;
if (nOffset + s.mNRowStartHits > tracker.mNMaxStartHits) {
tracker.raiseError(GPUErrors::ERROR_STARTHIT_OVERFLOW, tracker.ISector() * 1000 + s.mIRow, nOffset + s.mNRowStartHits, tracker.mNMaxStartHits);
CAMath::AtomicExch(&tracker.mCommonMem->nStartHits, tracker.mNMaxStartHits);
}
}
}
#endif
}
Loading