Skip to content

Commit 2ab6000

Browse files
committed
GPU: Make some of the optimizations AMD did for the neighbors finder for MI50 optional
1 parent e966e71 commit 2ab6000

File tree

5 files changed

+34
-15
lines changed

5 files changed

+34
-15
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
263263
}
264264
if (mProcessingSettings.deterministicGPUReconstruction) {
265265
#ifndef GPUCA_DETERMINISTIC_MODE
266-
GPUError("Warning, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
266+
GPUError("WARNING, deterministicGPUReconstruction needs GPUCA_DETERMINISTIC_MODE for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
267267
#endif
268268
mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
269269
param().rec.tpc.nWaysOuter = true;
@@ -274,6 +274,10 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
274274
mProcessingSettings.createO2Output = 1;
275275
}
276276
mProcessingSettings.rtc.deterministic = 1;
277+
} else {
278+
#ifdef GPUCA_DETERMINISTIC_MODE
279+
GPUError("WARNING, compiled with GPUCA_DETERMINISTIC_MODE but deterministicGPUReconstruction not set, only compile-time determinism and deterministic math enforced, not fully deterministic!");
280+
#endif
277281
}
278282
if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
279283
mProcessingSettings.nTPCClustererLanes = 1;

GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,12 @@
516516
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
517517
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 6
518518
#endif
519+
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
520+
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 4
521+
#endif
522+
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
523+
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 1
524+
#endif
519525
#ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
520526
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 12
521527
#endif
@@ -544,6 +550,12 @@
544550
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP
545551
#define GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP 0
546552
#endif
553+
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL
554+
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL 0
555+
#endif
556+
#ifndef GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED
557+
#define GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED 0
558+
#endif
547559
#ifndef GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE
548560
#define GPUCA_PAR_TRACKLET_SELECTOR_HITS_REG_SIZE 0
549561
#endif

GPU/GPUTracking/Definitions/GPUSettingsList.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for m
284284
AddOption(debugMask, int32_t, 262143, "", 0, "Mask for debug output dumps to file")
285285
AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
286286
AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
287-
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6")
287+
AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
288288
AddOption(showOutputStat, bool, false, "", 0, "Print some track output statistics")
289289
AddOption(runCompressionStatistics, bool, false, "compressionStat", 0, "Run statistics and verification for cluster compression")
290290
AddOption(resetTimers, int8_t, 1, "", 0, "Reset timers every event")

GPU/GPUTracking/SectorTracker/GPUTPCNeighboursFinder.cxx

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
7575
return;
7676
}
7777

78-
static constexpr uint32_t UNROLL_GLOBAL = 4;
78+
static constexpr uint32_t UNROLL_GLOBAL = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL > 1 ? GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_GLOBAL : 1;
7979
static_assert(GPUCA_MAXN % UNROLL_GLOBAL == 0);
8080
static constexpr uint32_t MAX_SHARED = GPUCA_PAR_NEIGHBOURS_FINDER_MAX_NNEIGHUP;
8181
static constexpr uint32_t MAX_GLOBAL = (MAX_SHARED < GPUCA_MAXN) ? (((GPUCA_MAXN - MAX_SHARED - 1) / UNROLL_GLOBAL + 1) * UNROLL_GLOBAL) : 0;
@@ -173,25 +173,25 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
173173
}
174174
}
175175

176-
if constexpr (MAX_SHARED > 0) { // init the rest of the shared array
176+
if constexpr (MAX_SHARED > 0 && GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED) { // init the rest of the shared array
177177
for (uint32_t iUp = nNeighUp; iUp < MAX_SHARED; iUp++) {
178178
s.mA1[iUp][iThread] = -1.e10f;
179179
s.mA2[iUp][iThread] = -1.e10f;
180180
s.mB[iUp][iThread] = (calink)-1;
181181
}
182182
}
183183

184-
const uint32_t Nrest = nNeighUp - MAX_SHARED;
185-
uint32_t N4 = (Nrest / UNROLL_GLOBAL) * UNROLL_GLOBAL;
186-
if constexpr (MAX_GLOBAL > 0) { // init the rest of the UNROLL_GLOBAL chunk of the global array
187-
if (nNeighUp > MAX_SHARED && N4 < Nrest) {
188-
N4 += UNROLL_GLOBAL;
184+
const uint32_t nRest = nNeighUp - MAX_SHARED;
185+
uint32_t nRestUnrolled = (nRest / UNROLL_GLOBAL) * UNROLL_GLOBAL;
186+
if constexpr (MAX_GLOBAL > 1) { // init the rest of the UNROLL_GLOBAL chunk of the global array
187+
if (nNeighUp > MAX_SHARED && nRestUnrolled < nRest) {
188+
nRestUnrolled += UNROLL_GLOBAL;
189189
GPUCA_UNROLL(U(UNROLL_GLOBAL - 1), U(UNROLL_GLOBAL - 1))
190190
for (uint32_t k = 0; k + 1 < UNROLL_GLOBAL; k++) {
191-
if (Nrest + k < N4) {
192-
yzUp[2 * (Nrest + k)] = -1.e10f;
193-
yzUp[2 * (Nrest + k) + 1] = -1.e10f;
194-
neighUp[Nrest + k] = (calink)-1;
191+
if (nRest + k < nRestUnrolled) {
192+
yzUp[2 * (nRest + k)] = -1.e10f;
193+
yzUp[2 * (nRest + k) + 1] = -1.e10f;
194+
neighUp[nRest + k] = (calink)-1;
195195
}
196196
}
197197
}
@@ -229,8 +229,9 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
229229
float zDnProjUp = s.mUpDx * (zDn - z);
230230

231231
if constexpr (MAX_SHARED > 0) {
232+
const uint32_t maxSharedUp = GPUCA_PAR_NEIGHBOURS_FINDER_UNROLL_SHARED ? MAX_SHARED : CAMath::Min(nNeighUp, MAX_SHARED);
232233
GPUCA_UNROLL(U(MAX_SHARED), U(MAX_SHARED))
233-
for (uint32_t iUp = 0; iUp < MAX_SHARED; iUp++) {
234+
for (uint32_t iUp = 0; iUp < maxSharedUp; iUp++) {
234235
const float dy = yDnProjUp - s.mA1[iUp][iThread];
235236
const float dz = zDnProjUp - s.mA2[iUp][iThread];
236237
const float d = dy * dy + dz * dz;
@@ -244,7 +245,7 @@ GPUdii() void GPUTPCNeighboursFinder::Thread<0>(int32_t /*nBlocks*/, int32_t nTh
244245

245246
if constexpr (MAX_GLOBAL > 0) {
246247
if (nNeighUp > MAX_SHARED) {
247-
for (uint32_t iUp = 0; iUp < N4; iUp += UNROLL_GLOBAL) {
248+
for (uint32_t iUp = 0; iUp < nRestUnrolled; iUp += UNROLL_GLOBAL) {
248249
GPUCA_UNROLL(U(UNROLL_GLOBAL), U(UNROLL_GLOBAL))
249250
for (uint32_t k = 0; k < UNROLL_GLOBAL; k++) {
250251
const uint32_t jUp = iUp + k;

GPU/GPUTracking/kernels.cmake

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,8 @@ o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALR
136136
o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB)
137137

138138
o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP
139+
NEIGHBOURS_FINDER_UNROLL_GLOBAL
140+
NEIGHBOURS_FINDER_UNROLL_SHARED
139141
TRACKLET_SELECTOR_HITS_REG_SIZE
140142
ALTERNATE_BORDER_SORT
141143
SORT_BEFORE_FIT

0 commit comments

Comments
 (0)