Skip to content

Commit 490589c

Browse files
cima22davidrohr
authored andcommitted
GPU Framework: fixed mismatch between CUDA and HIP launch bounds definitions
1 parent 3b68b1c commit 490589c

10 files changed

+41
-18
lines changed

GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,21 @@
3232
#define GPUCA_M_KRNL_NAME(...) GPUCA_M_KRNL_NAME_A(GPUCA_M_STRIP(__VA_ARGS__))
3333

3434
#if defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_HOSTONLY)
35+
36+
#if defined(__HIPCC__) && !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
37+
static_assert(GPUCA_PAR_AMD_EUS_PER_CU > 0);
38+
#define GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU) GPUCA_CEIL_INT_DIV((minBlocksPerCU) * (maxThreadsPerBlock), (GPUCA_WARP_SIZE * GPUCA_PAR_AMD_EUS_PER_CU))
39+
40+
#define GPUCA_LB_ARGS_1(maxThreadsPerBlock) maxThreadsPerBlock
41+
#define GPUCA_LB_ARGS_2(maxThreadsPerBlock, minBlocksPerCU) maxThreadsPerBlock, GPUCA_MIN_WARPS_PER_EU(maxThreadsPerBlock, minBlocksPerCU)
42+
43+
#define GPUCA_LAUNCH_BOUNDS_SELECT(n, ...) GPUCA_M_CAT(GPUCA_LB_ARGS_, n)(__VA_ARGS__)
44+
#define GPUCA_LAUNCH_BOUNDS_DISP(...) GPUCA_LAUNCH_BOUNDS_SELECT(GPUCA_M_COUNT(__VA_ARGS__), __VA_ARGS__)
45+
#define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_LAUNCH_BOUNDS_DISP(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args))))
46+
#elif !defined(GPUCA_GPUCODE_NO_LAUNCH_BOUNDS)
47+
#define GPUCA_KRNL_REG_DEFAULT(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
48+
#endif
49+
3550
#ifndef GPUCA_KRNL_REG
3651
#define GPUCA_KRNL_REG(...)
3752
#endif

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
7474
}
7575
fclose(fp);
7676
}
77-
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true) +
77+
const std::string launchBounds = o2::gpu::internal::GPUDefParametersExport(*mParDevice, true, mParDevice->par_AMD_EUS_PER_CU ? (mParDevice->par_AMD_EUS_PER_CU * mWarpSize) : 0) +
7878
"#define GPUCA_WARP_SIZE " + std::to_string(mWarpSize) + "\n";
7979
if (GetProcessingSettings().rtctech.printLaunchBounds || GetProcessingSettings().debugLevel >= 3) {
8080
GPUInfo("RTC Launch Bounds:\n%s", launchBounds.c_str());

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, c
7474
}
7575

7676
#undef GPUCA_KRNL_REG
77-
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
77+
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)
7878

7979
// clang-format off
8080
#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ----------

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDARTCCalls.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#define GPUCA_GPUCODE_HOSTONLY
1616
#define GPUCA_GPUCODE_NO_LAUNCH_BOUNDS
1717

18-
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
18+
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_STRIP(args))
1919

2020
#include "GPUReconstructionCUDAIncludesSystem.h"
2121
#include "GPUReconstructionCUDADef.h"

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAkernel.template.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
#define GPUCA_GPUCODE_COMPILEKERNELS
1616
#include "GPUReconstructionCUDAIncludesSystem.h"
17-
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
17+
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)
1818
#define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
1919
#include "GPUReconstructionCUDADef.h"
2020
#include "GPUReconstructionKernelMacros.h"

GPU/GPUTracking/Base/hip/GPUReconstructionHIPkernel.template.hip

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
#define GPUCA_GPUCODE_COMPILEKERNELS
1616
#include "GPUReconstructionHIPIncludesSystem.h"
17-
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))
17+
#define GPUCA_KRNL_REG(...) GPUCA_KRNL_REG_DEFAULT(__VA_ARGS__)
1818
#define GPUCA_KRNL(...) GPUCA_KRNLGPU(__VA_ARGS__);
1919
#include "GPUReconstructionHIPDef.h"
2020
#include "GPUReconstructionKernelMacros.h"

GPU/GPUTracking/Definitions/GPUDefMacros.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,7 @@
5050
#define GPUCA_UNROLL(...)
5151
#endif
5252

53+
#define GPUCA_CEIL_INT_DIV(a, b) (((a) + (b) - 1) / (b))
54+
5355
#endif
5456
// clang-format on

GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
// GPU-architecture-dependent default settings
2626
#if defined(GPUCA_GPUTYPE_MI2xx)
2727
#define GPUCA_WARP_SIZE 64
28+
#define GPUCA_PAR_AMD_EUS_PER_CU 4
2829
#define GPUCA_THREAD_COUNT_DEFAULT 256
2930
#define GPUCA_LB_GPUTPCCreateTrackingData 256
3031
#define GPUCA_LB_GPUTPCStartHitsSorter 512, 1
@@ -87,6 +88,7 @@
8788
#define GPUCA_PAR_COMP_GATHER_MODE 3
8889
#elif defined(GPUCA_GPUTYPE_VEGA)
8990
#define GPUCA_WARP_SIZE 64
91+
#define GPUCA_PAR_AMD_EUS_PER_CU 4
9092
#define GPUCA_THREAD_COUNT_DEFAULT 256
9193
#define GPUCA_LB_GPUTPCCreateTrackingData 128
9294
#define GPUCA_LB_GPUTPCStartHitsSorter 1024, 2
@@ -272,6 +274,9 @@
272274
#ifndef GPUCA_WARP_SIZE
273275
#define GPUCA_WARP_SIZE 32
274276
#endif
277+
#ifndef GPUCA_PAR_AMD_EUS_PER_CU
278+
#define GPUCA_PAR_AMD_EUS_PER_CU 0
279+
#endif
275280
#ifndef GPUCA_THREAD_COUNT_DEFAULT
276281
#define GPUCA_THREAD_COUNT_DEFAULT 256
277282
#endif

GPU/GPUTracking/Definitions/GPUDefParametersLoad.template.inc

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,23 +39,23 @@ static GPUDefParameters GPUDefParametersLoad()
3939
};
4040
}
4141

42-
#define GPUCA_EXPORT_KERNEL_LB(name) \
43-
if (par.par_LB_maxThreads[i] > 0) { \
44-
o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
45-
if (par.par_LB_minBlocks[i] > 0) { \
46-
o << ", " << par.par_LB_minBlocks[i]; \
47-
} \
48-
if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \
49-
o << ", " << par.par_LB_forceBlocks[i]; \
50-
} \
51-
o << "\n"; \
52-
} \
42+
#define GPUCA_EXPORT_KERNEL_LB(name) \
43+
if (par.par_LB_maxThreads[i] > 0) { \
44+
o << "#define GPUCA_LB_" GPUCA_M_STR(name) " " << par.par_LB_maxThreads[i]; \
45+
if (par.par_LB_minBlocks[i] > 0) { \
46+
o << ", " << GPUCA_CEIL_INT_DIV(par.par_LB_maxThreads[i] * par.par_LB_minBlocks[i], (minBlockFactor ? minBlockFactor : par.par_LB_maxThreads[i])); \
47+
} \
48+
if (!forRTC && par.par_LB_forceBlocks[i] > 0) { \
49+
o << ", " << par.par_LB_forceBlocks[i]; \
50+
} \
51+
o << "\n"; \
52+
} \
5353
i++;
5454

5555
#define GPUCA_EXPORT_KERNEL_PARAM(name) \
5656
o << "#define GPUCA_PAR_" GPUCA_M_STR(name) " " << GPUCA_M_CAT(par.par_, name) << "\n";
5757

58-
static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC)
58+
static std::string GPUDefParametersExport(const GPUDefParameters& par, bool forRTC, int32_t minBlockFactor = 0)
5959
{
6060
std::stringstream o; // clang-format off
6161
int32_t i = 0;

GPU/GPUTracking/kernels.cmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,8 @@ o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP # Number of neighhbo
145145
COMP_GATHER_KERNEL # Default kernel to use for Compression Gather Operation [0 - 4]
146146
COMP_GATHER_MODE # TPC Compression Gather Mode [0 - 3]
147147
SORT_STARTHITS # Sort start hits to improve cache locality during tracklet construction [0/1]
148-
CF_SCAN_WORKGROUP_SIZE) # Work group size to use in clusterizer scan operation
148+
CF_SCAN_WORKGROUP_SIZE # Work group size to use in clusterizer scan operation
149+
AMD_EUS_PER_CU) # Number of SIMD units per Compute Unit (only for AMD GPUs)
149150

150151
o2_gpu_kernel_add_string_parameter(DEDX_STORAGE_TYPE # Data type to use for intermediate storage of dEdx truncated mean inputs
151152
MERGER_INTERPOLATION_ERROR_TYPE) # Data type for storing intermediate track residuals for interpolation

0 commit comments

Comments
 (0)