
Commit eec0f01

GPU: Remove obsolete preprocessor magic to create 2 different kernels for single-slice and multi-slice
1 parent: 9855ba3

15 files changed: +201 −300 lines
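
The heart of the change is the kernel run range: a launch no longer carries a (start, num) span of sectors that each backend loops over, but a single processor index. A minimal before/after sketch of the type, reduced from the GPUReconstructionKernels.h diff below (the Old suffix and the comments are added here for illustration):

// Before: one launch could address a span of processor instances.
struct krnlRunRangeOld {
  uint32_t start = 0; // first sector / processor instance
  int32_t num = 0;    // span length; 0 and -1 both meant a single instance
};

// After: one launch addresses exactly one processor instance.
struct krnlRunRange {
  constexpr krnlRunRange() = default;
  constexpr krnlRunRange(uint32_t v) : index(v) {}
  uint32_t index = 0;
};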

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 17 additions & 20 deletions
@@ -66,28 +66,25 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
   if (x.nThreads != 1) {
     throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
   }
-  uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
-  for (uint32_t k = 0; k < num; k++) {
-    int32_t nThreads = getNKernelHostThreads(false);
-    if (nThreads > 1) {
-      if (mProcessingSettings.debugLevel >= 5) {
-        printf("Running %d Threads\n", nThreads);
-      }
-      tbb::this_task_arena::isolate([&] {
-        mThreading->activeThreads->execute([&] {
-          tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
-            typename T::GPUSharedMemory smem;
-            for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
-              T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-            }
-          });
+  int32_t nThreads = getNKernelHostThreads(false);
+  if (nThreads > 1) {
+    if (mProcessingSettings.debugLevel >= 5) {
+      printf("Running %d Threads\n", nThreads);
+    }
+    tbb::this_task_arena::isolate([&] {
+      mThreading->activeThreads->execute([&] {
+        tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
+          typename T::GPUSharedMemory smem;
+          for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
+            T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
+          }
         });
       });
-    } else {
-      for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
-        typename T::GPUSharedMemory smem;
-        T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
-      }
+    });
+  } else {
+    for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
+      typename T::GPUSharedMemory smem;
+      T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.index], args...);
     }
   }
 }
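
With the per-launch sector loop gone, the host fallback runs each kernel against exactly one processor instance, selected by y.index instead of y.start + k. A stripped-down model of the new control flow (toy types, sequential instead of TBB; Processor, KrnlExec, and runHostKernel are illustrative stand-ins, not the real API):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

struct KrnlExec { uint32_t nBlocks = 1; uint32_t nThreads = 1; };
struct KrnlRunRange { uint32_t index = 0; };
struct Processor { uint32_t id = 0; };

// Models runKernelBackendInternal after the commit: no loop over a sector
// count, just the one processor instance picked by y.index.
void runHostKernel(const KrnlExec& x, const KrnlRunRange& y, std::vector<Processor>& processors)
{
  if (x.nThreads != 1) {
    throw std::runtime_error("Cannot run device kernel on host with nThreads != 1");
  }
  Processor& proc = processors[y.index]; // formerly processors[y.start + k] inside a loop
  for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
    printf("block %u on processor %u\n", iB, proc.id);
  }
}

int main()
{
  std::vector<Processor> processors{{0}, {1}, {2}};
  runHostKernel({4, 1}, {2}, processors); // 4 blocks, all on processor 2
}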

GPU/GPUTracking/Base/GPUReconstructionCPU.h

Lines changed: 3 additions & 3 deletions
@@ -49,7 +49,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

  public:
   ~GPUReconstructionCPU() override;
-  static constexpr krnlRunRange krnlRunRangeNone{0, -1};
+  static constexpr krnlRunRange krnlRunRangeNone{0};
   static constexpr krnlEvent krnlEventNone = krnlEvent{nullptr, nullptr, 0};

   template <class S, int32_t I = 0, typename... Args>
@@ -77,7 +77,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP

   GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionKernels(cfg) {}

-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
   inline void runKernelImplWrapper(gpu_reconstruction_kernels::classArgument<GPUCA_M_KRNL_TEMPLATE(x_class)>, bool cpuFallback, double& timer, krnlSetup&& setup GPUCA_M_STRIP(x_arguments)) \
   { \
     if (cpuFallback) { \
@@ -161,7 +161,7 @@ inline void GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args)
     throw std::runtime_error("GPUCA_MAX_THREADS exceeded");
   }
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Running kernel %s (Stream %d, Range %d/%d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.start, setup.y.num, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
+    GPUInfo("Running kernel %s (Stream %d, Index %d, Grid %d/%d) on %s", GetKernelName<S, I>(), stream, setup.y.index, nBlocks, nThreads, cpuFallback == 2 ? "CPU (forced)" : cpuFallback ? "CPU (fallback)" : mDeviceName.c_str());
   }
   if (nThreads == 0 || nBlocks == 0) {
     return;

GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h

Lines changed: 15 additions & 67 deletions
@@ -51,97 +51,45 @@
 #define GPUCA_ATTRRES3(XX) // 3 attributes not supported
 #define GPUCA_ATTRRES2(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES2_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
 #define GPUCA_ATTRRES(XX, ...) GPUCA_M_EXPAND(GPUCA_M_CAT(GPUCA_ATTRRES_, GPUCA_M_FIRST(__VA_ARGS__)))(XX, __VA_ARGS__)
-// GPU Kernel entry point for single sector
-#define GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t iSector_internal GPUCA_M_STRIP(x_arguments))
-#ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_SINGLE(...) GPUCA_KRNLGPU_SINGLE_DEF(__VA_ARGS__);
-#else
-#define GPUCA_KRNLGPU_SINGLE(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_SINGLE_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
-  { \
-    GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[iSector_internal] GPUCA_M_STRIP(x_forward)); \
-  }
-#endif

-// GPU Kernel entry point for multiple sector
-#define GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, ...) \
-  GPUg() void GPUCA_ATTRRES(,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes))) GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)(GPUCA_CONSMEM_PTR int32_t firstSector, int32_t nSectorCount GPUCA_M_STRIP(x_arguments))
+// GPU Kernel entry point
+#define GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, ...) \
+  GPUg() void GPUCA_ATTRRES(,GPUCA_M_STRIP(x_attributes)) GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))(GPUCA_CONSMEM_PTR int32_t _iSector_internal GPUCA_M_STRIP(x_arguments))
+
 #ifdef GPUCA_KRNL_DEFONLY
-#define GPUCA_KRNLGPU_MULTI(...) GPUCA_KRNLGPU_MULTI_DEF(__VA_ARGS__);
+#define GPUCA_KRNLGPU(...) GPUCA_KRNLGPU_DEF(__VA_ARGS__);
 #else
-#define GPUCA_KRNLGPU_MULTI(x_class, x_attributes, x_arguments, x_forward, ...) GPUCA_KRNLGPU_MULTI_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
+#define GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, ...) \
+  GPUCA_KRNLGPU_DEF(x_class, x_attributes, x_arguments, x_forward, __VA_ARGS__) \
   { \
-    const int32_t iSector_internal = nSectorCount * (get_group_id(0) + (get_num_groups(0) % nSectorCount != 0 && nSectorCount * (get_group_id(0) + 1) % get_num_groups(0) != 0)) / get_num_groups(0); \
-    const int32_t nSectorBlockOffset = get_num_groups(0) * iSector_internal / nSectorCount; \
-    const int32_t sectorBlockId = get_group_id(0) - nSectorBlockOffset; \
-    const int32_t sectorGridDim = get_num_groups(0) * (iSector_internal + 1) / nSectorCount - get_num_groups(0) * (iSector_internal) / nSectorCount; \
     GPUshared() typename GPUCA_M_STRIP_FIRST(x_class)::GPUSharedMemory smem; \
-    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(sectorGridDim, get_local_size(0), sectorBlockId, get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[firstSector + iSector_internal] GPUCA_M_STRIP(x_forward)); \
+    GPUCA_M_STRIP_FIRST(x_class)::template Thread<GPUCA_M_KRNL_NUM(x_class)>(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, GPUCA_M_STRIP_FIRST(x_class)::Processor(GPUCA_CONSMEM)[_iSector_internal] GPUCA_M_STRIP(x_forward)); \
   }
 #endif

-// GPU Host wrapper pre- and post-parts
-#define GPUCA_KRNL_PRE(x_class, ...) \
+// GPU Host wrappers for kernel
+#define GPUCA_KRNL_HOST(x_class, ...) \
+  GPUCA_KRNLGPU(x_class, __VA_ARGS__) \
   template <> class GPUCA_KRNL_BACKEND_CLASS::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
   public: \
    template <typename T, typename... Args> \
    static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \
    { \
      auto& x = _xyz.x; \
-     auto& y = _xyz.y;
-
-#define GPUCA_KRNL_POST() \
+     auto& y = _xyz.y; \
+     GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \
    } \
  };

-// GPU Host wrappers for single kernel, multi-sector, or auto-detection
-#define GPUCA_KRNL_single(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num > 1) { \
-    throw std::runtime_error("Kernel called with invalid number of sectors"); \
-  } else { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_multi(...) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_simple(...) GPUCA_KRNL_single(__VA_ARGS__)
-#define GPUCA_KRNL_both(...) \
-  GPUCA_KRNLGPU_SINGLE(__VA_ARGS__) \
-  GPUCA_KRNLGPU_MULTI(__VA_ARGS__) \
-  GPUCA_KRNL_PRE(__VA_ARGS__) \
-  if (y.num <= 1) { \
-    GPUCA_KRNL_CALL_single(__VA_ARGS__) \
-  } else { \
-    GPUCA_KRNL_CALL_multi(__VA_ARGS__) \
-  } \
-  GPUCA_KRNL_POST()
-
-#define GPUCA_KRNL_LOAD_(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_simple(...) GPUCA_KRNL_LOAD_single(__VA_ARGS__)
-#define GPUCA_KRNL_LOAD_both(...) \
-  GPUCA_KRNL_LOAD_single(__VA_ARGS__) \
-  GPUCA_KRNL_LOAD_multi(__VA_ARGS__)
-
 #define GPUCA_KRNL_PROP(x_class, x_attributes) \
   template <> gpu_reconstruction_kernels::krnlProperties GPUCA_KRNL_BACKEND_CLASS::getKernelPropertiesBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>() { \
-    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_SHIFT(GPUCA_M_STRIP(x_attributes)))}; \
+    gpu_reconstruction_kernels::krnlProperties ret = gpu_reconstruction_kernels::krnlProperties{GPUCA_ATTRRES(_INTERNAL_PROP,GPUCA_M_STRIP(x_attributes))}; \
     return ret.nThreads > 0 ? ret : gpu_reconstruction_kernels::krnlProperties{(int32_t)mThreadCount}; \
   }

-// Generate GPU kernel and host wrapper
-#define GPUCA_KRNL_WRAP(x_func, x_class, x_attributes, ...) GPUCA_M_CAT(x_func, GPUCA_M_STRIP_FIRST(x_attributes))(x_class, x_attributes, __VA_ARGS__)
 #endif // GPUCA_GPUCODE

-#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (GPUCA_M_STRIP(x_attributes), REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)
+#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class)))), __VA_ARGS__)

 #endif // O2_GPU_GPURECONSTRUCTIONKERNELMACROS_H
 // clang-format on
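
Most of the deleted complexity lived in GPUCA_KRNLGPU_MULTI: every block of a combined launch had to compute which sector it serves, its block id within that sector, and that sector's grid size. A standalone sketch of the removed partitioning arithmetic, with get_group_id(0)/get_num_groups(0) replaced by plain loop variables so it runs on the host:

#include <cstdint>
#include <cstdio>

// Replays the arithmetic deleted from GPUCA_KRNLGPU_MULTI: mapping a flat
// block id onto (sector, block-within-sector) so that one launch covered
// several sectors. The unified kernel no longer needs any of this, because
// each launch now receives its processor index directly.
int main()
{
  const int32_t nGroups = 10;     // get_num_groups(0): blocks in the launch
  const int32_t nSectorCount = 3; // sectors sharing this launch
  for (int32_t groupId = 0; groupId < nGroups; groupId++) { // get_group_id(0)
    const int32_t iSector = nSectorCount * (groupId + (nGroups % nSectorCount != 0 && nSectorCount * (groupId + 1) % nGroups != 0)) / nGroups;
    const int32_t nSectorBlockOffset = nGroups * iSector / nSectorCount;
    const int32_t sectorBlockId = groupId - nSectorBlockOffset;
    const int32_t sectorGridDim = nGroups * (iSector + 1) / nSectorCount - nGroups * iSector / nSectorCount;
    printf("block %2d -> sector %d, local block %d of %d\n", groupId, iSector, sectorBlockId, sectorGridDim);
  }
}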

GPU/GPUTracking/Base/GPUReconstructionKernels.h

Lines changed: 4 additions & 7 deletions
@@ -41,11 +41,8 @@ struct krnlExec {
 };
 struct krnlRunRange {
   constexpr krnlRunRange() = default;
-  constexpr krnlRunRange(uint32_t a) : start(a), num(0) {}
-  constexpr krnlRunRange(uint32_t s, int32_t n) : start(s), num(n) {}
-
-  uint32_t start = 0;
-  int32_t num = 0;
+  constexpr krnlRunRange(uint32_t v) : index(v) {}
+  uint32_t index = 0;
 };
 struct krnlEvent {
   constexpr krnlEvent(deviceEvent* e = nullptr, deviceEvent* el = nullptr, int32_t n = 1) : ev(e), evList(el), nEvents(n) {}
@@ -63,7 +60,7 @@ struct krnlProperties {
 };

 struct krnlSetup {
-  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0, -1}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
+  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {nullptr, nullptr, 0}) : x(xx), y(yy), z(zz) {}
   krnlExec x;
   krnlRunRange y;
   krnlEvent z;
@@ -98,7 +95,7 @@ class GPUReconstructionKernels : public T
   template <class S, int32_t I = 0, typename... Args>
   using krnlSetupArgs = gpu_reconstruction_kernels::krnlSetupArgs<S, I, Args...>;

-#define GPUCA_KRNL(x_class, attributes, x_arguments, x_forward, x_types) \
+#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types) \
   virtual void runKernelImpl(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args) \
   { \
     T::template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args); \
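
Call sites keep compiling because the defaulted run range of krnlSetup merely shrinks from {0, -1} to {0}. A small usage sketch under the diff above (krnlExec and krnlEvent are reduced to empty stand-ins here; only krnlRunRange and krnlSetup mirror the header):

#include <cstdint>

struct krnlExec {};  // stand-in: the real struct carries grid and stream settings
struct krnlEvent {}; // stand-in

struct krnlRunRange {
  constexpr krnlRunRange() = default;
  constexpr krnlRunRange(uint32_t v) : index(v) {}
  uint32_t index = 0;
};

struct krnlSetup {
  krnlSetup(const krnlExec& xx, const krnlRunRange& yy = {0}, const krnlEvent& zz = {}) : x(xx), y(yy), z(zz) {}
  krnlExec x;
  krnlRunRange y;
  krnlEvent z;
};

int main()
{
  krnlSetup defaulted{{}};      // y.index == 0, replacing the old {0, -1} default
  krnlSetup onSector5{{}, {5}}; // address processor instance 5 directly
  return defaulted.y.index == 0 && onSector5.y.index == 5 ? 0 : 1;
}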

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu

Lines changed: 4 additions & 18 deletions
@@ -380,7 +380,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime()
     GPUFailedMsg(cuModuleLoadData(mInternals->kernelModules.back().get(), GPUCA_M_CAT3(_binary_cuda_kernel_module_fatbin_krnl_, GPUCA_M_KRNL_NAME(x_class), GPUCA_M_CAT(PER_KERNEL_OBJECT_EXT, _start))));
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-    loadKernelModules(true, false);
+    loadKernelModules(true);
   }
 #endif
   void* devPtrConstantMem = nullptr;
@@ -630,34 +630,20 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
   }
 }

-void GPUReconstructionCUDA::loadKernelModules(bool perKernel, bool perSingleMulti)
+void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
 {
   uint32_t j = 0;
-#define GPUCA_KRNL(...) \
-  GPUCA_KRNL_WRAP(GPUCA_KRNL_LOAD_, __VA_ARGS__) \
-  j += !perSingleMulti;
-#define GPUCA_KRNL_LOAD_single(x_class, ...) \
+#define GPUCA_KRNL(x_class, ...) \
   getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
   mInternals->kernelFunctions.emplace_back(new CUfunction); \
   mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class)))); \
   if (mProcessingSettings.debugLevel >= 3) { \
     GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \
   } \
   GPUFailedMsg(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \
-  j += perSingleMulti;
-#define GPUCA_KRNL_LOAD_multi(x_class, ...) \
-  getRTCkernelNum<true, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
-  mInternals->kernelFunctions.emplace_back(new CUfunction); \
-  mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi))); \
-  if (mProcessingSettings.debugLevel >= 3) { \
-    GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)), j); \
-  } \
-  GPUFailedMsg(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT3(krnl_, GPUCA_M_KRNL_NAME(x_class), _multi)))); \
-  j += perSingleMulti;
+  j++;
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
-#undef GPUCA_KRNL_LOAD_single
-#undef GPUCA_KRNL_LOAD_multi

   if (j != mInternals->kernelModules.size()) {
     GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
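
With the _multi bookkeeping removed, module loading is a straight one-function-per-kernel loop, and j must come out equal to the number of per-kernel modules. A toy model of that invariant (std::string stands in for the CUmodule/CUfunction handles, and the kernel names are made up):

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

int main()
{
  // Stand-in for the list expanded from GPUReconstructionKernelList.h.
  const std::vector<std::string> kernelNames = {"krnl_A", "krnl_B", "krnl_C"};
  const size_t nModules = kernelNames.size(); // one module per kernel in per-kernel mode
  std::vector<std::string> kernelFunctions;   // real code: handles from cuModuleGetFunction
  uint32_t j = 0;
  for (const auto& name : kernelNames) {
    kernelFunctions.emplace_back(name); // look up one entry point in modules[j]
    j++;
  }
  if (j != nModules) {
    fprintf(stderr, "Did not load all kernels (%u < %zu)\n", j, nModules);
    return 1;
  }
  printf("Loaded %u kernels\n", j);
  return 0;
}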

GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels<GPUReconstructionC
  private:
   int32_t genRTC(std::string& filename, uint32_t& nCompile);
   void genAndLoadRTC();
-  void loadKernelModules(bool perKernel, bool perSingleMulti = true);
+  void loadKernelModules(bool perKernel);
   const char *mRtcSrcExtension = ".src", *mRtcBinExtension = ".o";
 };
