18 changes: 3 additions & 15 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -54,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
}

template <class T, int32_t I, typename... Args>
inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
inline void GPUReconstructionCPU::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
{
auto& x = _xyz.x;
auto& y = _xyz.y;
@@ -88,7 +88,7 @@ inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime&
}

template <>
inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
inline void GPUReconstructionCPU::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
if (nThreads > 1) {
@@ -108,17 +108,6 @@ inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(con
}
}

template <class T, int32_t I, typename... Args>
void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
{
#pragma GCC diagnostic push
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not always captured below
#endif
std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
#pragma GCC diagnostic push
}

template <class S, int32_t I>
GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
{
@@ -137,8 +126,7 @@ GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelPrope
return ret;
}

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template void GPUReconstructionCPU::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
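For reference, a hedged sketch of the thread-count rule in the GPUMemClean16 CPU specialization above: roughly one host thread per 16 MiB of buffer, capped by the available kernel host threads, with a plain memset fallback for small buffers. The std::thread pool and chunking here are illustrative assumptions; the real backend drives this through its own host-thread infrastructure, and maxHostThreads stands in for getNKernelHostThreads(true).

```cpp
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <thread>
#include <vector>

// Illustrative only: clear a buffer with one host thread per ~16 MiB.
void clearBufferParallel(void* ptr, uint64_t size, int32_t maxHostThreads)
{
  const int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), maxHostThreads));
  if (nThreads > 1) {
    const uint64_t chunk = (size + nThreads - 1) / nThreads; // bytes per thread, rounded up
    std::vector<std::thread> pool;
    for (int32_t t = 0; t < nThreads; t++) {
      const uint64_t begin = std::min<uint64_t>(uint64_t(t) * chunk, size);
      const uint64_t len = std::min<uint64_t>(chunk, size - begin);
      if (len > 0) {
        pool.emplace_back([=] { std::memset(static_cast<char*>(ptr) + begin, 0, len); });
      }
    }
    for (auto& th : pool) {
      th.join();
    }
  } else {
    std::memset(ptr, 0, size);
  }
}
```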
5 changes: 1 addition & 4 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
@@ -40,7 +40,7 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface
template <class S, int32_t I = 0>
krnlProperties getKernelProperties(int gpu = -1);
template <class T, int32_t I = 0, typename... Args>
void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);

virtual int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false);
int32_t GPUStuck() { return mGPUStuck; }
@@ -59,9 +59,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface

GPUReconstructionCPU(const GPUSettingsDeviceBackend& cfg) : GPUReconstructionProcessing::KernelInterface<GPUReconstructionCPU, GPUReconstructionProcessing>(cfg) {}

template <class T, int32_t I = 0, typename... Args>
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);

int32_t registerMemoryForGPU_internal(const void* ptr, size_t size) override { return 0; }
int32_t unregisterMemoryForGPU_internal(const void* ptr) override { return 0; }

14 changes: 0 additions & 14 deletions GPU/GPUTracking/Base/GPUReconstructionKernelMacros.h
@@ -62,20 +62,6 @@
}
#endif

// GPU Host wrappers for kernel
#define GPUCA_KRNL_HOST(x_class, ...) \
GPUCA_KRNLGPU(x_class, __VA_ARGS__) \
template <> class GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::backendInternal<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
public: \
template <typename T, typename... Args> \
static inline void runKernelBackendMacro(const krnlSetupTime& _xyz, T* me, const Args&... args) \
{ \
auto& x = _xyz.x; \
auto& y = _xyz.y; \
GPUCA_KRNL_CALL(x_class, __VA_ARGS__) \
} \
};

#endif // GPUCA_GPUCODE

#define GPUCA_KRNL_LB(x_class, x_attributes, ...) GPUCA_KRNL(x_class, (REG, (GPUCA_M_CAT(GPUCA_LB_, GPUCA_M_KRNL_NAME(x_class))), GPUCA_M_STRIP(x_attributes)), __VA_ARGS__)
@@ -21,21 +21,28 @@
namespace o2::gpu
{

#pragma GCC diagnostic push
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wunused-lambda-capture" // this is not always captured below
#endif

template <class T, class S>
void GPUReconstructionProcessing::KernelInterface<T, S>::runKernelVirtual(const int num, const void* args)
{
switch (num) { // clang-format off
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, x_num) \
case x_num: { \
const auto& args2 = *(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>*)args; \
((T*)this)->template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(args2); \
std::apply([this, &args2](auto&... vals) { ((T*)this)->template runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>(args2.s, vals...); }, args2.v); \
break; \
}
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
} // clang-format on
}

#pragma GCC diagnostic pop

} // namespace o2::gpu

#endif // GPURECONSTRUCTIONPROCESSINGKERNELS_H
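For orientation, a minimal standalone sketch (simplified types, hypothetical names) of the dispatch pattern that now lives only in runKernelVirtual: the kernel arguments captured in a tuple are unpacked with std::apply and forwarded to the variadic runKernelBackend entry point, instead of each backend unpacking them itself.

```cpp
#include <cstdint>
#include <tuple>
#include <utility>

// Stand-ins for the real krnlSetupTime / krnlSetupArgs types.
struct KernelSetup { int nBlocks = 1, nThreads = 1, stream = 0; };

template <typename... Args>
struct KernelArgs {
  KernelSetup s;          // launch configuration
  std::tuple<Args...> v;  // captured kernel arguments
};

struct Backend {
  // New-style backend entry point: setup plus the unpacked argument list.
  template <typename... Args>
  void runKernelBackend(const KernelSetup& /*setup*/, const Args&... /*args*/)
  {
    // ... launch the kernel ...
  }

  // What each generated case in runKernelVirtual's switch does per kernel.
  template <typename... Args>
  void dispatch(const KernelArgs<Args...>& args)
  {
    std::apply([this, &args](const auto&... vals) { runKernelBackend(args.s, vals...); }, args.v);
  }
};

// Usage: Backend{}.dispatch(KernelArgs<void*, uint64_t>{{}, {nullptr, 0}});
```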
3 changes: 2 additions & 1 deletion GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -24,7 +24,8 @@
#include "GPUParamRTC.h"
#include "GPUReconstructionCUDAHelpers.inc"
#include "GPUDefParametersLoad.inc"
#include "GPUReconstructionProcessingKernels.inc"
#include "GPUReconstructionKernelIncludes.h"
#include "GPUConstantMem.h"

#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1
#include "utils/qGetLdBinarySymbols.h"
9 changes: 5 additions & 4 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -45,17 +45,18 @@ class GPUReconstructionCUDA : public GPUReconstructionProcessing::KernelInterfac
virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override;

template <class T, int32_t I = 0, typename... Args>
void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I = 0, typename... Args>
void runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I>
struct kernelBackendMacro;

template <class T, class S>
friend GPUh() void GPUCommonAlgorithm::sortOnDevice(auto* rec, int32_t stream, T* begin, size_t N, const S& comp);

protected:
GPUReconstructionCUDAInternals* mInternals;

template <class T, int32_t I = 0, typename... Args>
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);

int32_t InitDevice_Runtime() override;
int32_t ExitDevice_Runtime() override;

71 changes: 40 additions & 31 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -23,17 +23,19 @@ using namespace o2::gpu;
#include "GPUReconstructionIncludesDeviceAll.h"

#include "GPUReconstructionCUDAKernelsSpecialize.inc"
#include "GPUReconstructionProcessingKernels.inc"
template void GPUReconstructionProcessing::KernelInterface<GPUReconstructionCUDA, GPUReconstructionDeviceBase>::runKernelVirtual(const int num, const void* args);

#if defined(__HIPCC__) && defined(GPUCA_HAS_GLOBAL_SYMBOL_CONSTANT_MEM)
__global__ void gGPUConstantMemBuffer_dummy(int32_t* p) { *p = *(int32_t*)&gGPUConstantMemBuffer; }
#endif

template <class T, int32_t I, typename... Args>
inline void GPUReconstructionCUDA::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
inline void GPUReconstructionCUDA::runKernelBackendTimed(const krnlSetupTime& _xyz, const Args&... args)
{
#if !defined(GPUCA_KERNEL_COMPILE_MODE) || GPUCA_KERNEL_COMPILE_MODE != 1
if (!GetProcessingSettings().rtc.enable) {
backendInternal<T, I>::runKernelBackendMacro(_xyz, this, args...);
kernelBackendMacro<T, I>::run(_xyz, this, args...);
} else
#endif
{
@@ -52,18 +54,18 @@
}

template <class T, int32_t I, typename... Args>
void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
inline void GPUReconstructionCUDA::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
{
auto& x = args.s.x;
auto& z = args.s.z;
auto& x = _xyz.x;
auto& z = _xyz.z;
if (z.evList) {
for (int32_t k = 0; k < z.nEvents; k++) {
GPUChkErr(cudaStreamWaitEvent(mInternals->Streams[x.stream], ((cudaEvent_t*)z.evList)[k], 0));
}
}
{
GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, args.s, this);
std::apply([this, &args](auto&... vals) { this->runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
GPUDebugTiming timer(GetProcessingSettings().deviceTimers && GetProcessingSettings().debugLevel > 0, (deviceEvent*)mDebugEvents, mInternals->Streams, _xyz, this);
runKernelBackendTimed<T, I, Args...>(_xyz, args...);
}
GPUChkErr(cudaGetLastError());
if (z.ev) {
@@ -74,31 +76,38 @@ void GPUReconstructionCUDA::runKernelBackend(const krnlSetupArgs<T, I, Args...>&
#undef GPUCA_KRNL_REG
#define GPUCA_KRNL_REG(args) __launch_bounds__(GPUCA_M_MAX2_3(GPUCA_M_STRIP(args)))

#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 1 // ---------- COMPILE_MODE = perkernel ----------
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionCUDA::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);
#else // ---------- COMPILE_MODE = onefile | rdc ----------
#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2
#define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc
#endif

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
GPUCA_KRNL_HOST(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \
template void GPUReconstructionCUDA::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);

#ifndef __HIPCC__ // CUDA version
#define GPUCA_KRNL_CALL(x_class, ...) \
GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<<x.nBlocks, x.nThreads, 0, me->mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...);
#else // HIP version
#undef GPUCA_KRNL_CUSTOM
#define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args)
#define GPUCA_KRNL_CALL(x_class, ...) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...);
#endif // __HIPCC__

// clang-format off
#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE != 1 // ---------- COMPILE_MODE = perkernel ----------
#if defined(GPUCA_KERNEL_COMPILE_MODE) && GPUCA_KERNEL_COMPILE_MODE == 2
#define GPUCA_KRNL_DEFONLY // COMPILE_MODE = rdc
#endif

#ifndef __HIPCC__ // CUDA version
#define GPUCA_KRNL_CALL(x_class, ...) \
GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))<<<x.nBlocks, x.nThreads, 0, me->mInternals->Streams[x.stream]>>>(GPUCA_CONSMEM_CALL y.index, args...);
#else // HIP version
#undef GPUCA_KRNL_CUSTOM
#define GPUCA_KRNL_CUSTOM(args) GPUCA_M_STRIP(args)
#define GPUCA_KRNL_CALL(x_class, ...) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), dim3(x.nBlocks), dim3(x.nThreads), 0, me->mInternals->Streams[x.stream], GPUCA_CONSMEM_CALL y.index, args...);
#endif // __HIPCC__

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
GPUCA_KRNLGPU(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \
template <> struct GPUReconstructionCUDA::kernelBackendMacro<GPUCA_M_KRNL_TEMPLATE(x_class)> { \
template <typename... Args> \
static inline void run(const GPUReconstructionProcessing::krnlSetupTime& _xyz, auto* me, const Args&... args) \
{ \
auto& x = _xyz.x; \
auto& y = _xyz.y; \
GPUCA_KRNL_CALL(x_class, x_attributes, x_arguments, x_forward, x_types, __VA_ARGS__) \
} \
};

#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
#endif // ---------- COMPILE_MODE = onefile | rdc ----------

#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
// clang-format on

#ifndef GPUCA_NO_CONSTANT_MEMORY
static GPUReconstructionDeviceBase::deviceConstantMemRegistration registerConstSymbol([]() {
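A self-contained sketch (hypothetical kernel name and simplified setup struct, not the real O2 macros) of what the GPUCA_KRNL expansion above boils down to: each kernel gets a kernelBackendMacro<T, I> specialization whose static run() holds the launch syntax, so the generic timed entry point can dispatch without naming any kernel symbol explicitly.

```cuda
#include <cstdint>
#include <cuda_runtime.h>

struct LaunchSetup { uint32_t nBlocks, nThreads; cudaStream_t stream; };

struct DummyKernel {};                          // placeholder kernel tag type

__global__ void krnl_DummyKernel(int32_t* out)  // placeholder device kernel
{
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    *out = 42;
  }
}

template <class T, int32_t I>
struct kernelBackendMacro;                      // primary template left undefined

template <>
struct kernelBackendMacro<DummyKernel, 0> {     // one specialization per kernel
  template <typename... Args>
  static void run(const LaunchSetup& x, const Args&... args)
  {
    krnl_DummyKernel<<<x.nBlocks, x.nThreads, 0, x.stream>>>(args...);
  }
};

// Generic dispatch: knows only the tag type, not the kernel symbol.
template <class T, int32_t I, typename... Args>
void runKernelBackendTimedSketch(const LaunchSetup& x, const Args&... args)
{
  kernelBackendMacro<T, I>::run(x, args...);
}
```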
@@ -97,7 +97,7 @@ struct GPUTPCGMO2OutputSort_comp {
} // namespace o2::gpu::internal

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeBorders, 3>(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax)
{
if (cmpMax) {
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, range, N, MergeBorderTracks_compMax());
@@ -107,32 +107,32 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInter
}

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracks, 0>(const krnlSetupTime& _xyz)
{
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
}

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerSortTracksQPt, 0>(const krnlSetupTime& _xyz)
{
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks()));
}

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMMergerMergeLoopers, 1>(const krnlSetupTime& _xyz)
{
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.LooperCandidates(), processors()->tpcMerger.Memory()->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp());
}

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUTPCGMO2Output, GPUTPCGMO2Output::sort>(const krnlSetupTime& _xyz)
{
GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSortO2(), processors()->tpcMerger.NOutputTracksTPCO2(), GPUTPCGMO2OutputSort_comp());
}
#endif // GPUCA_SPECIALIZE_THRUST_SORTS

template <>
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
GPUChkErr(cudaMemsetAsync(ptr, 0, size, mInternals->Streams[_xyz.x.stream]));
}
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -13,8 +13,8 @@
/// \author David Rohr

#include "GPUReconstructionOCLIncludesHost.h"
#include "GPUReconstructionProcessingKernels.inc"
#include "GPUDefParametersLoad.inc"
#include "GPUConstantMem.h"

#include <map>

4 changes: 1 addition & 3 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -34,7 +34,7 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface
~GPUReconstructionOCL() override;

template <class T, int32_t I = 0, typename... Args>
void runKernelBackend(const krnlSetupArgs<T, I, Args...>& args);
void runKernelBackend(const krnlSetupTime& _xyz, const Args&... args);

protected:
int32_t InitDevice_Runtime() override;
@@ -57,8 +57,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface

template <class T, int32_t I = 0>
int32_t AddKernel();
template <class T, int32_t I = 0, typename... Args>
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);

GPUReconstructionOCLInternals* mInternals;
float mOclVersion;
14 changes: 3 additions & 11 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -16,9 +16,11 @@
#include "GPUReconstructionKernelIncludes.h"

#include "GPUReconstructionOCLKernelsSpecialize.inc"
#include "GPUReconstructionProcessingKernels.inc"
template void GPUReconstructionProcessing::KernelInterface<GPUReconstructionOCL, GPUReconstructionDeviceBase>::runKernelVirtual(const int num, const void* args);

template <class T, int32_t I, typename... Args>
inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
inline void GPUReconstructionOCL::runKernelBackend(const krnlSetupTime& _xyz, const Args&... args)
{
cl_kernel k = getKernelObject<cl_kernel, T, I>();
auto& x = _xyz.x;
@@ -48,12 +50,6 @@ inline void GPUReconstructionOCL::runKernelBackendInternal(const krnlSetupTime&
}
}

template <class T, int32_t I, typename... Args>
void GPUReconstructionOCL::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
{
std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
}

template <class T, int32_t I>
int32_t GPUReconstructionOCL::AddKernel()
{
@@ -86,7 +82,3 @@ int32_t GPUReconstructionOCL::AddKernels()
#undef GPUCA_KRNL
return 0;
}

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) template void GPUReconstructionOCL::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
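As a rough illustration of what the variadic runKernelBackend has to do on the OpenCL side (the real implementation also handles device pointers, constant-memory indices, and the launch configuration), a hedged sketch of forwarding an argument pack to a cl_kernel via clSetKernelArg; setKernelArguments is a hypothetical helper and all arguments are assumed trivially copyable by value.

```cpp
#include <CL/cl.h>

template <typename... Args>
cl_int setKernelArguments(cl_kernel kernel, const Args&... args)
{
  cl_uint index = 0;
  cl_int err = CL_SUCCESS;
  // Fold over the pack: one clSetKernelArg per argument, in declaration order;
  // stop touching further arguments once an error has been recorded.
  ((err = (err == CL_SUCCESS ? clSetKernelArg(kernel, index++, sizeof(Args), &args) : err)), ...);
  return err;
}
```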
@@ -13,7 +13,7 @@
/// \author David Rohr

template <>
inline void GPUReconstructionOCL::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
inline void GPUReconstructionOCL::runKernelBackend<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
cl_int4 val0 = {0, 0, 0, 0};
GPUChkErr(clEnqueueFillBuffer(mInternals->command_queue[_xyz.x.stream], mInternals->mem_gpu, &val0, sizeof(val0), (char*)ptr - (char*)mDeviceMemoryBase, (size + sizeof(val0) - 1) & ~(sizeof(val0) - 1), _xyz.z.evList == nullptr ? 0 : _xyz.z.nEvents, _xyz.z.evList->getEventList<cl_event>(), _xyz.z.ev->getEventList<cl_event>()));