1 change: 1 addition & 0 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -273,6 +273,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
if (mProcessingSettings.createO2Output > 1) {
mProcessingSettings.createO2Output = 1;
}
mProcessingSettings.rtc.deterministic = 1;
}
if (mProcessingSettings.deterministicGPUReconstruction && mProcessingSettings.debugLevel >= 6) {
mProcessingSettings.nTPCClustererLanes = 1;
76 changes: 53 additions & 23 deletions GPU/GPUTracking/Base/GPUReconstructionProcessing.cxx
@@ -57,17 +57,24 @@ void GPUReconstructionProcessing::runParallelOuterLoop(bool doGPU, uint32_t nThr
}
}

namespace o2::gpu
{
namespace // anonymous
uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
{
static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
} // anonymous namespace
} // namespace o2::gpu
if (condition && mProcessingSettings.inKernelParallel != 1) {
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
} else {
mNActiveThreadsOuterLoop = 1;
}
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
}
return mNActiveThreadsOuterLoop;
}

std::atomic_flag GPUReconstructionProcessing::mTimerFlag = ATOMIC_FLAG_INIT;

GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
{
while (timerFlag.test_and_set()) {
while (mTimerFlag.test_and_set()) {
}
if (mTimers.size() <= id) {
mTimers.resize(id + 1);
@@ -81,20 +88,20 @@ GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::insertTimer
mTimers[id]->count++;
}
timerMeta* retVal = mTimers[id].get();
timerFlag.clear();
mTimerFlag.clear();
return retVal;
}

GPUReconstructionProcessing::timerMeta* GPUReconstructionProcessing::getTimerById(uint32_t id, bool increment)
{
timerMeta* retVal = nullptr;
while (timerFlag.test_and_set()) {
while (mTimerFlag.test_and_set()) {
}
if (mTimers.size() > id && mTimers[id]) {
retVal = mTimers[id].get();
retVal->count += increment;
}
timerFlag.clear();
mTimerFlag.clear();
return retVal;
}

@@ -104,23 +111,46 @@ uint32_t GPUReconstructionProcessing::getNextTimerId()
return id.fetch_add(1);
}

uint32_t GPUReconstructionProcessing::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
{
if (condition && mProcessingSettings.inKernelParallel != 1) {
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
} else {
mNActiveThreadsOuterLoop = 1;
}
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
}
return mNActiveThreadsOuterLoop;
}

std::unique_ptr<gpu_reconstruction_kernels::threadContext> GPUReconstructionProcessing::GetThreadContext()
{
return std::make_unique<gpu_reconstruction_kernels::threadContext>();
}

gpu_reconstruction_kernels::threadContext::threadContext() = default;
gpu_reconstruction_kernels::threadContext::~threadContext() = default;

template <class T, int32_t I>
uint32_t GPUReconstructionProcessing::GetKernelNum(int32_t k)
{
static int32_t num = k;
if (num < 0) {
throw std::runtime_error("Internal Error - Kernel Number not Set");
}
return num;
}

namespace o2::gpu::internal
{
static std::vector<std::string> initKernelNames()
{
std::vector<std::string> retVal;
#define GPUCA_KRNL(x_class, ...) \
GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(retVal.size()); \
retVal.emplace_back(GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)));
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
return retVal;
}
} // namespace o2::gpu::internal

const std::vector<std::string> GPUReconstructionProcessing::mKernelNames = o2::gpu::internal::initKernelNames();

#define GPUCA_KRNL(x_class, ...) \
template uint32_t GPUReconstructionProcessing::GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t); \
template <> \
const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
{ \
return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \
}
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
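
A note on the pattern introduced above: GetKernelNum<T, I>(k) latches the first non-negative k it receives into a function-local static and only reads it back on later calls, while initKernelNames() registers every kernel from GPUReconstructionKernelList.h with the index equal to the current size of the name vector, so kernel numbers and kernel names are assigned in the same order. Below is a minimal standalone sketch of that registration idea; KernelA, KernelB and registerKernels are made-up stand-ins, not names from this codebase.

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Stand-ins for two kernel classes from the kernel list.
struct KernelA {
};
struct KernelB {
};

// First call with k >= 0 latches the kernel number into a function-local static;
// later calls (default k = -1) only read it back.
template <class T, int I = 0>
unsigned int GetKernelNum(int k = -1)
{
  static int num = k;
  if (num < 0) {
    throw std::runtime_error("kernel number not set");
  }
  return static_cast<unsigned int>(num);
}

// Mirrors initKernelNames(): each kernel receives the index equal to the size of
// the name vector at registration time, so numbers and names stay in sync.
static std::vector<std::string> registerKernels()
{
  std::vector<std::string> names;
  GetKernelNum<KernelA>(static_cast<int>(names.size()));
  names.emplace_back("KernelA");
  GetKernelNum<KernelB>(static_cast<int>(names.size()));
  names.emplace_back("KernelB");
  return names;
}

int main()
{
  const std::vector<std::string> names = registerKernels();
  // Any later lookup goes through the latched number, e.g. to index the name table.
  return names[GetKernelNum<KernelB>()] == "KernelB" ? 0 : 1;
}
```
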
19 changes: 9 additions & 10 deletions GPU/GPUTracking/Base/GPUReconstructionProcessing.h
@@ -20,6 +20,7 @@

#include "utils/timer.h"
#include <functional>
#include <atomic>

namespace o2::gpu
{
@@ -74,7 +75,10 @@ class GPUReconstructionProcessing : public GPUReconstruction

// Interface to query name of a kernel
template <class T, int32_t I>
constexpr static const char* GetKernelName();
static const char* GetKernelName();
const std::string& GetKernelName(int32_t i) const { return mKernelNames[i]; }
template <class T, int32_t I = 0>
static uint32_t GetKernelNum(int32_t k = -1);

// Public queries for timers
auto& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)]; }
@@ -100,6 +104,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
GPUReconstructionProcessing(const GPUSettingsDeviceBackend& cfg) : GPUReconstruction(cfg) {}
using deviceEvent = gpu_reconstruction_kernels::deviceEvent;

static const std::vector<std::string> mKernelNames;

int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
uint32_t mNActiveThreadsOuterLoop = 1; // Number of threads currently running an outer loop

@@ -130,6 +136,8 @@ class GPUReconstructionProcessing : public GPUReconstruction
uint32_t getNextTimerId();
timerMeta* getTimerById(uint32_t id, bool increment = true);
timerMeta* insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step);

static std::atomic_flag mTimerFlag;
};

template <class T>
@@ -174,15 +182,6 @@ HighResTimer& GPUReconstructionProcessing::getTimer(const char* name, int32_t nu
return timer->timer[num];
}

#define GPUCA_KRNL(x_class, ...) \
template <> \
constexpr const char* GPUReconstructionProcessing::GetKernelName<GPUCA_M_KRNL_TEMPLATE(x_class)>() \
{ \
return GPUCA_M_STR(GPUCA_M_KRNL_NAME(x_class)); \
}
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

} // namespace o2::gpu

#endif
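
The timer bookkeeping above is serialized with mTimerFlag acting as a spin lock: a busy loop on test_and_set() acquires it and clear() releases it, which is what allowed the former file-local timerFlag to move into the class. A small self-contained sketch of that locking pattern; lockFlag, counter and addOne are hypothetical names used only for illustration.

```cpp
#include <atomic>
#include <thread>
#include <vector>

// Spin lock in the style of mTimerFlag: test_and_set() to acquire, clear() to release.
static std::atomic_flag lockFlag = ATOMIC_FLAG_INIT;
static int counter = 0; // shared state protected by lockFlag

static void addOne()
{
  while (lockFlag.test_and_set()) {
    // busy-wait until the previous holder calls clear()
  }
  ++counter;
  lockFlag.clear();
}

int main()
{
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([] {
      for (int j = 0; j < 1000; ++j) {
        addOne();
      }
    });
  }
  for (std::thread& t : workers) {
    t.join();
  }
  return counter == 4000 ? 0 : 1; // all increments survive because access is serialized
}
```
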
8 changes: 4 additions & 4 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu
@@ -610,25 +610,25 @@ void GPUReconstructionCUDABackend::PrintKernelOccupancies()
GPUChkErr(cuOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocks, *mInternals->kernelFunctions[i], threads, 0));
GPUChkErr(cuFuncGetAttribute(&nRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, *mInternals->kernelFunctions[i]));
GPUChkErr(cuFuncGetAttribute(&sMem, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *mInternals->kernelFunctions[i]));
GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", mInternals->kernelNames[i].c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
GPUInfo("Kernel: %50s Block size: %4d, Maximum active blocks: %3d, Suggested blocks: %3d, Regs: %3d, smem: %3d", GetKernelName(i).c_str(), threads, maxBlocks, suggestedBlocks, nRegs, sMem);
}
}

void GPUReconstructionCUDA::loadKernelModules(bool perKernel)
{
uint32_t j = 0;
#define GPUCA_KRNL(x_class, ...) \
getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(mInternals->kernelFunctions.size()); \
if (GetKernelNum<GPUCA_M_KRNL_TEMPLATE(x_class)>() != j) { \
GPUFatal("kernel numbers out of sync"); \
} \
mInternals->kernelFunctions.emplace_back(new CUfunction); \
mInternals->kernelNames.emplace_back(GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class)))); \
if (mProcessingSettings.debugLevel >= 3) { \
GPUInfo("Loading kernel %s (j = %u)", GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))), j); \
} \
GPUChkErr(cuModuleGetFunction(mInternals->kernelFunctions.back().get(), *mInternals->kernelModules[perKernel ? j : 0], GPUCA_M_STR(GPUCA_M_CAT(krnl_, GPUCA_M_KRNL_NAME(x_class))))); \
j++;
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

if (j != mInternals->kernelModules.size()) {
GPUFatal("Did not load all kernels (%u < %u)", j, (uint32_t)mInternals->kernelModules.size());
}
5 changes: 0 additions & 5 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h
@@ -46,11 +46,6 @@ class GPUReconstructionCUDABackend : public GPUReconstructionDeviceBase
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I = 0>
gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
template <class T, int32_t I>
class backendInternal;

template <bool multi, class T, int32_t I = 0>
static int32_t getRTCkernelNum(int32_t k = -1);

void getRTCKernelCalls(std::vector<std::string>& kernels);

13 changes: 10 additions & 3 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
@@ -31,11 +31,12 @@ QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command);
QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_arch);
QGET_LD_BINARY_SYMBOLS(GPUReconstructionCUDArtc_command_no_fast_math);

#include "GPUNoFastMathKernels.h"

int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
{
std::string rtcparam = std::string("#define GPUCA_RTC_CODE\n") +
std::string(mProcessingSettings.rtc.optSpecialCode ? "#define GPUCA_RTC_SPECIAL_CODE(...) __VA_ARGS__\n" : "#define GPUCA_RTC_SPECIAL_CODE(...)\n") +
std::string(mProcessingSettings.rtc.deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n") +
GPUParamRTC::generateRTCCode(param(), mProcessingSettings.rtc.optConstexpr);
if (filename == "") {
filename = "/tmp/o2cagpu_rtc_";
@@ -54,7 +55,6 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
std::string baseCommand = (mProcessingSettings.RTCprependCommand != "" ? (mProcessingSettings.RTCprependCommand + " ") : "");
baseCommand += (getenv("O2_GPU_RTC_OVERRIDE_CMD") ? std::string(getenv("O2_GPU_RTC_OVERRIDE_CMD")) : std::string(_binary_GPUReconstructionCUDArtc_command_start, _binary_GPUReconstructionCUDArtc_command_len));
baseCommand += std::string(" ") + (mProcessingSettings.RTCoverrideArchitecture != "" ? mProcessingSettings.RTCoverrideArchitecture : std::string(_binary_GPUReconstructionCUDArtc_command_arch_start, _binary_GPUReconstructionCUDArtc_command_arch_len));
baseCommand += mProcessingSettings.rtc.deterministic ? (std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len)) : std::string("");

char shasource[21], shaparam[21], shacmd[21], shakernels[21];
if (mProcessingSettings.rtc.cacheOutput) {
@@ -169,13 +169,20 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile)
kernel += mProcessingSettings.rtc.compilePerKernel ? kernels[i] : kernelsall;
kernel += "}";

if (fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
bool deterministic = mProcessingSettings.rtc.deterministic || o2::gpu::internal::noFastMathKernels.find(GetKernelName(i)) != o2::gpu::internal::noFastMathKernels.end();
const std::string deterministicStr = std::string(deterministic ? "#define GPUCA_DETERMINISTIC_CODE(det, indet) det\n" : "#define GPUCA_DETERMINISTIC_CODE(det, indet) indet\n");

if (fwrite(deterministicStr.c_str(), 1, deterministicStr.size(), fp) != deterministicStr.size() ||
fwrite(rtcparam.c_str(), 1, rtcparam.size(), fp) != rtcparam.size() ||
fwrite(_binary_GPUReconstructionCUDArtc_src_start, 1, _binary_GPUReconstructionCUDArtc_src_len, fp) != _binary_GPUReconstructionCUDArtc_src_len ||
fwrite(kernel.c_str(), 1, kernel.size(), fp) != kernel.size()) {
throw std::runtime_error("Error writing file");
}
fclose(fp);
std::string command = baseCommand;
if (deterministic) {
command += std::string(" ") + std::string(_binary_GPUReconstructionCUDArtc_command_no_fast_math_start, _binary_GPUReconstructionCUDArtc_command_no_fast_math_len);
}
command += " -c " + filename + "_" + std::to_string(i) + mRtcSrcExtension + " -o " + filename + "_" + std::to_string(i) + mRtcBinExtension;
if (mProcessingSettings.debugLevel < 0) {
command += " &> /dev/null";
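
genRTC() now chooses the deterministic build per kernel: a kernel is compiled without fast math if the global rtc.deterministic setting is on or if its name appears in o2::gpu::internal::noFastMathKernels, and the same flag selects which branch of GPUCA_DETERMINISTIC_CODE is defined into the generated source. A reduced sketch of that selection logic follows; the set contents and the literal "-fno-fast-math" are placeholders, since the real flags come from the embedded no_fast_math command fragment.

```cpp
#include <set>
#include <string>

// Kernels that always need the deterministic build, even when the global setting is off.
// The entry here is a placeholder for the generated GPUNoFastMathKernels.h list.
static const std::set<std::string> noFastMathKernels = {"ExampleKernel"};

static std::string buildCompileCommand(const std::string& baseCommand, const std::string& kernelName,
                                        bool deterministicSetting)
{
  // Same decision as in genRTC(): global setting OR per-kernel opt-in.
  const bool deterministic = deterministicSetting || noFastMathKernels.count(kernelName) != 0;
  std::string command = baseCommand;
  if (deterministic) {
    command += " -fno-fast-math"; // placeholder for the embedded no_fast_math command fragment
  }
  return command;
}

int main()
{
  // Only the listed kernel picks up the extra flag when the global setting is off.
  const bool flagged = buildCompileCommand("compiler", "ExampleKernel", false).find("fno-fast-math") != std::string::npos;
  const bool plain = buildCompileCommand("compiler", "OtherKernel", false).find("fno-fast-math") == std::string::npos;
  return (flagged && plain) ? 0 : 1;
}
```
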
GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAInternals.h
@@ -30,7 +30,6 @@ namespace o2::gpu
struct GPUReconstructionCUDAInternals {
std::vector<std::unique_ptr<CUmodule>> kernelModules; // module for RTC compilation
std::vector<std::unique_ptr<CUfunction>> kernelFunctions; // vector of ptrs to RTC kernels
std::vector<std::string> kernelNames; // names of kernels
cudaStream_t Streams[GPUCA_MAX_STREAMS]; // Pointer to array of CUDA Streams

static void getArgPtrs(const void** pArgs) {}
18 changes: 1 addition & 17 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernels.cu
@@ -55,7 +55,7 @@ inline void GPUReconstructionCUDABackend::runKernelBackendInternal(const krnlSet
#endif
pArgs[arg_offset] = &y.index;
GPUReconstructionCUDAInternals::getArgPtrs(&pArgs[arg_offset + 1], args...);
GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[getRTCkernelNum<false, T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
GPUChkErr(cuLaunchKernel(*mInternals->kernelFunctions[GetKernelNum<T, I>()], x.nBlocks, 1, 1, x.nThreads, 1, 1, 0, mInternals->Streams[x.stream], (void**)pArgs, nullptr));
}
}

@@ -111,22 +111,6 @@ void GPUReconstructionCUDABackend::runKernelBackend(const krnlSetupArgs<T, I, Ar
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

template <bool multi, class T, int32_t I>
int32_t GPUReconstructionCUDABackend::getRTCkernelNum(int32_t k)
{
static int32_t num = k;
if (num < 0) {
throw std::runtime_error("Invalid kernel");
}
return num;
}

#define GPUCA_KRNL(x_class, ...) \
template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<false, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k); \
template int32_t GPUReconstructionCUDABackend::getRTCkernelNum<true, GPUCA_M_KRNL_TEMPLATE(x_class)>(int32_t k);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL

void GPUReconstructionCUDABackend::getRTCKernelCalls(std::vector<std::string>& kernels)
{
#define GPUCA_KRNL(...) kernels.emplace_back(GPUCA_M_STR(GPUCA_KRNLGPU(__VA_ARGS__)));
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx
@@ -399,7 +399,7 @@ int32_t GPUReconstructionOCLBackend::ExitDevice_Runtime()
clReleaseMemObject(mInternals->mem_gpu);
clReleaseMemObject(mInternals->mem_constant);
for (uint32_t i = 0; i < mInternals->kernels.size(); i++) {
clReleaseKernel(mInternals->kernels[i].first);
clReleaseKernel(mInternals->kernels[i]);
}
mInternals->kernels.clear();
}
2 changes: 0 additions & 2 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h
@@ -56,8 +56,6 @@ class GPUReconstructionOCLBackend : public GPUReconstructionDeviceBase

template <class T, int32_t I = 0>
int32_t AddKernel();
template <class T, int32_t I = 0>
uint32_t FindKernel();
template <class T, int32_t I = 0, typename... Args>
void runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I = 0>
GPU/GPUTracking/Base/opencl/GPUReconstructionOCLInternals.h
@@ -49,7 +49,7 @@ struct GPUReconstructionOCLInternals {
cl_mem mem_host;
cl_program program;

std::vector<std::pair<cl_kernel, std::string>> kernels;
std::vector<cl_kernel> kernels;
};
} // namespace o2::gpu

19 changes: 2 additions & 17 deletions GPU/GPUTracking/Base/opencl/GPUReconstructionOCLKernels.cxx
@@ -58,20 +58,6 @@ void GPUReconstructionOCLBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
std::apply([this, &args](auto&... vals) { runKernelBackendInternal<T, I, Args...>(args.s, vals...); }, args.v);
}

template <class T, int32_t I>
inline uint32_t GPUReconstructionOCLBackend::FindKernel()
{
std::string name(GetKernelName<T, I>());

for (uint32_t k = 0; k < mInternals->kernels.size(); k++) {
if (mInternals->kernels[k].second == name) {
return (k);
}
}
GPUError("Could not find OpenCL kernel %s", name.c_str());
throw ::std::runtime_error("Requested unsupported OpenCL kernel");
}

template <class T, int32_t I>
int32_t GPUReconstructionOCLBackend::AddKernel()
{
@@ -84,15 +70,14 @@ int32_t GPUReconstructionOCLBackend::AddKernel()
GPUError("Error creating OPENCL Kernel: %s", name.c_str());
return 1;
}
mInternals->kernels.emplace_back(krnl, name);
mInternals->kernels.emplace_back(krnl);
return 0;
}

template <class S, class T, int32_t I>
S& GPUReconstructionOCLBackend::getKernelObject()
{
static uint32_t krnl = FindKernel<T, I>();
return mInternals->kernels[krnl].first;
return mInternals->kernels[GetKernelNum<T, I>()];
}

int32_t GPUReconstructionOCLBackend::AddKernels()
4 changes: 4 additions & 0 deletions GPU/GPUTracking/CMakeLists.txt
@@ -242,6 +242,10 @@ file(GENERATE
OUTPUT include_gpu_onthefly/GPUReconstructionIncludesDeviceAll.h
INPUT Base/GPUReconstructionIncludesDeviceAll.template.h
)
file(GENERATE
OUTPUT include_gpu_onthefly/GPUNoFastMathKernels.h
INPUT cmake/GPUNoFastMathKernels.template.h
)
if(NOT ALIGPU_BUILD_TYPE STREQUAL "O2")
include_directories(${CMAKE_CURRENT_BINARY_DIR}/include_gpu_onthefly)
endif()