Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 50 additions & 29 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,9 @@
#include <condition_variable>
#include <array>

#ifdef WITH_OPENMP
#include <omp.h>
#endif

#include "GPUReconstruction.h"
#include "GPUReconstructionIncludes.h"
#include "GPUReconstructionThreading.h"
#include "GPUROOTDumpCore.h"
#include "GPUConfigDump.h"
#include "GPUChainTracking.h"
Expand Down Expand Up @@ -121,17 +118,18 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
}
}

int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
void GPUReconstruction::SetNActiveThreads(int32_t n)
{
#ifdef WITH_OPENMP
omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
if (mProcessingSettings.debugLevel >= 3) {
GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
}
return n > mMaxOMPThreads;
#else
return 1;
#endif
}

/// Index of the calling thread as reported by tbb::this_task_arena::current_thread_index(),
/// clamped so that the result is never negative.
/// @return the reported index, or 0 when TBB reports a negative value
///         (NOTE(review): presumably the "not inside an arena" marker - confirm against the oneTBB docs).
int32_t GPUReconstruction::getHostThreadIndex()
{
  const int32_t arenaSlot = tbb::this_task_arena::current_thread_index();
  return arenaSlot > 0 ? arenaSlot : 0;
}

int32_t GPUReconstruction::Init()
Expand Down Expand Up @@ -197,6 +195,24 @@ int32_t GPUReconstruction::Init()
return 0;
}

namespace o2::gpu::internal
{
/// Determine the default number of host threads.
///
/// The sources are consulted in this order, the first one yielding a positive value wins:
///   1. the TBB_NUM_THREADS environment variable,
///   2. the OMP_NUM_THREADS environment variable,
///   3. tbb::info::default_concurrency().
/// Unset, non-numeric, zero or negative environment values are ignored.
///
/// @return the number of threads taken from the first applicable source above
static uint32_t getDefaultNThreads()
{
  // Parse an environment variable as a thread count; 0 means "not usable".
  // Negative numbers are rejected here, otherwise they would wrap to a huge unsigned value.
  const auto threadsFromEnv = [](const char* name) -> uint32_t {
    const char* value = getenv(name);
    const int32_t parsed = value ? atoi(value) : 0;
    return parsed > 0 ? static_cast<uint32_t>(parsed) : 0u;
  };

  if (const uint32_t tbbNum = threadsFromEnv("TBB_NUM_THREADS")) {
    return tbbNum;
  }
  if (const uint32_t ompNum = threadsFromEnv("OMP_NUM_THREADS")) {
    return ompNum; // was returning tbbNum (always 0 at this point), which discarded the OpenMP setting
  }
  return tbb::info::default_concurrency();
}
} // namespace o2::gpu::internal

int32_t GPUReconstruction::InitPhaseBeforeDevice()
{
if (mProcessingSettings.printSettings) {
Expand Down Expand Up @@ -299,32 +315,37 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
}

#ifdef WITH_OPENMP
if (mProcessingSettings.ompThreads <= 0) {
mProcessingSettings.ompThreads = omp_get_max_threads();
} else {
mProcessingSettings.ompAutoNThreads = false;
omp_set_num_threads(mProcessingSettings.ompThreads);
if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) {
GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
} else if (mProcessingSettings.ompThreads != -1) {
mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads;
GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
}
if (mProcessingSettings.ompKernels) {
if (omp_get_max_active_levels() < 2) {
omp_set_max_active_levels(2);
}

if (mProcessingSettings.nHostThreads <= 0) {
mProcessingSettings.nHostThreads = internal::getDefaultNThreads();
} else {
mProcessingSettings.autoAdjustHostThreads = false;
}
mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
if (mMaster == nullptr) {
mThreading = std::make_shared<GPUReconstructionThreading>();
mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
} else {
mThreading = mMaster->mThreading;
}
#else
mProcessingSettings.ompThreads = 1;
#endif
mMaxOMPThreads = mProcessingSettings.ompThreads;
mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads);
if (IsGPU()) {
mNStreams = std::max<int32_t>(mProcessingSettings.nStreams, 3);
}

if (mProcessingSettings.nTPCClustererLanes == -1) {
mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.ompKernels ? (mProcessingSettings.ompThreads >= 4 ? std::min<int32_t>(mProcessingSettings.ompThreads / 2, mProcessingSettings.ompThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mProcessingSettings.ompThreads));
mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max<int32_t>(1, std::min<int32_t>(GPUCA_NSLICES, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min<int32_t>(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mMaxHostThreads));
}
if (mProcessingSettings.overrideClusterizerFragmentLen == -1) {
mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mProcessingSettings.ompThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
}
if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSLICES) {
GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes);
Expand Down
19 changes: 12 additions & 7 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ namespace gpu
class GPUChain;
struct GPUMemorySizeScalers;
struct GPUReconstructionPipelineContext;
struct GPUReconstructionThreading;
class GPUROOTDumpCore;

namespace gpu_reconstruction_kernels
Expand Down Expand Up @@ -206,8 +207,8 @@ class GPUReconstruction
void SetOutputControl(void* ptr, size_t size);
void SetInputControl(void* ptr, size_t size);
GPUOutputControl& OutputControl() { return mOutputControl; }
int32_t GetMaxThreads() const { return mMaxThreads; }
int32_t SetNOMPThreads(int32_t n);
int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
void SetNActiveThreads(int32_t n);
int32_t NStreams() const { return mNStreams; }
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }

Expand All @@ -234,6 +235,9 @@ class GPUReconstruction
double GetStatKernelTime() { return mStatKernelTime; }
double GetStatWallTime() { return mStatWallTime; }

std::shared_ptr<GPUReconstructionThreading> mThreading;
static int32_t getHostThreadIndex();

protected:
void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
void FreeRegisteredMemory(GPUMemoryResource* res);
Expand Down Expand Up @@ -343,11 +347,12 @@ class GPUReconstruction
std::shared_ptr<GPUROOTDumpCore> mROOTDump;
std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;

int32_t mMaxThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
int32_t mThreadId = -1; // Thread ID that is valid for the local CUDA context
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
int32_t mNStreams = 1; // Number of parallel GPU streams
int32_t mMaxOMPThreads = 0; // Maximum number of OMP threads
int32_t mMaxBackendThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
int32_t mThreadId = -1; // Thread ID that is valid for the local CUDA context
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
int32_t mNStreams = 1; // Number of parallel GPU streams
int32_t mMaxHostThreads = 0; // Maximum number of host threads (upper bound for mActiveHostKernelThreads)
int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels

// Management for GPUProcessors
struct ProcessorData {
Expand Down
93 changes: 39 additions & 54 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "GPUReconstructionCPU.h"
#include "GPUReconstructionIncludes.h"
#include "GPUReconstructionThreading.h"
#include "GPUChain.h"

#include "GPUTPCClusterData.h"
Expand All @@ -40,13 +41,6 @@
#include <unistd.h>
#endif

#if defined(WITH_OPENMP) || defined(_OPENMP)
#include <omp.h>
#else
static inline int32_t omp_get_thread_num() { return 0; }
static inline int32_t omp_get_max_threads() { return 1; }
#endif

using namespace o2::gpu;
using namespace o2::gpu::gpu_reconstruction_kernels;

Expand All @@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
}

int32_t GPUReconstructionCPUBackend::getNOMPThreads()
int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores)
{
int32_t ompThreads = 0;
if (mProcessingSettings.ompKernels == 2) {
ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor;
if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) {
ompThreads++;
int32_t nThreads = 0;
if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) {
if (splitCores) {
nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop;
nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop;
} else {
nThreads = mMaxHostThreads;
}
ompThreads = std::max(1, ompThreads);
nThreads = std::max(1, nThreads);
} else {
ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1;
nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1;
}
return ompThreads;
return nThreads;
}

template <class T, int32_t I, typename... Args>
Expand All @@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
}
uint32_t num = y.num == 0 || y.num == -1 ? 1 : y.num;
for (uint32_t k = 0; k < num; k++) {
int32_t ompThreads = getNOMPThreads();
if (ompThreads > 1) {
int32_t nThreads = getNKernelHostThreads(false);
if (nThreads > 1) {
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d ompThreads\n", ompThreads);
}
GPUCA_OPENMP(parallel for num_threads(ompThreads))
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
typename T::GPUSharedMemory smem;
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
printf("Running %d Threads\n", nThreads);
}
mThreading->activeThreads->execute([&] {
tbb::parallel_for(tbb::blocked_range<uint32_t>(0, x.nBlocks, 1), [&](const tbb::blocked_range<uint32_t>& r) {
typename T::GPUSharedMemory smem;
for (uint32_t iB = r.begin(); iB < r.end(); iB++) {
T::template Thread<I>(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...);
}
});
});
} else {
for (uint32_t iB = 0; iB < x.nBlocks; iB++) {
typename T::GPUSharedMemory smem;
Expand All @@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
template <>
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
#ifdef WITH_OPENMP
int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
if (nOMPThreads > 1) {
GPUCA_OPENMP(parallel num_threads(nOMPThreads))
{
size_t threadSize = size / omp_get_num_threads();
int32_t nnThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
if (nnThreads > 1) {
tbb::parallel_for(0, nnThreads, [&](int iThread) {
size_t threadSize = size / nnThreads;
if (threadSize % 4096) {
threadSize += 4096 - threadSize % 4096;
}
size_t offset = threadSize * omp_get_thread_num();
size_t offset = threadSize * iThread;
size_t mySize = std::min<size_t>(threadSize, size - offset);
if (mySize) {
memset((char*)ptr + offset, 0, mySize);
}
}
} else
#endif
{
} // clang-format off
}, tbb::static_partitioner()); // clang-format on
} else {
memset(ptr, 0, size);
}
return 0;
Expand Down Expand Up @@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice()
mHostMemoryPermanent = mHostMemoryBase;
ClearAllocatedMemory();
}
if (mProcessingSettings.ompKernels) {
mBlockCount = getOMPMaxThreads();
if (mProcessingSettings.inKernelParallel) {
mBlockCount = mMaxHostThreads;
}
mThreadId = GetThread();
mProcShadow.mProcessorsProc = processors();
Expand Down Expand Up @@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
}
}

int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
{
return omp_get_thread_num();
}

int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
{
return omp_get_max_threads();
}

static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation

GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
Expand Down Expand Up @@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId()
return id.fetch_add(1);
}

uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max)
uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
{
if (condition && mProcessingSettings.ompKernels != 1) {
mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? std::min<uint32_t>(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads;
if (condition && mProcessingSettings.inKernelParallel != 1) {
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
} else {
mNestedLoopOmpFactor = 1;
mNActiveThreadsOuterLoop = 1;
}
if (mProcessingSettings.debugLevel >= 5) {
printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor);
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
}
return mNestedLoopOmpFactor;
return mNActiveThreadsOuterLoop;
}

void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
Expand Down
18 changes: 8 additions & 10 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,8 @@ class GPUReconstructionCPUBackend : public GPUReconstruction
int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args);
template <class T, int32_t I>
gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend();
uint32_t mNestedLoopOmpFactor = 1;
static int32_t getOMPThreadNum();
static int32_t getOMPMaxThreads();
int32_t getNOMPThreads();
uint32_t mNActiveThreadsOuterLoop = 1;
int32_t getNKernelHostThreads(bool splitCores);
};

class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCPUBackend>
Expand Down Expand Up @@ -81,8 +79,8 @@ class GPUReconstructionCPU : public GPUReconstructionKernels<GPUReconstructionCP
HighResTimer& getRecoStepTimer(RecoStep step) { return mTimersRecoSteps[getRecoStepNum(step)].timerTotal; }
HighResTimer& getGeneralStepTimer(GeneralStep step) { return mTimersGeneralSteps[getGeneralStepNum(step)]; }

void SetNestedLoopOmpFactor(uint32_t f) { mNestedLoopOmpFactor = f; }
uint32_t SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max);
void SetNActiveThreadsOuterLoop(uint32_t f) { mNActiveThreadsOuterLoop = f; }
uint32_t SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max);

void UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream = -1);

Expand Down Expand Up @@ -220,8 +218,8 @@ inline int32_t GPUReconstructionCPU::runKernel(krnlSetup&& setup, Args&&... args
return 0;
}
if (mProcessingSettings.debugLevel >= 1) {
t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getOMPThreadNum() : stream);
if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNestedLoopOmpFactor < 2 || getOMPThreadNum() == 0)) {
t = &getKernelTimer<S, I>(myStep, !IsGPU() || cpuFallback ? getHostThreadIndex() : stream);
if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) {
t->Start();
}
}
Expand Down Expand Up @@ -287,11 +285,11 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num)
static int32_t id = getNextTimerId();
timerMeta* timer = getTimerById(id);
if (timer == nullptr) {
int32_t max = std::max<int32_t>({getOMPMaxThreads(), mProcessingSettings.nStreams});
int32_t max = std::max<int32_t>({mMaxHostThreads, mProcessingSettings.nStreams});
timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep);
}
if (num == -1) {
num = getOMPThreadNum();
num = getHostThreadIndex();
}
if (num < 0 || num >= timer->num) {
throw std::runtime_error("Invalid timer requested");
Expand Down
Loading