32 changes: 8 additions & 24 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -23,12 +23,9 @@
#include <condition_variable>
#include <array>

#ifdef WITH_OPENMP
#include <omp.h>
#endif

#include "GPUReconstruction.h"
#include "GPUReconstructionIncludes.h"
#include "GPUReconstructionThreading.h"
#include "GPUROOTDumpCore.h"
#include "GPUConfigDump.h"
#include "GPUChainTracking.h"
@@ -121,17 +118,12 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
}
}

int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
void GPUReconstruction::SetNOMPThreads(int32_t n)
{
#ifdef WITH_OPENMP
omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads));
if (mProcessingSettings.debugLevel >= 3) {
GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
GPUInfo("Set number of parallel threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
}
return n > mMaxOMPThreads;
#else
return 1;
#endif
}

int32_t GPUReconstruction::Init()
@@ -299,23 +291,15 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
}

#ifdef WITH_OPENMP
if (mProcessingSettings.ompThreads <= 0) {
mProcessingSettings.ompThreads = omp_get_max_threads();
mProcessingSettings.ompThreads = tbb::info::default_concurrency();
} else {
mProcessingSettings.ompAutoNThreads = false;
omp_set_num_threads(mProcessingSettings.ompThreads);
}
if (mProcessingSettings.ompKernels) {
if (omp_get_max_active_levels() < 2) {
omp_set_max_active_levels(2);
}
}
#else
mProcessingSettings.ompThreads = 1;
#endif
mMaxOMPThreads = mProcessingSettings.ompThreads;
mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxOMPThreads);
mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxOMPThreads);
mMaxThreads = std::max(mMaxThreads, mMaxOMPThreads);
if (IsGPU()) {
mNStreams = std::max<int32_t>(mProcessingSettings.nStreams, 3);
}
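For reference, a minimal self-contained sketch of the tbb::global_control plus tbb::task_arena pairing that the rewritten InitPhaseBeforeDevice relies on: the global_control caps process-wide parallelism, the arena is where work is explicitly submitted. The thread cap of 8 and all other names are illustrative assumptions, not values from this PR.

```cpp
#include <oneapi/tbb.h>
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  // Pick the thread count the way the new code does: hardware default unless explicitly capped.
  int32_t nThreads = tbb::info::default_concurrency();
  nThreads = std::max(1, std::min(nThreads, 8)); // assumed cap, analogous to mMaxOMPThreads

  // Process-wide upper bound on TBB workers (the role of mThreading->control) ...
  tbb::global_control limit(tbb::global_control::max_allowed_parallelism, nThreads);
  // ... plus an arena that parallel work is submitted to (the role of mThreading->allThreads).
  tbb::task_arena arena(nThreads);

  std::vector<int32_t> out(1000);
  arena.execute([&] {
    tbb::parallel_for(0, (int32_t)out.size(), [&](int32_t i) { out[i] = i * i; });
  });
  std::printf("out.back() = %d, ran with at most %d threads\n", out.back(), nThreads);
  return 0;
}
```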
5 changes: 4 additions & 1 deletion GPU/GPUTracking/Base/GPUReconstruction.h
@@ -51,6 +51,7 @@ namespace gpu
class GPUChain;
struct GPUMemorySizeScalers;
struct GPUReconstructionPipelineContext;
struct GPUReconstructionThreading;
class GPUROOTDumpCore;

namespace gpu_reconstruction_kernels
@@ -207,7 +208,7 @@ class GPUReconstruction
void SetInputControl(void* ptr, size_t size);
GPUOutputControl& OutputControl() { return mOutputControl; }
int32_t GetMaxThreads() const { return mMaxThreads; }
int32_t SetNOMPThreads(int32_t n);
void SetNOMPThreads(int32_t n);
int32_t NStreams() const { return mNStreams; }
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }

@@ -234,6 +235,8 @@ class GPUReconstruction
double GetStatKernelTime() { return mStatKernelTime; }
double GetStatWallTime() { return mStatWallTime; }

std::unique_ptr<GPUReconstructionThreading> mThreading;

protected:
void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
void FreeRegisteredMemory(GPUMemoryResource* res);
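The new mThreading member is declared against a forward declaration only, so GPUReconstruction.h does not need <oneapi/tbb.h>. A minimal sketch of that unique_ptr-to-incomplete-type pattern (class and member names here are placeholders): the owner's destructor has to be defined where the struct is complete, or unique_ptr cannot instantiate its deleter.

```cpp
#include <cstdint>
#include <memory>

namespace sketch
{
struct Threading; // forward declaration only -- keeps <oneapi/tbb.h> out of the "header"

class Owner
{
 public:
  Owner();
  ~Owner(); // declared here, defined below where Threading is a complete type
  std::unique_ptr<Threading> mThreading;
};

// ----- "source file" part: the full definition is visible from here on -----
struct Threading {
  int32_t nThreads = 1; // stands in for the tbb::global_control / tbb::task_arena members
};

Owner::Owner() : mThreading(std::make_unique<Threading>()) {}
Owner::~Owner() = default; // unique_ptr's deleter is instantiated here, with a complete type
} // namespace sketch

int main()
{
  sketch::Owner o;
  return o.mThreading->nThreads - 1; // 0
}
```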
26 changes: 8 additions & 18 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -14,6 +14,7 @@

#include "GPUReconstructionCPU.h"
#include "GPUReconstructionIncludes.h"
#include "GPUReconstructionThreading.h"
#include "GPUChain.h"

#include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
#include <unistd.h>
#endif

#if defined(WITH_OPENMP) || defined(_OPENMP)
#include <omp.h>
#else
static inline int32_t omp_get_thread_num() { return 0; }
static inline int32_t omp_get_max_threads() { return 1; }
#endif

using namespace o2::gpu;
using namespace o2::gpu::gpu_reconstruction_kernels;

@@ -111,24 +105,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
template <>
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
#ifdef WITH_OPENMP
int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
if (nOMPThreads > 1) {
GPUCA_OPENMP(parallel num_threads(nOMPThreads))
{
size_t threadSize = size / omp_get_num_threads();
tbb::parallel_for(0, nOMPThreads, [&](int iThread) {
size_t threadSize = size / nOMPThreads;
if (threadSize % 4096) {
threadSize += 4096 - threadSize % 4096;
}
size_t offset = threadSize * omp_get_thread_num();
size_t offset = threadSize * iThread;
size_t mySize = std::min<size_t>(threadSize, size - offset);
if (mySize) {
memset((char*)ptr + offset, 0, mySize);
}
}
} else
#endif
{
} // clang-format off
}, tbb::static_partitioner()); // clang-format on
} else {
memset(ptr, 0, size);
}
return 0;
@@ -353,7 +343,7 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()

int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
{
return omp_get_thread_num();
return tbb::this_task_arena::current_thread_index();
}

int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()
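A standalone sketch of the chunked-clear pattern that replaces the OpenMP GPUMemClean16 branch above: tbb::parallel_for over the task index with a static_partitioner, so each task memsets one contiguous, 4 KiB-aligned chunk. The buffer size and the function name are assumptions for illustration.

```cpp
#include <oneapi/tbb.h>
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

void clearParallel(char* ptr, size_t size, int32_t nThreads)
{
  // One task per >= 16 MiB of data, exactly like the kernel above.
  nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), nThreads));
  if (nThreads > 1) {
    tbb::parallel_for(0, nThreads, [&](int32_t iThread) {
      // (tbb::this_task_arena::current_thread_index() now plays the role of omp_get_thread_num())
      size_t threadSize = size / nThreads;
      if (threadSize % 4096) {
        threadSize += 4096 - threadSize % 4096; // round chunks up to 4 KiB
      }
      size_t offset = threadSize * iThread;
      size_t mySize = std::min<size_t>(threadSize, size - offset);
      if (mySize) {
        memset(ptr + offset, 0, mySize); // each task clears its own chunk
      }
    }, tbb::static_partitioner()); // one contiguous block of indices per task
  } else {
    memset(ptr, 0, size);
  }
}

int main()
{
  std::vector<char> buf(64 * 1024 * 1024, 1);
  clearParallel(buf.data(), buf.size(), tbb::info::default_concurrency());
  return buf[0]; // 0 after clearing
}
```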
114 changes: 64 additions & 50 deletions GPU/GPUTracking/Base/GPUReconstructionConvert.cxx
@@ -40,6 +40,8 @@
#include "TPCBase/CRU.h"
#include "DetectorsRaw/RDHUtils.h"

#include <oneapi/tbb.h>

using namespace o2::gpu;
using namespace o2::tpc;
using namespace o2::tpc::constants;
@@ -1306,6 +1308,17 @@ size_t zsEncoderRun<T>::compare(std::vector<zsPage>* buffer, std::vector<o2::tpc
} // anonymous namespace
#endif // GPUCA_TPC_GEOMETRY_O2

namespace o2::gpu::internal
{
struct tmpReductionResult {
uint32_t totalPages = 0;
size_t totalSize = 0;
size_t nErrors = 0;
size_t digitsInput = 0;
size_t digitsEncoded = 0;
};
} // namespace o2::gpu::internal

template <class S>
void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr<uint64_t[]>* outBuffer, uint32_t* outSizes, o2::raw::RawFileWriter* raw, const o2::InteractionRecord* ir, const GPUParam& param, int32_t version, bool verify, float threshold, bool padding, std::function<void(std::vector<o2::tpc::Digit>&)> digitsFilter)
{
@@ -1316,64 +1329,65 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr<uint64_
}
#ifdef GPUCA_TPC_GEOMETRY_O2
std::vector<zsPage> buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS];
uint32_t totalPages = 0;
size_t totalSize = 0;
size_t nErrors = 0;
size_t digitsInput = 0;
size_t digitsEncoded = 0;
// clang-format off
GPUCA_OPENMP(parallel for reduction(+ : totalPages, nErrors, totalSize, digitsInput, digitsEncoded))
// clang-format on
for (uint32_t i = 0; i < NSLICES; i++) {
std::vector<o2::tpc::Digit> tmpBuffer;
digitsInput += ZSEncoderGetNDigits(in, i);
tmpBuffer.resize(ZSEncoderGetNDigits(in, i));
if (threshold > 0.f && !digitsFilter) {
auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
} else {
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin());
}

if (digitsFilter) {
digitsFilter(tmpBuffer);
if (threshold > 0.f) {
std::vector<o2::tpc::Digit> tmpBuffer2 = std::move(tmpBuffer);
tmpBuffer = std::vector<o2::tpc::Digit>(tmpBuffer2.size());
auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
tbb::parallel_reduce(tbb::blocked_range<uint32_t>(0, NSLICES), o2::gpu::internal::tmpReductionResult(), [&](const auto& range, auto red) {
for (uint32_t i = range.begin(); i < range.end(); ++i) {
std::vector<o2::tpc::Digit> tmpBuffer;
red.digitsInput += ZSEncoderGetNDigits(in, i);
tmpBuffer.resize(ZSEncoderGetNDigits(in, i));
if (threshold > 0.f && !digitsFilter) {
auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
} else {
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin());
}
}
digitsEncoded += tmpBuffer.size();

auto runZS = [&](auto& encoder) {
encoder.zsVersion = version;
encoder.init();
totalPages += encoder.run(buffer[i], tmpBuffer, &totalSize);
if (verify) {
nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification

if (digitsFilter) {
digitsFilter(tmpBuffer);
if (threshold > 0.f) {
std::vector<o2::tpc::Digit> tmpBuffer2 = std::move(tmpBuffer);
tmpBuffer = std::vector<o2::tpc::Digit>(tmpBuffer2.size());
auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
}
}
};
red.digitsEncoded += tmpBuffer.size();

auto runZS = [&](auto& encoder) {
encoder.zsVersion = version;
encoder.init();
red.totalPages += encoder.run(buffer[i], tmpBuffer, &red.totalSize);
if (verify) {
red.nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification
}
};

if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) {
zsEncoderRun<zsEncoderRow> enc{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}};
runZS(enc);
} else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
#ifdef GPUCA_O2_LIB
if (version == ZSVersion::ZSVersionLinkBasedWithMeta) {
zsEncoderRun<zsEncoderImprovedLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) {
zsEncoderRun<zsEncoderRow> enc{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}};
runZS(enc);
} else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
zsEncoderRun<zsEncoderDenseLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
runZS(enc);
}
} else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
#ifdef GPUCA_O2_LIB
if (version == ZSVersion::ZSVersionLinkBasedWithMeta) {
zsEncoderRun<zsEncoderImprovedLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
runZS(enc);
} else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
zsEncoderRun<zsEncoderDenseLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
runZS(enc);
}
#else
throw std::runtime_error("Link based ZS encoding not supported in standalone build");
throw std::runtime_error("Link based ZS encoding not supported in standalone build");
#endif
} else {
throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s);
} else {
throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s);
}
}
}
return red; }, [&](const auto& red1, const auto& red2) {
auto red = red1;
red.totalPages += red2.totalPages;
red.totalSize += red2.totalSize;
red.nErrors += red2.nErrors;
red.digitsInput += red2.digitsInput;
red.digitsEncoded += red2.digitsEncoded;
return red; });

if (outBuffer) {
outBuffer->reset(new uint64_t[totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]);
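The per-sector encoding loop now carries its counters through tbb::parallel_reduce instead of an OpenMP reduction clause. A self-contained sketch of that shape, with a stand-in accumulator struct in place of tmpReductionResult and trivial per-element work in place of the encoder call:

```cpp
#include <oneapi/tbb.h>
#include <cstdio>
#include <vector>

struct Totals {
  size_t sum = 0;     // plays the role of totalSize / totalPages
  size_t nErrors = 0; // plays the role of the verification error count
};

int main()
{
  std::vector<int> data(1000);
  for (size_t i = 0; i < data.size(); ++i) {
    data[i] = int(i % 7) - 1; // a few negative entries stand in for "errors"
  }

  Totals t = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, data.size()), Totals(),
    // body: each task folds its sub-range into a private copy of the accumulator
    [&](const tbb::blocked_range<size_t>& range, Totals red) {
      for (size_t i = range.begin(); i < range.end(); ++i) {
        if (data[i] < 0) {
          red.nErrors++;
        } else {
          red.sum += data[i];
        }
      }
      return red;
    },
    // join: combine two partial results, like the second lambda in the diff
    [](Totals a, const Totals& b) {
      a.sum += b.sum;
      a.nErrors += b.nErrors;
      return a;
    });

  std::printf("sum=%zu errors=%zu\n", t.sum, t.nErrors);
  return 0;
}
```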
4 changes: 0 additions & 4 deletions GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx
@@ -22,10 +22,6 @@
#include <unistd.h>
#endif

#ifdef WITH_OPENMP
#include <omp.h>
#endif

#include "GPUReconstruction.h"
#include "GPUReconstructionAvailableBackends.h"

59 changes: 59 additions & 0 deletions GPU/GPUTracking/Base/GPUReconstructionThreading.h
@@ -0,0 +1,59 @@
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
// All rights not expressly granted are reserved.
//
// This software is distributed under the terms of the GNU General Public
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
//
// In applying this license CERN does not waive the privileges and immunities
// granted to it by virtue of its status as an Intergovernmental Organization
// or submit itself to any jurisdiction.

/// \file GPUReconstructionThreading.h
/// \author David Rohr

#if !defined(GPURECONSTRUCTIONTHREADING_H)
#define GPURECONSTRUCTIONTHREADING_H

#if !defined(GPUCA_GPUCODE)
#include "GPUReconstruction.h"

#include <memory>
#include <oneapi/tbb.h>

namespace o2::gpu
{

struct GPUReconstructionThreading {
std::unique_ptr<tbb::global_control> control;
std::unique_ptr<tbb::task_arena> allThreads;
std::unique_ptr<tbb::task_arena> outerThreads;
};

} // namespace o2::gpu

#endif

#define GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \
for (vartype varname = get_global_id(0); varname < iEnd; varname += get_global_size(0)) { \
code \
}

#ifdef GPUCA_GPUCODE
#define GPUCA_TBB_KERNEL_LOOP GPUCA_TBB_KERNEL_LOOP_HOST
#else
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, code) \
if (!rec.GetProcessingSettings().ompKernels) { \
rec.mThreading->allThreads->execute([&] { \
tbb::parallel_for(tbb::blocked_range<vartype>(get_global_id(0), iEnd, get_global_size(0)), [&](const tbb::blocked_range<vartype>& _r_internal) { \
for (vartype varname = _r_internal.begin(); varname < _r_internal.end(); varname += get_global_size(0)) { \
code \
} \
}); \
}); \
} else { \
GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \
}
#endif

#endif
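What the host branch of GPUCA_TBB_KERNEL_LOOP boils down to, written out by hand. It assumes the common host launch where get_global_id(0) is 0 and get_global_size(0) is 1, so the blocked_range covers every index and TBB does the actual work distribution; the arena stands in for rec.mThreading->allThreads.

```cpp
#include <oneapi/tbb.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  const int32_t start = 0;  // stands in for get_global_id(0)
  const int32_t stride = 1; // stands in for get_global_size(0)
  const int32_t iEnd = 1000;
  std::vector<int32_t> out(iEnd, 0);

  tbb::task_arena arena(tbb::info::default_concurrency()); // role of rec.mThreading->allThreads
  arena.execute([&] {
    tbb::parallel_for(tbb::blocked_range<int32_t>(start, iEnd, stride), [&](const tbb::blocked_range<int32_t>& r) {
      for (int32_t i = r.begin(); i < r.end(); i += stride) {
        out[i] = 2 * i; // the kernel body that the macro receives as `code`
      }
    });
  });

  std::printf("out[10] = %d\n", out[10]); // 20
  return 0;
}
```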
7 changes: 1 addition & 6 deletions GPU/GPUTracking/Base/cuda/CMakeLists.txt
@@ -136,12 +136,7 @@ set_target_cuda_arch(${targetName})
#target_link_options(${targetName} PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld")
#set_target_properties(${targetName} PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld)

if(OpenMP_CXX_FOUND)
# Must be private, depending libraries might be compiled by compiler not understanding -fopenmp
target_compile_definitions(${targetName} PRIVATE WITH_OPENMP)
target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp")
endif()
target_link_libraries(${targetName} PRIVATE TBB::tbb)

# Special handling of GPU kernels in case of per-kernel compilation / RDC
if(NOT DEFINED GPUCA_CUDA_COMPILE_MODE)