Skip to content

Commit f6b40be

Browse files
committed
GPU: Replace OpenMP parallization with TBB
1 parent 96d683b commit f6b40be

22 files changed

+199
-227
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 8 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,9 @@
2323
#include <condition_variable>
2424
#include <array>
2525

26-
#ifdef WITH_OPENMP
27-
#include <omp.h>
28-
#endif
29-
3026
#include "GPUReconstruction.h"
3127
#include "GPUReconstructionIncludes.h"
28+
#include "GPUReconstructionThreading.h"
3229
#include "GPUROOTDumpCore.h"
3330
#include "GPUConfigDump.h"
3431
#include "GPUChainTracking.h"
@@ -121,17 +118,12 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
121118
}
122119
}
123120

124-
int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
121+
void GPUReconstruction::SetNOMPThreads(int32_t n)
125122
{
126-
#ifdef WITH_OPENMP
127-
omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
123+
mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads));
128124
if (mProcessingSettings.debugLevel >= 3) {
129-
GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
125+
GPUInfo("Set number of parallel threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
130126
}
131-
return n > mMaxOMPThreads;
132-
#else
133-
return 1;
134-
#endif
135127
}
136128

137129
int32_t GPUReconstruction::Init()
@@ -299,23 +291,15 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
299291
mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
300292
}
301293

302-
#ifdef WITH_OPENMP
303294
if (mProcessingSettings.ompThreads <= 0) {
304-
mProcessingSettings.ompThreads = omp_get_max_threads();
295+
mProcessingSettings.ompThreads = tbb::info::default_concurrency();
305296
} else {
306297
mProcessingSettings.ompAutoNThreads = false;
307-
omp_set_num_threads(mProcessingSettings.ompThreads);
308298
}
309-
if (mProcessingSettings.ompKernels) {
310-
if (omp_get_max_active_levels() < 2) {
311-
omp_set_max_active_levels(2);
312-
}
313-
}
314-
#else
315-
mProcessingSettings.ompThreads = 1;
316-
#endif
317299
mMaxOMPThreads = mProcessingSettings.ompThreads;
318-
mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
300+
mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxOMPThreads);
301+
mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxOMPThreads);
302+
mMaxThreads = std::max(mMaxThreads, mMaxOMPThreads);
319303
if (IsGPU()) {
320304
mNStreams = std::max<int32_t>(mProcessingSettings.nStreams, 3);
321305
}

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ namespace gpu
5151
class GPUChain;
5252
struct GPUMemorySizeScalers;
5353
struct GPUReconstructionPipelineContext;
54+
struct GPUReconstructionThreading;
5455
class GPUROOTDumpCore;
5556

5657
namespace gpu_reconstruction_kernels
@@ -207,7 +208,7 @@ class GPUReconstruction
207208
void SetInputControl(void* ptr, size_t size);
208209
GPUOutputControl& OutputControl() { return mOutputControl; }
209210
int32_t GetMaxThreads() const { return mMaxThreads; }
210-
int32_t SetNOMPThreads(int32_t n);
211+
void SetNOMPThreads(int32_t n);
211212
int32_t NStreams() const { return mNStreams; }
212213
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
213214

@@ -234,6 +235,8 @@ class GPUReconstruction
234235
double GetStatKernelTime() { return mStatKernelTime; }
235236
double GetStatWallTime() { return mStatWallTime; }
236237

238+
std::unique_ptr<GPUReconstructionThreading> mThreading;
239+
237240
protected:
238241
void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
239242
void FreeRegisteredMemory(GPUMemoryResource* res);

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "GPUReconstructionCPU.h"
1616
#include "GPUReconstructionIncludes.h"
17+
#include "GPUReconstructionThreading.h"
1718
#include "GPUChain.h"
1819

1920
#include "GPUTPCClusterData.h"
@@ -40,13 +41,6 @@
4041
#include <unistd.h>
4142
#endif
4243

43-
#if defined(WITH_OPENMP) || defined(_OPENMP)
44-
#include <omp.h>
45-
#else
46-
static inline int32_t omp_get_thread_num() { return 0; }
47-
static inline int32_t omp_get_max_threads() { return 1; }
48-
#endif
49-
5044
using namespace o2::gpu;
5145
using namespace o2::gpu::gpu_reconstruction_kernels;
5246

@@ -111,24 +105,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
111105
template <>
112106
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
113107
{
114-
#ifdef WITH_OPENMP
115108
int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
116109
if (nOMPThreads > 1) {
117-
GPUCA_OPENMP(parallel num_threads(nOMPThreads))
118-
{
119-
size_t threadSize = size / omp_get_num_threads();
110+
tbb::parallel_for(0, nOMPThreads, [&](int iThread) {
111+
size_t threadSize = size / nOMPThreads;
120112
if (threadSize % 4096) {
121113
threadSize += 4096 - threadSize % 4096;
122114
}
123-
size_t offset = threadSize * omp_get_thread_num();
115+
size_t offset = threadSize * iThread;
124116
size_t mySize = std::min<size_t>(threadSize, size - offset);
125117
if (mySize) {
126118
memset((char*)ptr + offset, 0, mySize);
127-
}
128-
}
129-
} else
130-
#endif
131-
{
119+
}// clang-format off
120+
}, tbb::static_partitioner());// clang-format on
121+
} else {
132122
memset(ptr, 0, size);
133123
}
134124
return 0;
@@ -353,7 +343,7 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
353343

354344
int32_t GPUReconstructionCPUBackend::getOMPThreadNum()
355345
{
356-
return omp_get_thread_num();
346+
return tbb::this_task_arena::current_thread_index();
357347
}
358348

359349
int32_t GPUReconstructionCPUBackend::getOMPMaxThreads()

GPU/GPUTracking/Base/GPUReconstructionConvert.cxx

Lines changed: 64 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
#include "TPCBase/CRU.h"
4141
#include "DetectorsRaw/RDHUtils.h"
4242

43+
#include <oneapi/tbb.h>
44+
4345
using namespace o2::gpu;
4446
using namespace o2::tpc;
4547
using namespace o2::tpc::constants;
@@ -1306,6 +1308,17 @@ size_t zsEncoderRun<T>::compare(std::vector<zsPage>* buffer, std::vector<o2::tpc
13061308
} // anonymous namespace
13071309
#endif // GPUCA_TPC_GEOMETRY_O2
13081310

1311+
namespace o2::gpu::internal
1312+
{
1313+
struct tmpReductionResult {
1314+
uint32_t totalPages = 0;
1315+
size_t totalSize = 0;
1316+
size_t nErrors = 0;
1317+
size_t digitsInput = 0;
1318+
size_t digitsEncoded = 0;
1319+
};
1320+
} // namespace o2::gpu::internal
1321+
13091322
template <class S>
13101323
void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr<uint64_t[]>* outBuffer, uint32_t* outSizes, o2::raw::RawFileWriter* raw, const o2::InteractionRecord* ir, const GPUParam& param, int32_t version, bool verify, float threshold, bool padding, std::function<void(std::vector<o2::tpc::Digit>&)> digitsFilter)
13111324
{
@@ -1316,64 +1329,65 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr<uint64_
13161329
}
13171330
#ifdef GPUCA_TPC_GEOMETRY_O2
13181331
std::vector<zsPage> buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS];
1319-
uint32_t totalPages = 0;
1320-
size_t totalSize = 0;
1321-
size_t nErrors = 0;
1322-
size_t digitsInput = 0;
1323-
size_t digitsEncoded = 0;
1324-
// clang-format off
1325-
GPUCA_OPENMP(parallel for reduction(+ : totalPages, nErrors, totalSize, digitsInput, digitsEncoded))
1326-
// clang-format on
1327-
for (uint32_t i = 0; i < NSLICES; i++) {
1328-
std::vector<o2::tpc::Digit> tmpBuffer;
1329-
digitsInput += ZSEncoderGetNDigits(in, i);
1330-
tmpBuffer.resize(ZSEncoderGetNDigits(in, i));
1331-
if (threshold > 0.f && !digitsFilter) {
1332-
auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
1333-
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
1334-
} else {
1335-
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin());
1336-
}
1337-
1338-
if (digitsFilter) {
1339-
digitsFilter(tmpBuffer);
1340-
if (threshold > 0.f) {
1341-
std::vector<o2::tpc::Digit> tmpBuffer2 = std::move(tmpBuffer);
1342-
tmpBuffer = std::vector<o2::tpc::Digit>(tmpBuffer2.size());
1343-
auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
1332+
tbb::parallel_reduce(tbb::blocked_range<uint32_t>(0, NSLICES), o2::gpu::internal::tmpReductionResult(), [&](const auto range, const auto red) {
1333+
for (uint32_t i = r.begin(); i < r.end(); ++i) {
1334+
std::vector<o2::tpc::Digit> tmpBuffer;
1335+
red.digitsInput += ZSEncoderGetNDigits(in, i);
1336+
tmpBuffer.resize(ZSEncoderGetNDigits(in, i));
1337+
if (threshold > 0.f && !digitsFilter) {
1338+
auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
13441339
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
1340+
} else {
1341+
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin());
13451342
}
1346-
}
1347-
digitsEncoded += tmpBuffer.size();
1348-
1349-
auto runZS = [&](auto& encoder) {
1350-
encoder.zsVersion = version;
1351-
encoder.init();
1352-
totalPages += encoder.run(buffer[i], tmpBuffer, &totalSize);
1353-
if (verify) {
1354-
nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification
1343+
1344+
if (digitsFilter) {
1345+
digitsFilter(tmpBuffer);
1346+
if (threshold > 0.f) {
1347+
std::vector<o2::tpc::Digit> tmpBuffer2 = std::move(tmpBuffer);
1348+
tmpBuffer = std::vector<o2::tpc::Digit>(tmpBuffer2.size());
1349+
auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; });
1350+
tmpBuffer.resize(std::distance(tmpBuffer.begin(), it));
1351+
}
13551352
}
1356-
};
1353+
red.digitsEncoded += tmpBuffer.size();
1354+
1355+
auto runZS = [&](auto& encoder) {
1356+
encoder.zsVersion = version;
1357+
encoder.init();
1358+
red.totalPages += encoder.run(buffer[i], tmpBuffer, &red.totalSize);
1359+
if (verify) {
1360+
red.nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification
1361+
}
1362+
};
13571363

1358-
if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) {
1359-
zsEncoderRun<zsEncoderRow> enc{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}};
1360-
runZS(enc);
1361-
} else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
1362-
#ifdef GPUCA_O2_LIB
1363-
if (version == ZSVersion::ZSVersionLinkBasedWithMeta) {
1364-
zsEncoderRun<zsEncoderImprovedLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
1364+
if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) {
1365+
zsEncoderRun<zsEncoderRow> enc{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}};
13651366
runZS(enc);
1366-
} else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
1367-
zsEncoderRun<zsEncoderDenseLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
1368-
runZS(enc);
1369-
}
1367+
} else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
1368+
#ifdef GPUCA_O2_LIB
1369+
if (version == ZSVersion::ZSVersionLinkBasedWithMeta) {
1370+
zsEncoderRun<zsEncoderImprovedLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
1371+
runZS(enc);
1372+
} else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) {
1373+
zsEncoderRun<zsEncoderDenseLinkBased> enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = &param, .padding = padding}}}};
1374+
runZS(enc);
1375+
}
13701376
#else
1371-
throw std::runtime_error("Link based ZS encoding not supported in standalone build");
1377+
throw std::runtime_error("Link based ZS encoding not supported in standalone build");
13721378
#endif
1373-
} else {
1374-
throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s);
1379+
} else {
1380+
throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s);
1381+
}
13751382
}
1376-
}
1383+
return red; }, [&](const auto& red1, const auto& red2) {
1384+
auto red = red1;
1385+
red.totalPages += red2.totalPages;
1386+
red.totalSize += red2.totalSize;
1387+
red.nErrors += red2.nErrors;
1388+
red.digitsInput += red2.digitsInput;
1389+
red.digitsEncoded += red2.digitsEncoded;
1390+
return red; });
13771391

13781392
if (outBuffer) {
13791393
outBuffer->reset(new uint64_t[totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]);

GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,6 @@
2222
#include <unistd.h>
2323
#endif
2424

25-
#ifdef WITH_OPENMP
26-
#include <omp.h>
27-
#endif
28-
2925
#include "GPUReconstruction.h"
3026
#include "GPUReconstructionAvailableBackends.h"
3127

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
2+
// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
3+
// All rights not expressly granted are reserved.
4+
//
5+
// This software is distributed under the terms of the GNU General Public
6+
// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
7+
//
8+
// In applying this license CERN does not waive the privileges and immunities
9+
// granted to it by virtue of its status as an Intergovernmental Organization
10+
// or submit itself to any jurisdiction.
11+
12+
/// \file GPUReconstructionThreading.h
13+
/// \author David Rohr
14+
15+
#if !defined(GPURECONSTRUCTIONTHREADING_H)
16+
#define GPURECONSTRUCTIONTHREADING_H
17+
18+
#if !defined(GPUCA_GPUCODE)
19+
#include "GPUReconstruction.h"
20+
21+
#include <memory>
22+
#include <oneapi/tbb.h>
23+
24+
namespace o2::gpu
25+
{
26+
27+
struct GPUReconstructionThreading {
28+
std::unique_ptr<tbb::global_control> control;
29+
std::unique_ptr<tbb::task_arena> allThreads;
30+
std::unique_ptr<tbb::task_arena> outerThreads;
31+
};
32+
33+
} // namespace o2::gpu
34+
35+
#endif
36+
37+
#define GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \
38+
for (vartype varname = get_global_id(0); varname < iEnd; varname += get_global_size(0)) { \
39+
code \
40+
}
41+
42+
#ifdef GPUCA_GPUCODE
43+
#define GPUCA_TBB_KERNEL_LOOP GPUCA_TBB_KERNEL_LOOP_HOST
44+
#else
45+
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, code) \
46+
if (!rec.GetProcessingSettings().ompKernels) { \
47+
rec.mThreading->allThreads->execute([&] { \
48+
tbb::parallel_for(tbb::blocked_range<vartype>(get_global_id(0), iEnd, get_global_size(0)), [&](const tbb::blocked_range<vartype>& _r_internal) { \
49+
for (vartype varname = _r_internal.begin(); varname < _r_internal.end(); varname += get_global_size(0)) { \
50+
code \
51+
} \
52+
}); \
53+
}); \
54+
} else { \
55+
GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \
56+
}
57+
#endif
58+
59+
#endif

GPU/GPUTracking/Base/cuda/CMakeLists.txt

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,12 +136,7 @@ set_target_cuda_arch(${targetName})
136136
#target_link_options(${targetName} PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld")
137137
#set_target_properties(${targetName} PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld)
138138

139-
if(OpenMP_CXX_FOUND)
140-
# Must be private, depending libraries might be compiled by compiler not understanding -fopenmp
141-
target_compile_definitions(${targetName} PRIVATE WITH_OPENMP)
142-
target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX)
143-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp")
144-
endif()
139+
target_link_libraries(${targetName} PRIVATE TBB::tbb)
145140

146141
# Special handling of GPU kernels in case of per-kernel compilation / RDC
147142
if(NOT DEFINED GPUCA_CUDA_COMPILE_MODE)

0 commit comments

Comments
 (0)