Skip to content

Commit f041ea2

Browse files
committed
GPU: Refactor some code, and keep only CPU-kernel related code in GPUReconstructionCPU
1 parent e7c4cf2 commit f041ea2

15 files changed

+779
-667
lines changed

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "GPUReconstruction.h"
2727
#include "GPUReconstructionIncludes.h"
2828
#include "GPUReconstructionThreading.h"
29+
#include "GPUReconstructionIO.h"
2930
#include "GPUROOTDumpCore.h"
3031
#include "GPUConfigDump.h"
3132
#include "GPUChainTracking.h"
@@ -118,15 +119,6 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr<o2::its::TrackerTraits>* tr
118119
}
119120
}
120121

121-
void GPUReconstruction::SetNActiveThreads(int32_t n)
122-
{
123-
mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
124-
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
125-
if (mProcessingSettings.debugLevel >= 3) {
126-
GPUInfo("Set number of active parallel kernels threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
127-
}
128-
}
129-
130122
int32_t GPUReconstruction::getHostThreadIndex()
131123
{
132124
return std::max<int32_t>(0, tbb::this_task_arena::current_thread_index());
@@ -327,12 +319,12 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
327319
} else {
328320
mProcessingSettings.autoAdjustHostThreads = false;
329321
}
330-
mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
322+
mMaxHostThreads = mProcessingSettings.nHostThreads;
331323
if (mMaster == nullptr) {
332324
mThreading = std::make_shared<GPUReconstructionThreading>();
333325
mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
334326
mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
335-
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
327+
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
336328
} else {
337329
mThreading = mMaster->mThreading;
338330
}
@@ -1181,8 +1173,3 @@ void GPUReconstruction::SetInputControl(void* ptr, size_t size)
11811173
{
11821174
mInputControl.set(ptr, size);
11831175
}
1184-
1185-
GPUReconstruction::GPUThreadContext::GPUThreadContext() = default;
1186-
GPUReconstruction::GPUThreadContext::~GPUThreadContext() = default;
1187-
1188-
std::unique_ptr<GPUReconstruction::GPUThreadContext> GPUReconstruction::GetThreadContext() { return std::unique_ptr<GPUReconstruction::GPUThreadContext>(new GPUThreadContext); }

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 8 additions & 173 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,11 @@ class GPUROOTDumpCore;
5757
namespace gpu_reconstruction_kernels
5858
{
5959
struct deviceEvent;
60+
class threadContext;
6061
}
6162

6263
class GPUReconstruction
6364
{
64-
friend class GPUChain;
65-
6665
protected:
6766
class LibraryLoader; // These must be the first members to ensure correct destructor order!
6867
std::shared_ptr<LibraryLoader> mMyLib = nullptr;
@@ -207,8 +206,6 @@ class GPUReconstruction
207206
void SetOutputControl(void* ptr, size_t size);
208207
void SetInputControl(void* ptr, size_t size);
209208
GPUOutputControl& OutputControl() { return mOutputControl; }
210-
int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
211-
void SetNActiveThreads(int32_t n);
212209
int32_t NStreams() const { return mNStreams; }
213210
const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
214211

@@ -235,8 +232,10 @@ class GPUReconstruction
235232
double GetStatKernelTime() { return mStatKernelTime; }
236233
double GetStatWallTime() { return mStatWallTime; }
237234

235+
// Threading
238236
std::shared_ptr<GPUReconstructionThreading> mThreading;
239237
static int32_t getHostThreadIndex();
238+
int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
240239

241240
protected:
242241
void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
@@ -258,13 +257,7 @@ class GPUReconstruction
258257
virtual int32_t unregisterMemoryForGPU_internal(const void* ptr) = 0;
259258

260259
// Management for GPU thread contexts
261-
class GPUThreadContext
262-
{
263-
public:
264-
GPUThreadContext();
265-
virtual ~GPUThreadContext();
266-
};
267-
virtual std::unique_ptr<GPUThreadContext> GetThreadContext();
260+
virtual std::unique_ptr<gpu_reconstruction_kernels::threadContext> GetThreadContext() = 0;
268261

269262
// Private helpers for library loading
270263
static std::shared_ptr<LibraryLoader>* GetLibraryInstance(DeviceType type, bool verbose);
@@ -347,11 +340,10 @@ class GPUReconstruction
347340
std::shared_ptr<GPUROOTDumpCore> mROOTDump;
348341
std::vector<std::array<uint32_t, 4>>* mOutputErrorCodes = nullptr;
349342

350-
int32_t mMaxBackendThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
351-
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
352-
int32_t mNStreams = 1; // Number of parallel GPU streams
353-
int32_t mMaxHostThreads = 0; // Maximum number of OMP threads
354-
int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
343+
int32_t mMaxBackendThreads = 0; // Maximum number of threads that may be running, on CPU or GPU
344+
int32_t mGPUStuck = 0; // Marks that the GPU is stuck, skip future events
345+
int32_t mNStreams = 1; // Number of parallel GPU streams
346+
int32_t mMaxHostThreads = 0; // Maximum number of OMP threads
355347

356348
// Management for GPUProcessors
357349
struct ProcessorData {
@@ -491,163 +483,6 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
491483
}
492484
}
493485

494-
template <class T, class S>
495-
inline uint32_t GPUReconstruction::DumpData(FILE* fp, const T* const* entries, const S* num, InOutPointerType type)
496-
{
497-
int32_t count = getNIOTypeMultiplicity(type);
498-
uint32_t numTotal = 0;
499-
for (int32_t i = 0; i < count; i++) {
500-
numTotal += num[i];
501-
}
502-
if (numTotal == 0) {
503-
return 0;
504-
}
505-
fwrite(&type, sizeof(type), 1, fp);
506-
for (int32_t i = 0; i < count; i++) {
507-
fwrite(&num[i], sizeof(num[i]), 1, fp);
508-
if (num[i]) {
509-
fwrite(entries[i], sizeof(*entries[i]), num[i], fp);
510-
}
511-
}
512-
if (mProcessingSettings.debugLevel >= 2) {
513-
GPUInfo("Dumped %ld %s", (int64_t)numTotal, IOTYPENAMES[type]);
514-
}
515-
return numTotal;
516-
}
517-
518-
template <class T, class S>
519-
inline size_t GPUReconstruction::ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr<T[]>* mem, InOutPointerType type, T** nonConstPtrs)
520-
{
521-
if (feof(fp)) {
522-
return 0;
523-
}
524-
InOutPointerType inType;
525-
size_t r, pos = ftell(fp);
526-
r = fread(&inType, sizeof(inType), 1, fp);
527-
if (r != 1 || inType != type) {
528-
fseek(fp, pos, SEEK_SET);
529-
return 0;
530-
}
531-
532-
int32_t count = getNIOTypeMultiplicity(type);
533-
size_t numTotal = 0;
534-
for (int32_t i = 0; i < count; i++) {
535-
r = fread(&num[i], sizeof(num[i]), 1, fp);
536-
T* m = AllocateIOMemoryHelper(num[i], entries[i], mem[i]);
537-
if (nonConstPtrs) {
538-
nonConstPtrs[i] = m;
539-
}
540-
if (num[i]) {
541-
r = fread(m, sizeof(*entries[i]), num[i], fp);
542-
}
543-
numTotal += num[i];
544-
}
545-
(void)r;
546-
if (mProcessingSettings.debugLevel >= 2) {
547-
GPUInfo("Read %ld %s", (int64_t)numTotal, IOTYPENAMES[type]);
548-
}
549-
return numTotal;
550-
}
551-
552-
template <class T>
553-
inline void GPUReconstruction::DumpFlatObjectToFile(const T* obj, const char* file)
554-
{
555-
FILE* fp = fopen(file, "w+b");
556-
if (fp == nullptr) {
557-
return;
558-
}
559-
size_t size[2] = {sizeof(*obj), obj->getFlatBufferSize()};
560-
fwrite(size, sizeof(size[0]), 2, fp);
561-
fwrite(obj, 1, size[0], fp);
562-
fwrite(obj->getFlatBufferPtr(), 1, size[1], fp);
563-
fclose(fp);
564-
}
565-
566-
template <class T>
567-
inline std::unique_ptr<T> GPUReconstruction::ReadFlatObjectFromFile(const char* file)
568-
{
569-
FILE* fp = fopen(file, "rb");
570-
if (fp == nullptr) {
571-
return nullptr;
572-
}
573-
size_t size[2] = {0}, r;
574-
r = fread(size, sizeof(size[0]), 2, fp);
575-
if (r == 0 || size[0] != sizeof(T)) {
576-
fclose(fp);
577-
GPUError("ERROR reading %s, invalid size: %ld (%ld expected)", file, (int64_t)size[0], (int64_t)sizeof(T));
578-
throw std::runtime_error("invalid size");
579-
}
580-
std::unique_ptr<T> retVal(new T);
581-
retVal->destroy();
582-
char* buf = new char[size[1]]; // Not deleted as ownership is transferred to FlatObject
583-
r = fread((void*)retVal.get(), 1, size[0], fp);
584-
r = fread(buf, 1, size[1], fp);
585-
fclose(fp);
586-
if (mProcessingSettings.debugLevel >= 2) {
587-
GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
588-
}
589-
retVal->clearInternalBufferPtr();
590-
retVal->setActualBufferAddress(buf);
591-
retVal->adoptInternalBuffer(buf);
592-
return retVal;
593-
}
594-
595-
template <class T>
596-
inline void GPUReconstruction::DumpStructToFile(const T* obj, const char* file)
597-
{
598-
FILE* fp = fopen(file, "w+b");
599-
if (fp == nullptr) {
600-
return;
601-
}
602-
size_t size = sizeof(*obj);
603-
fwrite(&size, sizeof(size), 1, fp);
604-
fwrite(obj, 1, size, fp);
605-
fclose(fp);
606-
}
607-
608-
template <class T>
609-
inline std::unique_ptr<T> GPUReconstruction::ReadStructFromFile(const char* file)
610-
{
611-
FILE* fp = fopen(file, "rb");
612-
if (fp == nullptr) {
613-
return nullptr;
614-
}
615-
size_t size, r;
616-
r = fread(&size, sizeof(size), 1, fp);
617-
if (r == 0 || size != sizeof(T)) {
618-
fclose(fp);
619-
GPUError("ERROR reading %s, invalid size: %ld (%ld expected)", file, (int64_t)size, (int64_t)sizeof(T));
620-
throw std::runtime_error("invalid size");
621-
}
622-
std::unique_ptr<T> newObj(new T);
623-
r = fread(newObj.get(), 1, size, fp);
624-
fclose(fp);
625-
if (mProcessingSettings.debugLevel >= 2) {
626-
GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
627-
}
628-
return newObj;
629-
}
630-
631-
template <class T>
632-
inline int32_t GPUReconstruction::ReadStructFromFile(const char* file, T* obj)
633-
{
634-
FILE* fp = fopen(file, "rb");
635-
if (fp == nullptr) {
636-
return 1;
637-
}
638-
size_t size, r;
639-
r = fread(&size, sizeof(size), 1, fp);
640-
if (r == 0) {
641-
fclose(fp);
642-
return 1;
643-
}
644-
r = fread(obj, 1, size, fp);
645-
fclose(fp);
646-
if (mProcessingSettings.debugLevel >= 2) {
647-
GPUInfo("Read %ld bytes from %s", (int64_t)r, file);
648-
}
649-
return 0;
650-
}
651486
} // namespace gpu
652487
} // namespace o2
653488

GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 2 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -54,23 +54,6 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
5454
Exit(); // Needs to be identical to GPU backend behavior in order to avoid calling abstract methods later in the destructor
5555
}
5656

57-
int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores)
58-
{
59-
int32_t nThreads = 0;
60-
if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) {
61-
if (splitCores) {
62-
nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop;
63-
nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop;
64-
} else {
65-
nThreads = mMaxHostThreads;
66-
}
67-
nThreads = std::max(1, nThreads);
68-
} else {
69-
nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1;
70-
}
71-
return nThreads;
72-
}
73-
7457
template <class T, int32_t I, typename... Args>
7558
inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
7659
{
@@ -198,6 +181,8 @@ int32_t GPUReconstructionCPU::GetThread()
198181

199182
int32_t GPUReconstructionCPU::InitDevice()
200183
{
184+
mActiveHostKernelThreads = mMaxHostThreads;
185+
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
201186
if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
202187
if (mMaster == nullptr) {
203188
if (mDeviceMemorySize > mHostMemorySize) {
@@ -339,60 +324,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes()
339324
}
340325
}
341326

342-
static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation
343-
344-
GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step)
345-
{
346-
while (timerFlag.test_and_set()) {
347-
}
348-
if (mTimers.size() <= id) {
349-
mTimers.resize(id + 1);
350-
}
351-
if (mTimers[id] == nullptr) {
352-
if (J >= 0) {
353-
name += std::to_string(J);
354-
}
355-
mTimers[id].reset(new timerMeta{std::unique_ptr<HighResTimer[]>{new HighResTimer[num]}, name, num, type, 1u, step, (size_t)0});
356-
} else {
357-
mTimers[id]->count++;
358-
}
359-
timerMeta* retVal = mTimers[id].get();
360-
timerFlag.clear();
361-
return retVal;
362-
}
363-
364-
GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::getTimerById(uint32_t id, bool increment)
365-
{
366-
timerMeta* retVal = nullptr;
367-
while (timerFlag.test_and_set()) {
368-
}
369-
if (mTimers.size() > id && mTimers[id]) {
370-
retVal = mTimers[id].get();
371-
retVal->count += increment;
372-
}
373-
timerFlag.clear();
374-
return retVal;
375-
}
376-
377-
uint32_t GPUReconstructionCPU::getNextTimerId()
378-
{
379-
static std::atomic<uint32_t> id{0};
380-
return id.fetch_add(1);
381-
}
382-
383-
uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max)
384-
{
385-
if (condition && mProcessingSettings.inKernelParallel != 1) {
386-
mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min<uint32_t>(max, mMaxHostThreads) : mMaxHostThreads;
387-
} else {
388-
mNActiveThreadsOuterLoop = 1;
389-
}
390-
if (mProcessingSettings.debugLevel >= 5) {
391-
printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop);
392-
}
393-
return mNActiveThreadsOuterLoop;
394-
}
395-
396327
void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream)
397328
{
398329
param().occupancyMap = mapHost;

0 commit comments

Comments
 (0)