Commit be5ae54

GPU: Simplify kernel Call interface, remove intermediate classes and headers where possible
1 parent 497d53f

Showing 99 changed files with 1,061 additions and 804 deletions.
Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,8 @@
 #include "GPUTRDTrackletWord.h"
 #include "GPUTRDInterfaces.h"
 #include "GPUTRDGeometry.h"
+#include "GPUConstantMem.h"
+#include "GPUTRDTrackerKernels.h"
 
 #ifdef ENABLE_UPGRADES
 #include "ITS3Reconstruction/IOUtils.h"

GPU/GPUTracking/Base/GPUProcessor.cxx

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 
 #include "GPUProcessor.h"
 #include "GPUReconstruction.h"
-#include "GPUReconstructionDeviceBase.h"
+#include "GPUSettings.h"
 
 using namespace o2::gpu;

GPU/GPUTracking/Base/GPUReconstruction.cxx

Lines changed: 144 additions & 109 deletions
Large diffs are not rendered by default.
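
The full diff of this file is not rendered, but the GPUReconstruction.h changes below suggest where most of its 144 added lines go: the bodies of formerly inline accessors and of the new non-template RegisterMemoryAllocationHelper presumably move into this translation unit, so the heavy bookkeeping compiles once instead of once per template instantiation. A minimal, self-contained sketch of that pattern follows; the names (Registry, ProcBase, MyProc) are illustrative and not taken from the commit.

#include <cstdint>
#include <string>
#include <vector>

// Hypothetical stand-ins for GPUProcessor and GPUReconstruction.
struct ProcBase {
};

class Registry
{
 public:
  // Thin template wrapper kept in the header: only the member-pointer cast depends on T.
  template <class T>
  int16_t Register(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name)
  {
    return RegisterHelper(proc, static_cast<void* (ProcBase::*)(void*)>(setPtr), type, name);
  }

 private:
  // Non-template helper; its (potentially large) body compiles once in the .cxx.
  int16_t RegisterHelper(ProcBase* proc, void* (ProcBase::*setPtr)(void*), int32_t type, const char* name);

  struct Entry {
    ProcBase* proc;
    void* (ProcBase::*setPtr)(void*);
    int32_t type;
    std::string name;
  };
  std::vector<Entry> mEntries;
};

// "Registry.cxx" part: the bookkeeping logic lives out of line.
int16_t Registry::RegisterHelper(ProcBase* proc, void* (ProcBase::*setPtr)(void*), int32_t type, const char* name)
{
  mEntries.push_back({proc, setPtr, type, name});
  return static_cast<int16_t>(mEntries.size() - 1);
}

// Usage: a processor type derived from ProcBase registers one of its methods.
struct MyProc : ProcBase {
  void* SetPointers(void* mem) { return mem; }
};

int main()
{
  Registry r;
  MyProc p;
  return r.Register(&p, &MyProc::SetPointers, 0, "myProc") == 0 ? 0 : 1;
}

The type-dependent part of the template, the cast of the member-function pointer to the base class, stays in the header; everything else can live behind a single out-of-line definition.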

GPU/GPUTracking/Base/GPUReconstruction.h

Lines changed: 32 additions & 75 deletions
@@ -25,13 +25,13 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include "GPUTRDDef.h"
-#include "GPUParam.h"
-#include "GPUSettings.h"
-#include "GPUOutputControl.h"
+#include "GPUDataTypes.h"
 #include "GPUMemoryResource.h"
-#include "GPUConstantMem.h"
-#include "GPULogging.h"
+#include "GPUOutputControl.h"
+
+/*#include "GPUParam.h"
+#include "GPUSettings.h"
+#include "GPULogging.h"*/
 
 namespace o2::its
 {
@@ -49,6 +49,13 @@ struct GPUReconstructionThreading;
 class GPUROOTDumpCore;
 class ThrustVolatileAllocator;
 struct GPUDefParameters;
+class GPUMemoryResource;
+struct GPUSettingsDeviceBackend;
+struct GPUSettingsGRP;
+struct GPUSettingsProcessing;
+struct GPUSettingsRec;
+struct GPUSettingsRecDynamic;
+struct GPUMemoryReuse;
 
 namespace gpu_reconstruction_kernels
 {
@@ -186,18 +193,20 @@ class GPUReconstruction
   bool slavesExist() { return mSlaves.size() || mMaster; }
 
   // Getters / setters for parameters
-  DeviceType GetDeviceType() const { return (DeviceType)mDeviceBackendSettings.deviceType; }
+  DeviceType GetDeviceType() const;
   bool IsGPU() const { return GetDeviceType() != DeviceType::INVALID_DEVICE && GetDeviceType() != DeviceType::CPU; }
-  const GPUParam& GetParam() const { return mHostConstantMem->param; }
+  const GPUParam& GetParam() const;
   const GPUConstantMem& GetConstantMem() const { return *mHostConstantMem; }
-  const GPUSettingsGRP& GetGRPSettings() const { return mGRPSettings; }
-  const GPUSettingsDeviceBackend& GetDeviceBackendSettings() { return mDeviceBackendSettings; }
-  const GPUSettingsProcessing& GetProcessingSettings() const { return mProcessingSettings; }
+  const GPUTrackingInOutPointers GetIOPtrs() const;
+  const GPUSettingsGRP& GetGRPSettings() const { return *mGRPSettings; }
+  const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return *mDeviceBackendSettings; }
+  const GPUSettingsProcessing& GetProcessingSettings() const { return *mProcessingSettings; }
+  const GPUCalibObjectsConst& GetCalib() const;
   bool IsInitialized() const { return mInitialized; }
   void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow = nullptr);
   void SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec = nullptr, const GPUSettingsProcessing* proc = nullptr, const GPURecoStepConfiguration* workflow = nullptr);
-  void SetResetTimers(bool reset) { mProcessingSettings.resetTimers = reset; } // May update also after Init()
-  void SetDebugLevelTmp(int32_t level) { mProcessingSettings.debugLevel = level; } // Temporarily, before calling SetSettings()
+  void SetResetTimers(bool reset); // May update also after Init()
+  void SetDebugLevelTmp(int32_t level); // Temporarily, before calling SetSettings()
   void UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p = nullptr, const GPUSettingsRecDynamic* d = nullptr);
   void UpdateDynamicSettings(const GPUSettingsRecDynamic* d);
   void SetOutputControl(const GPUOutputControl& v) { mOutputControl = v; }
@@ -272,6 +281,7 @@
   size_t ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr<T[]>* mem, InOutPointerType type, T** nonConstPtrs = nullptr);
   template <class T>
   T* AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u);
+  int16_t RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re);
 
   // Private helper functions to dump / load flat objects
   template <class T>
@@ -292,17 +302,17 @@
   // Pointers to tracker classes
   GPUConstantMem* processors() { return mHostConstantMem.get(); }
   const GPUConstantMem* processors() const { return mHostConstantMem.get(); }
-  GPUParam& param() { return mHostConstantMem->param; }
+  GPUParam& param();
   std::unique_ptr<GPUConstantMem> mHostConstantMem;
   GPUConstantMem* mDeviceConstantMem = nullptr;
 
   // Settings
-  GPUSettingsGRP mGRPSettings; // Global Run Parameters
-  GPUSettingsDeviceBackend mDeviceBackendSettings; // Processing Parameters (at constructor level)
-  GPUSettingsProcessing mProcessingSettings; // Processing Parameters (at init level)
-  GPUOutputControl mOutputControl; // Controls the output of the individual components
-  GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps
-  std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers how much memory will be needed
+  std::unique_ptr<GPUSettingsGRP> mGRPSettings; // Global Run Parameters
+  std::unique_ptr<GPUSettingsDeviceBackend> mDeviceBackendSettings; // Processing Parameters (at constructor level)
+  std::unique_ptr<GPUSettingsProcessing> mProcessingSettings; // Processing Parameters (at init level)
+  GPUOutputControl mOutputControl; // Controls the output of the individual components
+  GPUOutputControl mInputControl; // Prefefined input memory location for reading standalone dumps
+  std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers how much memory will be needed
 
   GPURecoStepConfiguration mRecoSteps;
 
@@ -392,35 +402,6 @@
   static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
 };
 
-template <class T>
-inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u)
-{
-  if (n == 0) {
-    u.reset(nullptr);
-    return nullptr;
-  }
-  T* retVal;
-  if (mInputControl.useExternal()) {
-    u.reset(nullptr);
-    mInputControl.checkCurrent();
-    GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n);
-    if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) {
-      throw std::bad_alloc();
-    }
-  } else {
-    u.reset(new T[n]);
-    retVal = u.get();
-    if (mProcessingSettings.registerStandaloneInputMemory) {
-      if (registerMemoryForGPU(u.get(), n * sizeof(T))) {
-        GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T)));
-        throw std::bad_alloc();
-      }
-    }
-  }
-  ptr = retVal;
-  return retVal;
-}
-
 template <class T, typename... Args>
 inline T* GPUReconstruction::AddChain(Args... args)
 {
@@ -431,31 +412,7 @@ inline T* GPUReconstruction::AddChain(Args... args)
 template <class T>
 inline int16_t GPUReconstruction::RegisterMemoryAllocation(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re)
 {
-  if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) {
-    if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !mProcessingSettings.keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
-      type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU);
-    } else {
-      type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU;
-    }
-  }
-  if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) {
-    type &= ~GPUMemoryResource::MEMORY_GPU;
-  }
-  mMemoryResources.emplace_back(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), (GPUMemoryResource::MemoryType)type, name);
-  if (mMemoryResources.size() >= 32768) {
-    throw std::bad_alloc();
-  }
-  uint16_t retVal = mMemoryResources.size() - 1;
-  if (re.type != GPUMemoryReuse::NONE && !mProcessingSettings.disableMemoryReuse) {
-    const auto& it = mMemoryReuse1to1.find(re.id);
-    if (it == mMemoryReuse1to1.end()) {
-      mMemoryReuse1to1[re.id] = {proc, retVal};
-    } else {
-      mMemoryResources[retVal].mReuse = it->second.res[0];
-      it->second.res.emplace_back(retVal);
-    }
-  }
-  return retVal;
+  return RegisterMemoryAllocationHelper(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), type, name, re);
 }
 
 template <class T>
@@ -471,7 +428,7 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
 {
   static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class");
   if (allocate) {
-    proc->SetMaxData(mHostConstantMem->ioPtrs);
+    proc->SetMaxData(GetIOPtrs());
   }
   if (proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && proc->mLinkedProcessor) {
     std::memcpy((void*)proc->mLinkedProcessor, (const void*)proc, sizeof(*proc));
GPU/GPUTracking/Base/GPUReconstructionCPU.cxx

Lines changed: 19 additions & 20 deletions
@@ -30,19 +30,18 @@
 #include "GPUTRDTrackletLabels.h"
 #include "GPUMemoryResource.h"
 #include "GPUConstantMem.h"
+#include "GPULogging.h"
 #include "GPUMemorySizeScalers.h"
+#include "GPUReconstructionProcessingKernels.inc"
+
 #include <atomic>
 #include <ctime>
 
-#define GPUCA_LOGGING_PRINTF
-#include "GPULogging.h"
-
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 
 using namespace o2::gpu;
-using namespace o2::gpu::gpu_reconstruction_kernels;
 
 constexpr GPUReconstructionCPU::krnlRunRange GPUReconstructionCPU::krnlRunRangeNone;
 constexpr GPUReconstructionCPU::krnlEvent GPUReconstructionCPU::krnlEventNone;
@@ -67,7 +66,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
   }
   int32_t nThreads = getNKernelHostThreads(false);
   if (nThreads > 1) {
-    if (mProcessingSettings.debugLevel >= 5) {
+    if (GetProcessingSettings().debugLevel >= 5) {
       printf("Running %d Threads\n", mThreading->activeThreads->max_concurrency());
     }
     tbb::this_task_arena::isolate([&] {
@@ -121,14 +120,14 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
 }
 
 template <class S, int32_t I>
-gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
+GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
 {
   if (gpu == -1) {
     gpu = IsGPU();
   }
   const auto num = GetKernelNum<S, I>();
   const auto* p = gpu ? mParDevice : mParCPU;
-  gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
+  GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
   if (ret.nThreads == 0) {
     ret.nThreads = gpu ? mThreadCount : 1u;
   }
@@ -140,7 +139,7 @@ gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProper
 
 #define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
   template void GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
-  template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
+  template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
 #include "GPUReconstructionKernelList.h"
 #undef GPUCA_KRNL
 
@@ -169,7 +168,7 @@ size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, i
     if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) {
       continue;
    }
-    if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
+    if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
      continue;
    }
    if (toGPU) {
@@ -197,7 +196,7 @@ int32_t GPUReconstructionCPU::InitDevice()
 {
   mActiveHostKernelThreads = mMaxHostThreads;
   mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
-  if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
+  if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
     if (mMaster == nullptr) {
       if (mDeviceMemorySize > mHostMemorySize) {
         mHostMemorySize = mDeviceMemorySize;
@@ -207,7 +206,7 @@
     mHostMemoryPermanent = mHostMemoryBase;
     ClearAllocatedMemory();
   }
-  if (mProcessingSettings.inKernelParallel) {
+  if (GetProcessingSettings().inKernelParallel) {
     mBlockCount = mMaxHostThreads;
   }
   mProcShadow.mProcessorsProc = processors();
@@ -216,7 +215,7 @@
 
 int32_t GPUReconstructionCPU::ExitDevice()
 {
-  if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
+  if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
     if (mMaster == nullptr) {
       operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
     }
@@ -232,13 +231,13 @@ int32_t GPUReconstructionCPU::RunChains()
   mStatNEvents++;
   mNEventsProcessed++;
 
-  if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
+  if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
     printf("Allocated memory when starting processing %34s", "");
     PrintMemoryOverview();
   }
   mTimerTotal.Start();
   const std::clock_t cpuTimerStart = std::clock();
-  if (mProcessingSettings.doublePipeline) {
+  if (GetProcessingSettings().doublePipeline) {
     int32_t retVal = EnqueuePipeline();
     if (retVal) {
       return retVal;
@@ -259,7 +258,7 @@
   }
   mTimerTotal.Stop();
   mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
-  if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
+  if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
     printf("Allocated memory when ending processing %36s", "");
     PrintMemoryOverview();
   }
@@ -281,7 +280,7 @@
     for (int32_t j = 0; j < mTimers[i]->num; j++) {
       HighResTimer& timer = mTimers[i]->timer[j];
       time += timer.GetElapsedTime();
-      if (mProcessingSettings.resetTimers) {
+      if (GetProcessingSettings().resetTimers) {
        timer.Reset();
      }
    }
@@ -297,7 +296,7 @@
      snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
    }
    printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
-    if (mProcessingSettings.resetTimers) {
+    if (GetProcessingSettings().resetTimers) {
      mTimers[i]->count = 0;
      mTimers[i]->memSize = 0;
    }
@@ -317,7 +316,7 @@
      printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
             mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
    }
-    if (mProcessingSettings.resetTimers) {
+    if (GetProcessingSettings().resetTimers) {
      mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
      mTimersRecoSteps[i].timerToGPU.Reset();
      mTimersRecoSteps[i].timerToHost.Reset();
@@ -340,7 +339,7 @@
   } else if (GetProcessingSettings().debugLevel >= 0) {
     GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
   }
-  if (mProcessingSettings.resetTimers) {
+  if (GetProcessingSettings().resetTimers) {
     mStatNEvents = 0;
     mStatCPUTime = 0;
     mTimerTotal.Reset();
@@ -366,7 +365,7 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons
   if (!((size_t)&param().occupancyTotal - (size_t)&param().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) {
     throw std::runtime_error("occupancy data not consecutive in GPUParam");
   }
-  const auto threadContext = GetThreadContext();
+  const auto holdContext = GetThreadContext();
   size_t tmp[2] = {(size_t)mapGPU, 0};
   memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal));
   WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream);
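
The GPUCA_KRNL block in this file is an X-macro: GPUReconstructionKernelList.h evidently invokes GPUCA_KRNL once per kernel, so redefining the macro right before including the list emits one explicit instantiation of runKernelBackend and one of getKernelProperties per kernel, and the #undef afterwards keeps the redefinition local to this spot. A self-contained sketch of the technique, with a made-up kernel list defined inline instead of in a separate header:

#include <cstdio>

// Illustrative kernel list; in the real code the list lives in
// GPUReconstructionKernelList.h and is included with GPUCA_KRNL redefined at each use site.
#define KERNEL_LIST(X) \
  X(TrackFit)          \
  X(ClusterFind)

// One tag type per kernel, generated from the same list.
#define DECLARE_KERNEL(k)                     \
  struct k {                                  \
    static const char* name() { return #k; }  \
  };
KERNEL_LIST(DECLARE_KERNEL)
#undef DECLARE_KERNEL

template <class K>
void runKernel()
{
  std::printf("running %s\n", K::name());
}

// Explicit instantiations generated from the list, mirroring the
// "#define GPUCA_KRNL ... #include list ... #undef GPUCA_KRNL" block in the diff above.
#define INSTANTIATE_KERNEL(k) template void runKernel<k>();
KERNEL_LIST(INSTANTIATE_KERNEL)
#undef INSTANTIATE_KERNEL

int main()
{
  runKernel<TrackFit>();
  runKernel<ClusterFind>();
  return 0;
}

Adding a kernel to the list automatically adds its tag type and its explicit instantiation, which keeps the per-kernel boilerplate in the .cxx down to the single macro block shown in the diff.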
