2 changes: 2 additions & 0 deletions Detectors/TRD/workflow/src/TRDGlobalTrackingSpec.cxx
@@ -51,6 +51,8 @@
#include "GPUTRDTrackletWord.h"
#include "GPUTRDInterfaces.h"
#include "GPUTRDGeometry.h"
#include "GPUConstantMem.h"
#include "GPUTRDTrackerKernels.h"

#ifdef ENABLE_UPGRADES
#include "ITS3Reconstruction/IOUtils.h"
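Note on this hunk: the two new includes compensate for GPUReconstruction.h no longer pulling in GPUConstantMem.h transitively (see that header's diff below). A minimal sketch of the include-what-you-use pattern, with illustrative names:

```cpp
// widget.h: forward-declare instead of including the heavy header
struct Heavy;                    // incomplete type is enough for pointers/references
struct Widget {
  Heavy* heavy = nullptr;
};

// widget_user.cxx: this TU dereferences Heavy, so it includes the definition itself
// #include "widget.h"
// #include "heavy.h"            // direct include replaces the lost transitive one
```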
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/GPUProcessor.cxx
@@ -14,7 +14,7 @@

#include "GPUProcessor.h"
#include "GPUReconstruction.h"
#include "GPUReconstructionDeviceBase.h"
#include "GPUSettings.h"

using namespace o2::gpu;

257 changes: 146 additions & 111 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx

Large diffs are not rendered by default.

107 changes: 32 additions & 75 deletions GPU/GPUTracking/Base/GPUReconstruction.h
@@ -25,13 +25,13 @@
#include <unordered_map>
#include <unordered_set>

#include "GPUTRDDef.h"
#include "GPUParam.h"
#include "GPUSettings.h"
#include "GPUOutputControl.h"
#include "GPUDataTypes.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPULogging.h"
#include "GPUOutputControl.h"

/*#include "GPUParam.h"
#include "GPUSettings.h"
#include "GPULogging.h"*/

namespace o2::its
{
@@ -49,6 +49,13 @@ struct GPUReconstructionThreading;
class GPUROOTDumpCore;
class ThrustVolatileAllocator;
struct GPUDefParameters;
class GPUMemoryResource;
struct GPUSettingsDeviceBackend;
struct GPUSettingsGRP;
struct GPUSettingsProcessing;
struct GPUSettingsRec;
struct GPUSettingsRecDynamic;
struct GPUMemoryReuse;

namespace gpu_reconstruction_kernels
{
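Review note: the forward declarations added above pair with the settings members turning into std::unique_ptr further down in this diff. A hedged sketch of the idiom; it assumes constructor, destructor, and getters move out of the header, since destroying a unique_ptr&lt;T&gt; needs the complete T:

```cpp
#include <memory>

struct Settings;                        // forward declaration only

class Reco {
 public:
  Reco();                               // defined in the .cxx, where Settings is complete
  ~Reco();                              // out-of-line so ~unique_ptr<Settings> compiles
  const Settings& GetSettings() const;  // dereference also happens out of line
 private:
  std::unique_ptr<Settings> mSettings;
};

// reco.cxx
// #include "settings.h"
// Reco::Reco() : mSettings(std::make_unique<Settings>()) {}
// Reco::~Reco() = default;
// const Settings& Reco::GetSettings() const { return *mSettings; }
```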
@@ -186,18 +193,20 @@ class GPUReconstruction
bool slavesExist() { return mSlaves.size() || mMaster; }

// Getters / setters for parameters
DeviceType GetDeviceType() const { return (DeviceType)mDeviceBackendSettings.deviceType; }
DeviceType GetDeviceType() const;
bool IsGPU() const { return GetDeviceType() != DeviceType::INVALID_DEVICE && GetDeviceType() != DeviceType::CPU; }
const GPUParam& GetParam() const { return mHostConstantMem->param; }
const GPUParam& GetParam() const;
const GPUConstantMem& GetConstantMem() const { return *mHostConstantMem; }
const GPUSettingsGRP& GetGRPSettings() const { return mGRPSettings; }
const GPUSettingsDeviceBackend& GetDeviceBackendSettings() { return mDeviceBackendSettings; }
const GPUSettingsProcessing& GetProcessingSettings() const { return mProcessingSettings; }
const GPUTrackingInOutPointers GetIOPtrs() const;
const GPUSettingsGRP& GetGRPSettings() const { return *mGRPSettings; }
const GPUSettingsDeviceBackend& GetDeviceBackendSettings() const { return *mDeviceBackendSettings; }
const GPUSettingsProcessing& GetProcessingSettings() const { return *mProcessingSettings; }
const GPUCalibObjectsConst& GetCalib() const;
bool IsInitialized() const { return mInitialized; }
void SetSettings(float solenoidBzNominalGPU, const GPURecoStepConfiguration* workflow = nullptr);
void SetSettings(const GPUSettingsGRP* grp, const GPUSettingsRec* rec = nullptr, const GPUSettingsProcessing* proc = nullptr, const GPURecoStepConfiguration* workflow = nullptr);
void SetResetTimers(bool reset) { mProcessingSettings.resetTimers = reset; } // May also be updated after Init()
void SetDebugLevelTmp(int32_t level) { mProcessingSettings.debugLevel = level; } // Temporarily, before calling SetSettings()
void SetResetTimers(bool reset); // May also be updated after Init()
void SetDebugLevelTmp(int32_t level); // Temporarily, before calling SetSettings()
void UpdateSettings(const GPUSettingsGRP* g, const GPUSettingsProcessing* p = nullptr, const GPUSettingsRecDynamic* d = nullptr);
void UpdateDynamicSettings(const GPUSettingsRecDynamic* d);
void SetOutputControl(const GPUOutputControl& v) { mOutputControl = v; }
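A hypothetical call site for the settings API above (the reco pointer and the concrete values are assumptions for illustration, not taken from this diff):

```cpp
// 'reco' is assumed to be a GPUReconstruction* obtained elsewhere
o2::gpu::GPUSettingsGRP grp;              // run-level conditions
o2::gpu::GPUSettingsProcessing proc;      // processing-level options
proc.debugLevel = 3;                      // illustrative value
reco->SetDebugLevelTmp(3);                // effective until SetSettings() runs
reco->SetSettings(&grp, nullptr, &proc);  // rec settings defaulted, workflow omitted
reco->SetResetTimers(true);               // may also be updated after Init()
```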
@@ -272,6 +281,7 @@ class GPUReconstruction
size_t ReadData(FILE* fp, const T** entries, S* num, std::unique_ptr<T[]>* mem, InOutPointerType type, T** nonConstPtrs = nullptr);
template <class T>
T* AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u);
int16_t RegisterMemoryAllocationHelper(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re);

// Private helper functions to dump / load flat objects
template <class T>
@@ -292,17 +302,17 @@
// Pointers to tracker classes
GPUConstantMem* processors() { return mHostConstantMem.get(); }
const GPUConstantMem* processors() const { return mHostConstantMem.get(); }
GPUParam& param() { return mHostConstantMem->param; }
GPUParam& param();
std::unique_ptr<GPUConstantMem> mHostConstantMem;
GPUConstantMem* mDeviceConstantMem = nullptr;

// Settings
GPUSettingsGRP mGRPSettings; // Global Run Parameters
GPUSettingsDeviceBackend mDeviceBackendSettings; // Processing Parameters (at constructor level)
GPUSettingsProcessing mProcessingSettings; // Processing Parameters (at init level)
GPUOutputControl mOutputControl; // Controls the output of the individual components
GPUOutputControl mInputControl; // Predefined input memory location for reading standalone dumps
std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers for how much memory will be needed
std::unique_ptr<GPUSettingsGRP> mGRPSettings; // Global Run Parameters
std::unique_ptr<GPUSettingsDeviceBackend> mDeviceBackendSettings; // Processing Parameters (at constructor level)
std::unique_ptr<GPUSettingsProcessing> mProcessingSettings; // Processing Parameters (at init level)
GPUOutputControl mOutputControl; // Controls the output of the individual components
GPUOutputControl mInputControl; // Predefined input memory location for reading standalone dumps
std::unique_ptr<GPUMemorySizeScalers> mMemoryScalers; // Scalers for how much memory will be needed

GPURecoStepConfiguration mRecoSteps;

@@ -392,35 +402,6 @@ class GPUReconstruction
static GPUReconstruction* GPUReconstruction_Create_CPU(const GPUSettingsDeviceBackend& cfg);
};

template <class T>
inline T* GPUReconstruction::AllocateIOMemoryHelper(size_t n, const T*& ptr, std::unique_ptr<T[]>& u)
{
if (n == 0) {
u.reset(nullptr);
return nullptr;
}
T* retVal;
if (mInputControl.useExternal()) {
u.reset(nullptr);
mInputControl.checkCurrent();
GPUProcessor::computePointerWithAlignment(mInputControl.ptrCurrent, retVal, n);
if ((size_t)((char*)mInputControl.ptrCurrent - (char*)mInputControl.ptrBase) > mInputControl.size) {
throw std::bad_alloc();
}
} else {
u.reset(new T[n]);
retVal = u.get();
if (mProcessingSettings.registerStandaloneInputMemory) {
if (registerMemoryForGPU(u.get(), n * sizeof(T))) {
GPUError("Error registering memory for GPU: %p - %ld bytes\n", (void*)u.get(), (int64_t)(n * sizeof(T)));
throw std::bad_alloc();
}
}
}
ptr = retVal;
return retVal;
}
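Review note: this helper leaves the header in this PR (the GPUReconstruction.cxx diff above is too large to render, so its new home is not shown). Its external-memory branch is a bump-pointer allocator: align the cursor, carve out the block, and fail hard on overflow. A standalone sketch of that style; computePointerWithAlignment is the real helper, this version is illustrative:

```cpp
#include <cstddef>
#include <cstdint>
#include <new>

template <class T>
T* bumpAlloc(void*& cursor, const void* base, size_t capacity, size_t n)
{
  uintptr_t p = reinterpret_cast<uintptr_t>(cursor);
  p = (p + alignof(T) - 1) & ~uintptr_t(alignof(T) - 1);  // round cursor up to alignof(T)
  T* ret = reinterpret_cast<T*>(p);
  cursor = reinterpret_cast<void*>(p + n * sizeof(T));    // advance past the new block
  if (size_t(static_cast<const char*>(cursor) - static_cast<const char*>(base)) > capacity) {
    throw std::bad_alloc();                               // same overflow policy as above
  }
  return ret;
}
```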

template <class T, typename... Args>
inline T* GPUReconstruction::AddChain(Args... args)
{
@@ -431,31 +412,7 @@ inline T* GPUReconstruction::AddChain(Args... args)
template <class T>
inline int16_t GPUReconstruction::RegisterMemoryAllocation(T* proc, void* (T::*setPtr)(void*), int32_t type, const char* name, const GPUMemoryReuse& re)
{
if (!(type & (GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU))) {
if ((type & GPUMemoryResource::MEMORY_SCRATCH) && !mProcessingSettings.keepDisplayMemory) { // keepAllMemory --> keepDisplayMemory
type |= (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU ? GPUMemoryResource::MEMORY_HOST : GPUMemoryResource::MEMORY_GPU);
} else {
type |= GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU;
}
}
if (proc->mGPUProcessorType == GPUProcessor::PROCESSOR_TYPE_CPU) {
type &= ~GPUMemoryResource::MEMORY_GPU;
}
mMemoryResources.emplace_back(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), (GPUMemoryResource::MemoryType)type, name);
if (mMemoryResources.size() >= 32768) {
throw std::bad_alloc();
}
uint16_t retVal = mMemoryResources.size() - 1;
if (re.type != GPUMemoryReuse::NONE && !mProcessingSettings.disableMemoryReuse) {
const auto& it = mMemoryReuse1to1.find(re.id);
if (it == mMemoryReuse1to1.end()) {
mMemoryReuse1to1[re.id] = {proc, retVal};
} else {
mMemoryResources[retVal].mReuse = it->second.res[0];
it->second.res.emplace_back(retVal);
}
}
return retVal;
return RegisterMemoryAllocationHelper(proc, static_cast<void* (GPUProcessor::*)(void*)>(setPtr), type, name, re);
}

template <class T>
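Review note: RegisterMemoryAllocation now keeps only a thin template thunk in the header and forwards to the non-template RegisterMemoryAllocationHelper declared in the private section above, so the heavy body compiles once in the .cxx. A self-contained sketch of the pattern with illustrative names (it assumes T derives from Proc):

```cpp
#include <cstdint>

class Proc { public: virtual ~Proc() = default; };

class Reco {
 public:
  template <class T>
  int16_t Register(T* p, void* (T::*setPtr)(void*))
  {
    // only the member-pointer conversion depends on T; everything else is hidden
    return RegisterHelper(p, static_cast<void* (Proc::*)(void*)>(setPtr));
  }
 private:
  int16_t RegisterHelper(Proc* p, void* (Proc::*setPtr)(void*)); // defined in one .cxx
};
```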
@@ -471,7 +428,7 @@ inline void GPUReconstruction::SetupGPUProcessor(T* proc, bool allocate)
{
static_assert(sizeof(T) > sizeof(GPUProcessor), "Need to setup derived class");
if (allocate) {
proc->SetMaxData(mHostConstantMem->ioPtrs);
proc->SetMaxData(GetIOPtrs());
}
if (proc->mGPUProcessorType != GPUProcessor::PROCESSOR_TYPE_DEVICE && proc->mLinkedProcessor) {
std::memcpy((void*)proc->mLinkedProcessor, (const void*)proc, sizeof(*proc));
49 changes: 24 additions & 25 deletions GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -30,19 +30,18 @@
#include "GPUTRDTrackletLabels.h"
#include "GPUMemoryResource.h"
#include "GPUConstantMem.h"
#include "GPULogging.h"
#include "GPUMemorySizeScalers.h"
#include "GPUReconstructionProcessingKernels.inc"

#include <atomic>
#include <ctime>

#define GPUCA_LOGGING_PRINTF
#include "GPULogging.h"

#ifndef _WIN32
#include <unistd.h>
#endif

using namespace o2::gpu;
using namespace o2::gpu::gpu_reconstruction_kernels;

constexpr GPUReconstructionCPU::krnlRunRange GPUReconstructionCPU::krnlRunRangeNone;
constexpr GPUReconstructionCPU::krnlEvent GPUReconstructionCPU::krnlEventNone;
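Note the include ordering in the hunk above: GPUCA_LOGGING_PRINTF is now defined immediately before GPULogging.h is included, because the header selects its logging backend at preprocessing time (assumed semantics of the macro). The configure-then-include idiom in miniature:

```cpp
#define GPUCA_LOGGING_PRINTF   // must precede the include; the header branches on it
#include "GPULogging.h"        // now expands its logging macros to the printf backend
```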
@@ -55,7 +54,7 @@ GPUReconstructionCPU::~GPUReconstructionCPU()
}

template <class T, int32_t I, typename... Args>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
inline void GPUReconstructionCPU::runKernelBackendInternal(const krnlSetupTime& _xyz, const Args&... args)
{
auto& x = _xyz.x;
auto& y = _xyz.y;
@@ -67,7 +66,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetu
}
int32_t nThreads = getNKernelHostThreads(false);
if (nThreads > 1) {
if (mProcessingSettings.debugLevel >= 5) {
if (GetProcessingSettings().debugLevel >= 5) {
printf("Running %d Threads\n", mThreading->activeThreads->max_concurrency());
}
tbb::this_task_arena::isolate([&] {
@@ -89,7 +88,7 @@
}

template <>
inline void GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
inline void GPUReconstructionCPU::runKernelBackendInternal<GPUMemClean16, 0>(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
{
int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNKernelHostThreads(true)));
if (nThreads > 1) {
@@ -110,7 +109,7 @@ inline void GPUReconstructionCPUBackend::runKernelBackendInternal<GPUMemClean16,
}
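A standalone sketch of the chunked-clear strategy in the GPUMemClean16 specialization above: clamp the worker count so each thread clears at least roughly 16 MiB, then hand every worker one contiguous slice:

```cpp
#include <tbb/parallel_for.h>
#include <algorithm>
#include <cstdint>
#include <cstring>

void clearParallel(void* ptr, uint64_t size, int32_t maxThreads)
{
  // one thread per ~16 MiB, never fewer than one
  const int32_t nThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), maxThreads));
  tbb::parallel_for(0, nThreads, [&](int32_t i) {
    const uint64_t chunk = (size + nThreads - 1) / nThreads;      // ceil-divide
    const uint64_t begin = uint64_t(i) * chunk;
    const uint64_t end = std::min<uint64_t>(size, begin + chunk); // last slice may be short
    if (begin < end) {
      memset(static_cast<char*>(ptr) + begin, 0, end - begin);
    }
  });
}
```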

template <class T, int32_t I, typename... Args>
void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
void GPUReconstructionCPU::runKernelBackend(const krnlSetupArgs<T, I, Args...>& args)
{
#pragma GCC diagnostic push
#if defined(__clang__)
@@ -121,14 +120,14 @@ void GPUReconstructionCPUBackend::runKernelBackend(const krnlSetupArgs<T, I, Arg
}

template <class S, int32_t I>
gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties(int gpu)
{
if (gpu == -1) {
gpu = IsGPU();
}
const auto num = GetKernelNum<S, I>();
const auto* p = gpu ? mParDevice : mParCPU;
gpu_reconstruction_kernels::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
GPUReconstructionProcessing::krnlProperties ret = {p->par_LB_maxThreads[num], p->par_LB_minBlocks[num], p->par_LB_forceBlocks[num]};
if (ret.nThreads == 0) {
ret.nThreads = gpu ? mThreadCount : 1u;
}
@@ -138,9 +137,9 @@ gpu_reconstruction_kernels::krnlProperties GPUReconstructionCPU::getKernelProper
return ret;
}

#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template void GPUReconstructionCPUBackend::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
template krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#define GPUCA_KRNL(x_class, x_attributes, x_arguments, x_forward, x_types, ...) \
template void GPUReconstructionCPU::runKernelBackend<GPUCA_M_KRNL_TEMPLATE(x_class)>(const krnlSetupArgs<GPUCA_M_KRNL_TEMPLATE(x_class) GPUCA_M_STRIP(x_types)>& args); \
template GPUReconstructionProcessing::krnlProperties GPUReconstructionCPU::getKernelProperties<GPUCA_M_KRNL_TEMPLATE(x_class)>(int gpu);
#include "GPUReconstructionKernelList.h"
#undef GPUCA_KRNL
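The block above is an X-macro: GPUReconstructionKernelList.h expands one GPUCA_KRNL(...) entry per kernel, and redefining the macro turns the whole list into explicit template instantiations. A toy, self-contained version of the trick (names are illustrative):

```cpp
#include <cstdio>

struct KernelA {};
struct KernelB {};

template <class K>
void runBackend() { std::printf("kernel of size %zu\n", sizeof(K)); } // body lives in one TU only

// "kernel list header": one entry per kernel
#define MY_KERNEL_LIST \
  MY_KRNL(KernelA)     \
  MY_KRNL(KernelB)

// redefine the per-entry macro to emit explicit instantiations
#define MY_KRNL(k) template void runBackend<k>();
MY_KERNEL_LIST
#undef MY_KRNL

int main() { runBackend<KernelA>(); runBackend<KernelB>(); }
```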

@@ -169,7 +168,7 @@ size_t GPUReconstructionCPU::TransferMemoryResourcesHelper(GPUProcessor* proc, i
if (!(res.mType & GPUMemoryResource::MEMORY_GPU) || (res.mType & GPUMemoryResource::MEMORY_CUSTOM_TRANSFER)) {
continue;
}
if (!mProcessingSettings.keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
if (!GetProcessingSettings().keepAllMemory && !all && (res.mType & exc) && !(res.mType & inc)) {
continue;
}
if (toGPU) {
@@ -197,7 +196,7 @@ int32_t GPUReconstructionCPU::InitDevice()
{
mActiveHostKernelThreads = mMaxHostThreads;
mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (mMaster == nullptr) {
if (mDeviceMemorySize > mHostMemorySize) {
mHostMemorySize = mDeviceMemorySize;
@@ -207,7 +206,7 @@
mHostMemoryPermanent = mHostMemoryBase;
ClearAllocatedMemory();
}
if (mProcessingSettings.inKernelParallel) {
if (GetProcessingSettings().inKernelParallel) {
mBlockCount = mMaxHostThreads;
}
mProcShadow.mProcessorsProc = processors();
@@ -216,7 +215,7 @@

int32_t GPUReconstructionCPU::ExitDevice()
{
if (mProcessingSettings.memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
if (mMaster == nullptr) {
operator delete(mHostMemoryBase, std::align_val_t(GPUCA_BUFFER_ALIGNMENT));
}
@@ -232,13 +231,13 @@ int32_t GPUReconstructionCPU::RunChains()
mStatNEvents++;
mNEventsProcessed++;

if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
printf("Allocated memory when starting processing %34s", "");
PrintMemoryOverview();
}
mTimerTotal.Start();
const std::clock_t cpuTimerStart = std::clock();
if (mProcessingSettings.doublePipeline) {
if (GetProcessingSettings().doublePipeline) {
int32_t retVal = EnqueuePipeline();
if (retVal) {
return retVal;
@@ -259,7 +258,7 @@
}
mTimerTotal.Stop();
mStatCPUTime += (double)(std::clock() - cpuTimerStart) / CLOCKS_PER_SEC;
if (mProcessingSettings.debugLevel >= 3 || mProcessingSettings.allocDebugLevel) {
if (GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) {
printf("Allocated memory when ending processing %36s", "");
PrintMemoryOverview();
}
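Review note: RunChains() times wall clock (mTimerTotal) and process CPU time (std::clock()) side by side; with in-kernel host parallelism the CPU time can legitimately exceed the wall time. A standalone sketch of the same dual-clock measurement (workload is illustrative):

```cpp
#include <chrono>
#include <cstdio>
#include <ctime>

int main()
{
  const auto wallStart = std::chrono::steady_clock::now();
  const std::clock_t cpuStart = std::clock();      // CPU time summed over all threads

  volatile double sink = 0;                        // stand-in for the reconstruction work
  for (int i = 0; i < 50000000; i++) {
    sink = sink + i;
  }

  const double cpuSec = double(std::clock() - cpuStart) / CLOCKS_PER_SEC;
  const double wallSec = std::chrono::duration<double>(std::chrono::steady_clock::now() - wallStart).count();
  std::printf("wall %.3f s, cpu %.3f s\n", wallSec, cpuSec);
  return 0;
}
```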
@@ -281,7 +280,7 @@
for (int32_t j = 0; j < mTimers[i]->num; j++) {
HighResTimer& timer = mTimers[i]->timer[j];
time += timer.GetElapsedTime();
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
timer.Reset();
}
}
@@ -297,7 +296,7 @@
snprintf(bandwidth, 256, " (%8.3f GB/s - %'14zu bytes - %'14zu per call)", mTimers[i]->memSize / time * 1e-9, mTimers[i]->memSize / mStatNEvents, mTimers[i]->memSize / mStatNEvents / mTimers[i]->count);
}
printf("Execution Time: Task (%c %8ux): %50s Time: %'10.0f us%s\n", type == 0 ? 'K' : 'C', mTimers[i]->count, mTimers[i]->name.c_str(), time * 1000000 / mStatNEvents, bandwidth);
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mTimers[i]->count = 0;
mTimers[i]->memSize = 0;
}
@@ -317,7 +316,7 @@
printf("Execution Time: Step (D %8ux): %11s %38s Time: %'10.0f us (%8.3f GB/s - %'14zu bytes - %'14zu per call)\n", mTimersRecoSteps[i].countToHost, "DMA to Host", GPUDataTypes::RECO_STEP_NAMES[i], mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1000000 / mStatNEvents,
mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].timerToHost.GetElapsedTime() * 1e-9, mTimersRecoSteps[i].bytesToHost / mStatNEvents, mTimersRecoSteps[i].bytesToHost / mTimersRecoSteps[i].countToHost);
}
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mTimersRecoSteps[i].bytesToGPU = mTimersRecoSteps[i].bytesToHost = 0;
mTimersRecoSteps[i].timerToGPU.Reset();
mTimersRecoSteps[i].timerToHost.Reset();
@@ -340,7 +339,7 @@
} else if (GetProcessingSettings().debugLevel >= 0) {
GPUInfo("Total Wall Time: %10.0f us%s", mStatWallTime, nEventReport.c_str());
}
if (mProcessingSettings.resetTimers) {
if (GetProcessingSettings().resetTimers) {
mStatNEvents = 0;
mStatCPUTime = 0;
mTimerTotal.Reset();
@@ -366,7 +365,7 @@ void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, cons
if (!((size_t)&param().occupancyTotal - (size_t)&param().occupancyMap == sizeof(param().occupancyMap) && sizeof(param().occupancyMap) == sizeof(size_t) && sizeof(param().occupancyTotal) < sizeof(size_t))) {
throw std::runtime_error("occupancy data not consecutive in GPUParam");
}
const auto threadContext = GetThreadContext();
const auto holdContext = GetThreadContext();
size_t tmp[2] = {(size_t)mapGPU, 0};
memcpy(&tmp[1], &occupancyTotal, sizeof(occupancyTotal));
WriteToConstantMemory((char*)&processors()->param.occupancyMap - (char*)processors(), &tmp, sizeof(param().occupancyMap) + sizeof(param().occupancyTotal), stream);
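The runtime check above guards a single WriteToConstantMemory() covering both occupancy fields, which only works if they sit consecutively in GPUParam. A compile-time variant of the same invariant, sketched on a stand-in struct:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

struct ParamLike {                 // stand-in for the relevant corner of GPUParam
  const uint32_t* occupancyMap;
  uint32_t occupancyTotal;
};

// fields must be adjacent so one copy can update both
static_assert(offsetof(ParamLike, occupancyTotal) - offsetof(ParamLike, occupancyMap) == sizeof(const uint32_t*),
              "occupancy data not consecutive");

int main()
{
  size_t tmp[2] = {0, 0};                        // pointer slot + packed total
  const uint32_t total = 42;
  std::memcpy(&tmp[1], &total, sizeof(total));   // mirror the diff's packing
  uint32_t back = 0;
  std::memcpy(&back, &tmp[1], sizeof(back));
  return back == total ? 0 : 1;
}
```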