Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions GPU/GPUTracking/Base/GPUMemoryResource.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,24 +56,24 @@ class GPUMemoryResource

public:
enum MemoryType {
MEMORY_HOST = 1,
MEMORY_GPU = 2,
MEMORY_INPUT_FLAG = 4,
MEMORY_INPUT = 7,
MEMORY_OUTPUT_FLAG = 8,
MEMORY_OUTPUT = 11,
MEMORY_INOUT = 15,
MEMORY_SCRATCH = 16,
MEMORY_SCRATCH_HOST = 17,
MEMORY_EXTERNAL = 32,
MEMORY_PERMANENT = 64,
MEMORY_CUSTOM = 128,
MEMORY_CUSTOM_TRANSFER = 256,
MEMORY_STACK = 512
MEMORY_HOST = 1, // Memory allocated on host (irrespective of other flags)
MEMORY_GPU = 2, // Memory allocated on GPU (irrespective of other flags)
MEMORY_INPUT_FLAG = 4, // Flag to signal this memory is copied to GPU with TransferMemoryResourcesToGPU, and the like
MEMORY_INPUT = 7, // Input data for GPU has the MEMORY_INPUT_FLAG flag and is allocated on host and GPU
MEMORY_OUTPUT_FLAG = 8, // Flag to signal this memory is copied to Host with TransferMemoryResourcesToHost, and the like
MEMORY_OUTPUT = 11, // Output data for GPU has the MEMORY_OUTPUT_FLAG flag and is allocated on host and GPU
MEMORY_INOUT = 15, // Combination of MEMORY_INPUT and MEMORY_OUTPUT
MEMORY_SCRATCH = 16, // Scratch memory, is allocated only on GPU by default if running on GPU, only on host otherwise, if MEMORY_HOST and MEMORY_GPU flags not set.
MEMORY_SCRATCH_HOST = 17, // Scratch memory only on host
MEMORY_EXTERNAL = 32, // Special flag to signal that memory on host shall not be allocated, but will be provided externally and manually
MEMORY_PERMANENT = 64, // Permanent memory, registered once with AllocateRegisteredPermanentMemory, not per time frame. Only for small sizes!
MEMORY_CUSTOM = 128, // Memory is not allocated automatically with AllocateRegisteredMemory(GPUProcessor), but must be allocated manually via AllocateRegisteredMemory(memoryId)
MEMORY_CUSTOM_TRANSFER = 256, // Memory is not transferred automatically with TransferMemoryResourcesTo, but must be transferred manually with TransferMemoryTo...(memoryId)
MEMORY_STACK = 512 // Use memory from non-persistent stack at the end of the global memory region. Not persistent for full TF. Use PushNonPersistentMemory and PopNonPersistentMemory to release memory from the stack
};
enum AllocationType { ALLOCATION_AUTO = 0,
ALLOCATION_INDIVIDUAL = 1,
ALLOCATION_GLOBAL = 2 };
enum AllocationType { ALLOCATION_AUTO = 0, // --> GLOBAL if GPU is used, INDIVIDUAL otherwise
ALLOCATION_INDIVIDUAL = 1, // Individual memory allocations with malloc (host only)
ALLOCATION_GLOBAL = 2 }; // Allocate memory blocks from large preallocated memory range with internal allocator (host and GPU)

GPUMemoryResource(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), MemoryType type, const char* name = "") : mProcessor(proc), mPtr(nullptr), mPtrDevice(nullptr), mSetPointers(setPtr), mName(name), mSize(0), mOverrideSize(0), mReuse(-1), mType(type)
{
Expand Down
56 changes: 37 additions & 19 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,10 @@ size_t GPUReconstruction::AllocateRegisteredPermanentMemory()
if (GetProcessingSettings().debugLevel >= 5) {
GPUInfo("Allocating Permanent Memory");
}
if (mVolatileMemoryStart) {
GPUError("Must not allocate permanent memory while volatile chunks are allocated");
throw std::bad_alloc();
}
int32_t total = 0;
for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
Expand Down Expand Up @@ -669,6 +673,10 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
GPUError("Device Processor not set (%s)", res->mName);
throw std::bad_alloc();
}
if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && !(res->mType & GPUMemoryResource::MEMORY_STACK)) {
GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
throw std::bad_alloc();
}
size_t size = AllocateRegisteredMemoryHelper(res, res->mPtrDevice, recPool->mDeviceMemoryPool, recPool->mDeviceMemoryBase, recPool->mDeviceMemorySize, &GPUMemoryResource::SetDevicePointers, recPool->mDeviceMemoryPoolEnd, " gpu");

if (!(res->mType & GPUMemoryResource::MEMORY_HOST) || (res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
Expand Down Expand Up @@ -702,7 +710,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
return res->mReuse >= 0 ? 0 : res->mSize;
}

void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
{
if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) {
throw std::runtime_error("Requested invalid memory typo for unmanaged allocation");
Expand All @@ -711,6 +719,10 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mUnmanagedChunks.back().get());
} else {
if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
GPUError("Must not allocate direct memory while volatile chunks are allocated");
throw std::bad_alloc();
}
void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
char* retVal;
Expand Down Expand Up @@ -745,7 +757,6 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
if (GetProcessingSettings().allocDebugLevel >= 2) {
std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
}

return retVal;
}

Expand All @@ -758,6 +769,30 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mVolatileChunks.back().get());
}

// Switch the allocator into volatile mode: subsequent device memory
// allocations are treated as volatile and can be released all at once
// via ReturnVolatileDeviceMemory().
void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile()
{
mDeviceMemoryAsVolatile = true; // Checked by the allocation paths to permit allocations while volatile chunks exist
// Zero-sized volatile allocation — presumably records the current device pool
// position in mVolatileMemoryStart so the pool can later be rewound; confirm
// against AllocateVolatileDeviceMemory's implementation.
AllocateVolatileDeviceMemory(0);
}

// Release all volatile device memory in one go and leave volatile allocation
// mode (counterpart to MakeFutureDeviceMemoryAllocationsVolatile).
void GPUReconstruction::ReturnVolatileDeviceMemory()
{
mDeviceMemoryAsVolatile = false; // Future device allocations are no longer volatile
if (mVolatileMemoryStart) {
// Rewind the device memory pool to where the first volatile chunk began
mDeviceMemoryPool = mVolatileMemoryStart;
mVolatileMemoryStart = nullptr; // Marks that no volatile chunks are currently allocated
}
if (GetProcessingSettings().allocDebugLevel >= 2) {
std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
}
}

// Release all volatile memory: rewinds the device pool (via
// ReturnVolatileDeviceMemory) and frees the individually allocated
// volatile chunks held in mVolatileChunks.
void GPUReconstruction::ReturnVolatileMemory()
{
ReturnVolatileDeviceMemory(); // Also clears the device-memory-as-volatile mode
mVolatileChunks.clear(); // unique_ptr-owned char[] buffers — freed on clear
}

void GPUReconstruction::ResetRegisteredMemoryPointers(GPUProcessor* proc)
{
for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
Expand Down Expand Up @@ -814,23 +849,6 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)
res->mPtrDevice = nullptr;
}

void GPUReconstruction::ReturnVolatileDeviceMemory()
{
if (mVolatileMemoryStart) {
mDeviceMemoryPool = mVolatileMemoryStart;
mVolatileMemoryStart = nullptr;
}
if (GetProcessingSettings().allocDebugLevel >= 2) {
std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
}
}

void GPUReconstruction::ReturnVolatileMemory()
{
ReturnVolatileDeviceMemory();
mVolatileChunks.clear();
}

void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
{
mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag);
Expand Down
18 changes: 10 additions & 8 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,10 @@ class GPUReconstruction

size_t AllocateRegisteredMemory(int16_t res, GPUOutputControl* control = nullptr);
void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction* rec, GPUOutputControl* control = nullptr);
void* AllocateUnmanagedMemory(size_t size, int32_t type);
void* AllocateDirectMemory(size_t size, int32_t type);
void* AllocateVolatileDeviceMemory(size_t size);
void* AllocateVolatileMemory(size_t size, bool device);
void MakeFutureDeviceMemoryAllocationsVolatile();
void FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom = false, bool freePermanent = false);
void FreeRegisteredMemory(int16_t res);
void ClearAllocatedMemory(bool clearOutputs = true);
Expand Down Expand Up @@ -326,14 +327,15 @@ class GPUReconstruction
void* mHostMemoryPoolBlocked = nullptr; // Ptr to end of pool
size_t mHostMemorySize = 0; // Size of host memory buffer
size_t mHostMemoryUsedMax = 0; // Maximum host memory size used over time
void* mDeviceMemoryBase = nullptr; //
void* mDeviceMemoryPermanent = nullptr; //
void* mDeviceMemoryPool = nullptr; //
void* mDeviceMemoryPoolEnd = nullptr; //
void* mDeviceMemoryPoolBlocked = nullptr; //
size_t mDeviceMemorySize = 0; //
void* mDeviceMemoryBase = nullptr; // Same for device ...
void* mDeviceMemoryPermanent = nullptr; // ...
void* mDeviceMemoryPool = nullptr; // ...
void* mDeviceMemoryPoolEnd = nullptr; // ...
void* mDeviceMemoryPoolBlocked = nullptr; // ...
size_t mDeviceMemorySize = 0; // ...
size_t mDeviceMemoryUsedMax = 0; // ...
void* mVolatileMemoryStart = nullptr; // Ptr to beginning of temporary volatile memory allocation, nullptr if uninitialized
size_t mDeviceMemoryUsedMax = 0; //
bool mDeviceMemoryAsVolatile = false; // Make device memory allocations volatile

std::unordered_set<const void*> mRegisteredMemoryPtrs; // List of pointers registered for GPU

Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Global/GPUChainITS.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
public:
void* allocate(size_t size) override
{
return mFWReco->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU);
return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
}

void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
Expand Down Expand Up @@ -86,7 +86,7 @@ o2::its::TimeFrame* GPUChainITS::GetITSTimeframe()
}
#if !defined(GPUCA_STANDALONE)
if (mITSTimeFrame->mIsGPU) {
auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU); };
auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); };

mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator);
mFrameworkAllocator->setReconstructionFramework(rec());
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ int32_t GPUChainTracking::RunTPCCompression()
}

if (gatherMode == 3) {
mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
mRec->MakeFutureDeviceMemoryAllocationsVolatile();
}
SetupGPUProcessor(&Compressor, true);
new (Compressor.mMemory) GPUTPCCompression::memory;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static inline uint32_t RGB(uint8_t r, uint8_t g, uint8_t b) { return (uint32_t)r
int32_t GPUChainTracking::PrepareProfile()
{
#ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
char* tmpMem = (char*)mRec->AllocateUnmanagedMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
char* tmpMem = (char*)mRec->AllocateDirectMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
processorsShadow()->tpcTrackers[0].mStageAtSync = tmpMem;
runKernel<GPUMemClean16>({{BlockCount(), ThreadCount(), -1}}, tmpMem, PROFILE_MAX_SIZE);
#endif
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
SynchronizeEventAndRelease(mEvents->single, doGPU);

if (GetProcessingSettings().clearO2OutputFromGPU) {
mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
mRec->MakeFutureDeviceMemoryAllocationsVolatile();
}
AllocateRegisteredMemory(Merger.MemoryResOutputO2(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2)]);
AllocateRegisteredMemory(Merger.MemoryResOutputO2Clus(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2ClusRefs)]);
Expand Down