Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 44 additions & 28 deletions GPU/GPUTracking/Base/GPUReconstruction.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -712,31 +712,43 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro

void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
{
if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) {
throw std::runtime_error("Requested invalid memory typo for unmanaged allocation");
}
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mUnmanagedChunks.back().get());
} else {
if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
GPUError("Must not allocate direct memory while volatile chunks are allocated");
throw std::bad_alloc();
}
void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
char* retVal;
GPUProcessor::computePointerWithAlignment(pool, retVal, size);
if (pool > poolend) {
GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
throw std::bad_alloc();
}
UpdateMaxMemoryUsed();
if (GetProcessingSettings().allocDebugLevel >= 2) {
std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
if ((type & GPUMemoryResource::MEMORY_STACK)) {
mNonPersistentIndividualDirectAllocations.emplace_back(retVal, alignedDeleter());
} else {
mDirectMemoryChunks.emplace_back(retVal, alignedDeleter());
}
return retVal;
}

if ((type & ~(GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK)) || ((type & GPUMemoryResource::MEMORY_HOST) && (type & GPUMemoryResource::MEMORY_GPU))) {
throw std::runtime_error("Requested invalid memory typo for direct allocation");
}
if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
GPUError("Must not allocate direct memory while volatile chunks are allocated");
throw std::bad_alloc();
}

void*& pool = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPool : mHostMemoryPool;
void*& poolend = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
char* retVal;
if ((type & GPUMemoryResource::MEMORY_STACK)) {
poolend = (char*)poolend - size;
poolend = (char*)poolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(poolend);
retVal = (char*)poolend;
} else {
GPUProcessor::computePointerWithAlignment(pool, retVal, size);
}
if (pool > poolend) {
GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
throw std::bad_alloc();
}
UpdateMaxMemoryUsed();
if (GetProcessingSettings().allocDebugLevel >= 2) {
std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
}
return retVal;
}

void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
Expand Down Expand Up @@ -765,8 +777,9 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
if (device) {
return AllocateVolatileDeviceMemory(size);
}
mVolatileChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mVolatileChunks.back().get());
char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
mVolatileChunks.emplace_back(retVal, alignedDeleter());
return retVal;
}

void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile()
Expand Down Expand Up @@ -851,7 +864,7 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)

void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
{
mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag);
mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag);
}

void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
Expand All @@ -862,11 +875,11 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
if (mNonPersistentMemoryStack.size() == 0) {
GPUFatal("Trying to pop memory state from empty stack");
}
if (tag != 0 && std::get<3>(mNonPersistentMemoryStack.back()) != tag) {
GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str());
if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
}
if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
PrintMemoryOverview();
printf("%76s", "");
PrintMemoryMax();
Expand All @@ -882,6 +895,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
res->mPtrDevice = nullptr;
}
mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
mNonPersistentMemoryStack.pop_back();
}

Expand Down Expand Up @@ -917,9 +931,11 @@ void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs)
FreeRegisteredMemory(i);
}
}
mUnmanagedChunks.clear();
mNonPersistentMemoryStack.clear();
mNonPersistentIndividualAllocations.clear();
mDirectMemoryChunks.clear();
mNonPersistentIndividualDirectAllocations.clear();
mVolatileChunks.clear();
mVolatileMemoryStart = nullptr;
if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
mHostMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mHostMemoryPermanent);
Expand Down
10 changes: 7 additions & 3 deletions GPU/GPUTracking/Base/GPUReconstruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ class GPUReconstruction
class LibraryLoader; // These must be the first members to ensure correct destructor order!
std::shared_ptr<LibraryLoader> mMyLib = nullptr;
std::vector<GPUMemoryResource> mMemoryResources;
std::vector<std::unique_ptr<char[]>> mUnmanagedChunks;
std::vector<std::unique_ptr<char[]>> mVolatileChunks;
std::vector<std::unique_ptr<GPUChain>> mChains;

public:
Expand Down Expand Up @@ -373,9 +371,15 @@ class GPUReconstruction
GPUProcessor* proc = nullptr;
std::vector<uint16_t> res;
};
// Deleter for buffers obtained via `new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]`.
// The buffers are created with the array form of the aligned new-expression, which calls
// operator new[](size_t, std::align_val_t); they must therefore be released with the matching
// aligned operator delete[] (the non-array form is a mismatched deallocation).
struct alignedDeleter {
  // Releases ptr, which must be null or originate from the aligned array new-expression above.
  void operator()(void* ptr) const noexcept { ::operator delete[](ptr, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); }
};
std::unordered_map<GPUMemoryReuse::ID, MemoryReuseMeta> mMemoryReuse1to1;
std::vector<std::tuple<void*, void*, size_t, uint64_t>> mNonPersistentMemoryStack;
std::vector<std::tuple<void*, void*, size_t, size_t, uint64_t>> mNonPersistentMemoryStack; // hostPoolEnd, devicePoolEnd, individualAllocationCount, directIndividualAllocationCount, tag
std::vector<GPUMemoryResource*> mNonPersistentIndividualAllocations;
std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;

std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;

Expand Down
6 changes: 3 additions & 3 deletions GPU/GPUTracking/Standalone/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,10 +125,10 @@ find_package(O2GPU REQUIRED)

if(GPUCA_CONFIG_ONNX)
find_package(onnxruntime REQUIRED)
if(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD)
set(ORT_CUDA_BUILD ON)
elseif(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD)
if(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD)
set(ORT_ROCM_BUILD ON)
elseif(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD)
set(ORT_CUDA_BUILD ON)
endif()
else()
set(onnxruntime_FOUND OFF)
Expand Down
17 changes: 5 additions & 12 deletions dependencies/FindO2GPU.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,9 @@ endif()
# ---------------------------------- CUDA ----------------------------------
if(ENABLE_CUDA)
if(CUDA_COMPUTETARGET)
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE)
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET})
else()
set(CMAKE_CUDA_ARCHITECTURES 61-virtual)
endif()
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
Expand All @@ -121,11 +123,6 @@ if(ENABLE_CUDA)
message(STATUS "Using as CUDA GCC version: ${GPUCA_CUDA_GCCBIN}")
set(CMAKE_CUDA_HOST_COMPILER "${GPUCA_CUDA_GCCBIN}")
endif()
if(CUDA_COMPUTETARGET)
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE)
else()
set(CMAKE_CUDA_ARCHITECTURES 61-virtual CACHE STRING "" FORCE)
endif()
enable_language(CUDA)
get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
if (ENABLE_CUDA STREQUAL "AUTO")
Expand Down Expand Up @@ -231,19 +228,15 @@ endif()
# ---------------------------------- HIP ----------------------------------
if(ENABLE_HIP)
if(HIP_AMDGPUTARGET)
set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "" FORCE)
set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}")
set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}")
endif()
if(NOT "$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" AND NOT CMAKE_PREFIX_PATH MATCHES "rocm" AND EXISTS "/opt/rocm/lib/cmake/")
list(APPEND CMAKE_PREFIX_PATH "/opt/rocm/lib/cmake")
endif()
if("$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" OR CMAKE_PREFIX_PATH MATCHES "rocm")
set(CMAKE_HIP_STANDARD ${CMAKE_CXX_STANDARD})
set(CMAKE_HIP_STANDARD_REQUIRED TRUE)
if(HIP_AMDGPUTARGET)
set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
set(GPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
endif()
set(TMP_ROCM_DIR_LIST "${CMAKE_PREFIX_PATH}:$ENV{CMAKE_PREFIX_PATH}")
string(REPLACE ":" ";" TMP_ROCM_DIR_LIST "${TMP_ROCM_DIR_LIST}")
list(FILTER TMP_ROCM_DIR_LIST INCLUDE REGEX rocm)
Expand Down