AliceO2Group · davidrohr · Apr 23, 2025 · Apr 21, 2025 · Apr 21, 2025 · Apr 22, 2025
@@ -712,31 +712,43 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
 
 void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
-  if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) {
-    throw std::runtime_error("Requested invalid memory typo for unmanaged allocation");
-  }
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
-    mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
-    return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mUnmanagedChunks.back().get());
-  } else {
-    if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
-      GPUError("Must not allocate direct memory while volatile chunks are allocated");
-      throw std::bad_alloc();
-    }
-    void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
-    void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
-    char* retVal;
-    GPUProcessor::computePointerWithAlignment(pool, retVal, size);
-    if (pool > poolend) {
-      GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
-      throw std::bad_alloc();
-    }
-    UpdateMaxMemoryUsed();
-    if (GetProcessingSettings().allocDebugLevel >= 2) {
-      std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n";
+    char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]; // individual strategy: each request gets its own aligned heap buffer
+    if ((type & GPUMemoryResource::MEMORY_STACK)) {
+      mNonPersistentIndividualDirectAllocations.emplace_back(retVal, alignedDeleter()); // owned by the list that PopNonPersistentMemory resizes
+    } else {
+      mDirectMemoryChunks.emplace_back(retVal, alignedDeleter()); // owned until mDirectMemoryChunks is cleared
     }
     return retVal;
   }
+
+  if ((type & ~(GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK)) || ((type & GPUMemoryResource::MEMORY_HOST) && (type & GPUMemoryResource::MEMORY_GPU))) { // reject unknown flag bits and HOST combined with GPU
+    throw std::runtime_error("Requested invalid memory type for direct allocation");
+  }
+  if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
+    GPUError("Must not allocate direct memory while volatile chunks are allocated");
+    throw std::bad_alloc();
+  }
+
+  void*& pool = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPool : mHostMemoryPool; // references: the selected pool bounds are updated in place
+  void*& poolend = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
+  char* retVal;
+  if ((type & GPUMemoryResource::MEMORY_STACK)) { // stack allocations are carved from the end of the pool, moving poolend downward
+    poolend = (char*)poolend - size;
+    poolend = (char*)poolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(poolend); // presumably rounds down to a GPUCA_MEMALIGN boundary (helper not shown here)
+    retVal = (char*)poolend;
+  } else {
+    GPUProcessor::computePointerWithAlignment(pool, retVal, size); // presumably aligns retVal and advances pool past the new buffer (helper not shown here)
+  }
+  if (pool > poolend) { // the two ends of the pool crossed: the request does not fit
+    GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend));
+    throw std::bad_alloc();
+  }
+  UpdateMaxMemoryUsed();
+  if (GetProcessingSettings().allocDebugLevel >= 2) {
+    std::cout << "Allocated (unmanaged " << ((type & GPUMemoryResource::MEMORY_GPU) ? "gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n"; // bitmask test so GPU|STACK requests are labelled "gpu"
+  }
+  return retVal;
 }
 
 void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
@@ -765,8 +777,9 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
   if (device) {
     return AllocateVolatileDeviceMemory(size);
   }
-  mVolatileChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
-  return GPUProcessor::alignPointer<GPUCA_BUFFER_ALIGNMENT>(mVolatileChunks.back().get());
+  char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+  mVolatileChunks.emplace_back(retVal, alignedDeleter());
+  return retVal;
 }
 
 void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile()
@@ -851,7 +864,7 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)
 
 void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
 {
-  mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag);
+  mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag); // Records both pool end pointers and the sizes of both individual-allocation lists under this tag
 }
 
 void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
@@ -862,11 +875,11 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
   if (mNonPersistentMemoryStack.size() == 0) {
     GPUFatal("Trying to pop memory state from empty stack");
   }
-  if (tag != 0 && std::get<3>(mNonPersistentMemoryStack.back()) != tag) {
-    GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str());
+  if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) {
+    GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str());
   }
   if ((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) {
-    printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
+    printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size());
     PrintMemoryOverview();
     printf("%76s", "");
     PrintMemoryMax();
@@ -882,6 +895,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag)
     res->mPtrDevice = nullptr;
   }
   mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
+  mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back()));
   mNonPersistentMemoryStack.pop_back();
 }
 
@@ -917,9 +931,11 @@ void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs)
       FreeRegisteredMemory(i);
     }
   }
-  mUnmanagedChunks.clear();
   mNonPersistentMemoryStack.clear();
   mNonPersistentIndividualAllocations.clear();
+  mDirectMemoryChunks.clear();
+  mNonPersistentIndividualDirectAllocations.clear();
+  mVolatileChunks.clear();
   mVolatileMemoryStart = nullptr;
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) {
     mHostMemoryPool = GPUProcessor::alignPointer<GPUCA_MEMALIGN>(mHostMemoryPermanent);

@@ -69,8 +69,6 @@ class GPUReconstruction
   class LibraryLoader; // These must be the first members to ensure correct destructor order!
   std::shared_ptr<LibraryLoader> mMyLib = nullptr;
   std::vector<GPUMemoryResource> mMemoryResources;
-  std::vector<std::unique_ptr<char[]>> mUnmanagedChunks;
-  std::vector<std::unique_ptr<char[]>> mVolatileChunks;
   std::vector<std::unique_ptr<GPUChain>> mChains;
 
  public:
@@ -373,9 +371,15 @@ class GPUReconstruction
     GPUProcessor* proc = nullptr;
     std::vector<uint16_t> res;
   };
+  struct alignedDeleter { // Deleter for buffers created with `new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]`
+    void operator()(void* ptr) const noexcept { ::operator delete[](ptr, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); } // array form, to match the array new-expression that allocated the buffer
+  };
   std::unordered_map<GPUMemoryReuse::ID, MemoryReuseMeta> mMemoryReuse1to1;
-  std::vector<std::tuple<void*, void*, size_t, uint64_t>> mNonPersistentMemoryStack;
+  std::vector<std::tuple<void*, void*, size_t, size_t, uint64_t>> mNonPersistentMemoryStack; // hostPoolAddress, devicePoolAddress, individualAllocationCount, directIndividualAllocationCount, tag
   std::vector<GPUMemoryResource*> mNonPersistentIndividualAllocations;
+  std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
+  std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
+  std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;
 
   std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;
 

@@ -125,10 +125,10 @@ find_package(O2GPU REQUIRED)
 
 if(GPUCA_CONFIG_ONNX)
   find_package(onnxruntime REQUIRED)
-  if(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD)
-    set(ORT_CUDA_BUILD ON)
-  elseif(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD)
+  if(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD)
     set(ORT_ROCM_BUILD ON)
+  elseif(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD)
+    set(ORT_CUDA_BUILD ON)
   endif()
 else()
   set(onnxruntime_FOUND OFF)

@@ -104,7 +104,9 @@ endif()
 # ---------------------------------- CUDA ----------------------------------
 if(ENABLE_CUDA)
   if(CUDA_COMPUTETARGET)
-    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE)
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET})
+  else()
+    set(CMAKE_CUDA_ARCHITECTURES 61-virtual)
   endif()
   set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
   set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
@@ -121,11 +123,6 @@ if(ENABLE_CUDA)
       message(STATUS "Using as CUDA GCC version: ${GPUCA_CUDA_GCCBIN}")
       set(CMAKE_CUDA_HOST_COMPILER "${GPUCA_CUDA_GCCBIN}")
     endif()
-    if(CUDA_COMPUTETARGET)
-      set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE)
-    else()
-      set(CMAKE_CUDA_ARCHITECTURES 61-virtual CACHE STRING "" FORCE)
-    endif()
     enable_language(CUDA)
     get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
     if (ENABLE_CUDA STREQUAL "AUTO")
@@ -231,19 +228,15 @@ endif()
 # ---------------------------------- HIP ----------------------------------
 if(ENABLE_HIP)
   if(HIP_AMDGPUTARGET)
-    set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "" FORCE)
+    set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}")
+    set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}")
   endif()
   if(NOT "$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" AND NOT CMAKE_PREFIX_PATH MATCHES "rocm" AND EXISTS "/opt/rocm/lib/cmake/")
     list(APPEND CMAKE_PREFIX_PATH "/opt/rocm/lib/cmake")
   endif()
   if("$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" OR CMAKE_PREFIX_PATH MATCHES "rocm")
     set(CMAKE_HIP_STANDARD ${CMAKE_CXX_STANDARD})
     set(CMAKE_HIP_STANDARD_REQUIRED TRUE)
-    if(HIP_AMDGPUTARGET)
-      set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
-      set(GPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
-      set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE)
-    endif()
     set(TMP_ROCM_DIR_LIST "${CMAKE_PREFIX_PATH}:$ENV{CMAKE_PREFIX_PATH}")
     string(REPLACE ":" ";" TMP_ROCM_DIR_LIST "${TMP_ROCM_DIR_LIST}")
     list(FILTER TMP_ROCM_DIR_LIST INCLUDE REGEX rocm)