GPU: Make memory allocation and freeing of individual stacked memory thread-safe

davidrohr · davidrohr · commit ce00c8b7b924 · 2025-10-18T09:20:10.000+02:00
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -40,6 +40,7 @@
 
 #include "GPULogging.h"
 #include "utils/strtag.h"
+#include "utils/stdspinlock.h"
 
 #ifdef GPUCA_O2_LIB
 #include "GPUO2InterfaceConfiguration.h"
@@ -589,6 +590,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,
     throw std::bad_alloc();
   }
   size_t retVal;
+  stdspinlock spinlock(mMemoryMutex);
   if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {
     retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));
     memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));
@@ -642,6 +644,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
         std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";
       }
       if (res->mType & GPUMemoryResource::MEMORY_STACK) {
+        stdspinlock spinlock(mMemoryMutex);
         mNonPersistentIndividualAllocations.emplace_back(res);
       }
       if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {
@@ -722,6 +725,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
 
 void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
     char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
     if ((type & GPUMemoryResource::MEMORY_STACK)) {
@@ -763,6 +767,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 
 void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
 {
+  stdspinlock spinlock(mMemoryMutex);
   if (mVolatileMemoryStart == nullptr) {
     mVolatileMemoryStart = mDeviceMemoryPool;
   }
@@ -788,6 +793,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
     return AllocateVolatileDeviceMemory(size);
   }
   char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];
+  stdspinlock spinlock(mMemoryMutex);
   mVolatileChunks.emplace_back(retVal, alignedDeleter());
   return retVal;
 }
@@ -912,6 +918,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, cons
     res->mPtrDevice = nullptr;
   }
   if (!proc) {
+    stdspinlock spinlock(mMemoryMutex);
     mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());
     mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());
     mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -25,6 +25,7 @@
 #include <functional>
 #include <unordered_map>
 #include <unordered_set>
+#include <atomic>
 
 #include "GPUDataTypes.h"
 #include "GPUMemoryResource.h"
@@ -390,6 +391,7 @@ class GPUReconstruction
   std::vector<std::unique_ptr<char[], alignedDeleter>> mNonPersistentIndividualDirectAllocations;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mDirectMemoryChunks;
   std::vector<std::unique_ptr<char[], alignedDeleter>> mVolatileChunks;
+  std::atomic_flag mMemoryMutex = ATOMIC_FLAG_INIT;
 
   std::unique_ptr<GPUReconstructionPipelineContext> mPipelineContext;
 
diff --git a/GPU/GPUTracking/utils/stdspinlock.h b/GPU/GPUTracking/utils/stdspinlock.h
@@ -0,0 +1,58 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file stdspinlock.h
+/// \author David Rohr
+
+#ifndef Q_STDSPINLOCK_H
+#define Q_STDSPINLOCK_H
+
+#include <atomic>
+
+/// \brief Scoped spin lock on top of a caller-owned std::atomic_flag.
+///
+/// The constructor busy-waits until it has set the flag, the destructor clears it again.
+/// The flag is referenced, not owned, and must outlive the guard.
+class stdspinlock
+{
+ public:
+  /// Spin until \p flag could be set by this thread.
+  /// \param flag Flag acting as the mutex; a cleared flag means unlocked.
+  explicit stdspinlock(std::atomic_flag& flag) : mFlag(&flag)
+  {
+    while (mFlag->test_and_set(std::memory_order_acquire)) {
+    }
+  }
+
+  // A copy would refer to the same flag and clear it a second time on destruction,
+  // possibly unlocking it while another thread holds it, so copying is disabled.
+  stdspinlock(const stdspinlock&) = delete;
+  stdspinlock& operator=(const stdspinlock&) = delete;
+
+  /// Unlock before the end of the scope. Further calls and the destructor are then no-ops.
+  void release()
+  {
+    if (mFlag) {
+      mFlag->clear(std::memory_order_release);
+      mFlag = nullptr;
+    }
+  }
+
+  ~stdspinlock()
+  {
+    release();
+  }
+
+ private:
+  std::atomic_flag* mFlag; ///< Flag currently held, nullptr once released.
+};
+
+#endif // Q_STDSPINLOCK_H

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,7 @@`
`40`	`40`
`41`	`41`	`#include "GPULogging.h"`
`42`	`42`	`#include "utils/strtag.h"`
	`43`	`+#include "utils/stdspinlock.h"`
`43`	`44`
`44`	`45`	`#ifdef GPUCA_O2_LIB`
`45`	`46`	`#include "GPUO2InterfaceConfiguration.h"`
`@@ -589,6 +590,7 @@ size_t GPUReconstruction::AllocateRegisteredMemoryHelper(GPUMemoryResource* res,`
`589`	`590`	`throw std::bad_alloc();`
`590`	`591`	`}`
`591`	`592`	`size_t retVal;`
	`593`	`+ stdspinlock spinlock(mMemoryMutex);`
`592`	`594`	`if ((res->mType & GPUMemoryResource::MEMORY_STACK) && memorypoolend) {`
`593`	`595`	`retVal = ptrDiff((res->*setPtr)((char*)1), (char*)(1));`
`594`	`596`	`memorypoolend = (void*)((char*)memorypoolend - GPUProcessor::getAlignmentMod<GPUCA_MEMALIGN>(memorypoolend));`
`@@ -642,6 +644,7 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,`
`642`	`644`	`std::cout << (res->mReuse >= 0 ? "Reused " : "Allocated ") << res->mName << ": " << res->mSize << " (individual" << ((res->mType & GPUMemoryResource::MEMORY_STACK) ? " stack" : "") << ")\n";`
`643`	`645`	`}`
`644`	`646`	`if (res->mType & GPUMemoryResource::MEMORY_STACK) {`
	`647`	`+ stdspinlock spinlock(mMemoryMutex);`
`645`	`648`	`mNonPersistentIndividualAllocations.emplace_back(res);`
`646`	`649`	`}`
`647`	`650`	`if ((size_t)res->mPtr % GPUCA_BUFFER_ALIGNMENT) {`
`@@ -722,6 +725,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro`
`722`	`725`
`723`	`726`	`void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)`
`724`	`727`	`{`
	`728`	`+ stdspinlock spinlock(mMemoryMutex);`
`725`	`729`	`if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {`
`726`	`730`	`char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];`
`727`	`731`	`if ((type & GPUMemoryResource::MEMORY_STACK)) {`
`@@ -763,6 +767,7 @@ void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)`
`763`	`767`
`764`	`768`	`void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)`
`765`	`769`	`{`
	`770`	`+ stdspinlock spinlock(mMemoryMutex);`
`766`	`771`	`if (mVolatileMemoryStart == nullptr) {`
`767`	`772`	`mVolatileMemoryStart = mDeviceMemoryPool;`
`768`	`773`	`}`
`@@ -788,6 +793,7 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)`
`788`	`793`	`return AllocateVolatileDeviceMemory(size);`
`789`	`794`	`}`
`790`	`795`	`char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size];`
	`796`	`+ stdspinlock spinlock(mMemoryMutex);`
`791`	`797`	`mVolatileChunks.emplace_back(retVal, alignedDeleter());`
`792`	`798`	`return retVal;`
`793`	`799`	`}`
`@@ -912,6 +918,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag, cons`
`912`	`918`	`res->mPtrDevice = nullptr;`
`913`	`919`	`}`
`914`	`920`	`if (!proc) {`
	`921`	`+ stdspinlock spinlock(mMemoryMutex);`
`915`	`922`	`mHostMemoryPoolEnd = std::get<0>(mNonPersistentMemoryStack.back());`
`916`	`923`	`mDeviceMemoryPoolEnd = std::get<1>(mNonPersistentMemoryStack.back());`
`917`	`924`	`mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back()));`