From a10834678d0bb47e6469fa31445ac5ab01d2d54a Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Mon, 21 Apr 2025 10:30:29 +0200
Subject: [PATCH] GPU: Add protections against invalid memory allocations
 while volatile memory is allocated

---
 GPU/GPUTracking/Base/GPUMemoryResource.h      | 34 +++++------
 GPU/GPUTracking/Base/GPUReconstruction.cxx    | 56 ++++++++++++-------
 GPU/GPUTracking/Base/GPUReconstruction.h      | 18 +++---
 GPU/GPUTracking/Global/GPUChainITS.cxx        |  4 +-
 .../Global/GPUChainTrackingCompression.cxx    |  2 +-
 .../GPUChainTrackingDebugAndProfiling.cxx     |  2 +-
 .../Global/GPUChainTrackingMerger.cxx         |  2 +-
 7 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUMemoryResource.h b/GPU/GPUTracking/Base/GPUMemoryResource.h
index 06e350db0bfc7..947bcac504733 100644
--- a/GPU/GPUTracking/Base/GPUMemoryResource.h
+++ b/GPU/GPUTracking/Base/GPUMemoryResource.h
@@ -56,24 +56,24 @@ class GPUMemoryResource
 public:
  enum MemoryType {
-   MEMORY_HOST = 1,
-   MEMORY_GPU = 2,
-   MEMORY_INPUT_FLAG = 4,
-   MEMORY_INPUT = 7,
-   MEMORY_OUTPUT_FLAG = 8,
-   MEMORY_OUTPUT = 11,
-   MEMORY_INOUT = 15,
-   MEMORY_SCRATCH = 16,
-   MEMORY_SCRATCH_HOST = 17,
-   MEMORY_EXTERNAL = 32,
-   MEMORY_PERMANENT = 64,
-   MEMORY_CUSTOM = 128,
-   MEMORY_CUSTOM_TRANSFER = 256,
-   MEMORY_STACK = 512
+   MEMORY_HOST = 1,              // Memory allocated on host (irrespective of other flags)
+   MEMORY_GPU = 2,               // Memory allocated on GPU (irrespective of other flags)
+   MEMORY_INPUT_FLAG = 4,        // Flag to signal this memory is copied to the GPU with TransferMemoryResourcesToGPU and the like
+   MEMORY_INPUT = 7,             // Input data for the GPU, has the MEMORY_INPUT_FLAG flag and is allocated on host and GPU
+   MEMORY_OUTPUT_FLAG = 8,       // Flag to signal this memory is copied to the host with TransferMemoryResourcesToHost and the like
+   MEMORY_OUTPUT = 11,           // Output data of the GPU, has the MEMORY_OUTPUT_FLAG flag and is allocated on host and GPU
+   MEMORY_INOUT = 15,            // Combination of MEMORY_INPUT and MEMORY_OUTPUT
+   MEMORY_SCRATCH = 16,          // Scratch memory, allocated by default only on the GPU when running on GPU and only on the host otherwise, if the MEMORY_HOST and MEMORY_GPU flags are not set
+   MEMORY_SCRATCH_HOST = 17,     // Scratch memory only on host
+   MEMORY_EXTERNAL = 32,         // Special flag to signal that memory on host shall not be allocated, but will be provided externally and manually
+   MEMORY_PERMANENT = 64,        // Permanent memory, registered once with AllocateRegisteredPermanentMemory, not per time frame. Only for small sizes!
+   MEMORY_CUSTOM = 128,          // Memory is not allocated automatically with AllocateRegisteredMemory(GPUProcessor), but must be allocated manually via AllocateRegisteredMemory(memoryId)
+   MEMORY_CUSTOM_TRANSFER = 256, // Memory is not transferred automatically with TransferMemoryResourcesTo..., but must be transferred manually with TransferMemoryTo...(memoryId)
+   MEMORY_STACK = 512            // Use memory from the non-persistent stack at the end of the global memory region. Not persistent for the full TF. Use PushNonPersistentMemory and PopNonPersistentMemory to release memory from the stack.
  };

-  enum AllocationType { ALLOCATION_AUTO = 0,
-                        ALLOCATION_INDIVIDUAL = 1,
-                        ALLOCATION_GLOBAL = 2 };
+  enum AllocationType { ALLOCATION_AUTO = 0,       // --> GLOBAL if GPU is used, INDIVIDUAL otherwise
+                        ALLOCATION_INDIVIDUAL = 1, // Individual memory allocations with malloc (host only)
+                        ALLOCATION_GLOBAL = 2 };   // Allocate memory blocks from large preallocated memory range with internal allocator (host and GPU)

  GPUMemoryResource(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), MemoryType type, const char* name = "") : mProcessor(proc), mPtr(nullptr), mPtrDevice(nullptr), mSetPointers(setPtr), mName(name), mSize(0), mOverrideSize(0), mReuse(-1), mType(type)
  {
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index c79c743e96ce5..ab2210e5dd555 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -538,6 +538,10 @@ size_t GPUReconstruction::AllocateRegisteredPermanentMemory()
   if (GetProcessingSettings().debugLevel >= 5) {
     GPUInfo("Allocating Permanent Memory");
   }
+  if (mVolatileMemoryStart) {
+    GPUError("Must not allocate permanent memory while volatile chunks are allocated");
+    throw std::bad_alloc();
+  }
   int32_t total = 0;
   for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
     if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
@@ -669,6 +673,10 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
       GPUError("Device Processor not set (%s)", res->mName);
       throw std::bad_alloc();
     }
+    if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && !(res->mType & GPUMemoryResource::MEMORY_STACK)) {
+      GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
+      throw std::bad_alloc();
+    }
     size_t size = AllocateRegisteredMemoryHelper(res, res->mPtrDevice, recPool->mDeviceMemoryPool, recPool->mDeviceMemoryBase, recPool->mDeviceMemorySize, &GPUMemoryResource::SetDevicePointers, recPool->mDeviceMemoryPoolEnd, " gpu");

     if (!(res->mType & GPUMemoryResource::MEMORY_HOST) || (res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
@@ -702,7 +710,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
   return res->mReuse >= 0 ? 0 : res->mSize;
 }

-void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
+void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
   if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) {
     throw std::runtime_error("Requested invalid memory typo for unmanaged allocation");
@@ -711,6 +719,10 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
     mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
     return GPUProcessor::alignPointer(mUnmanagedChunks.back().get());
   } else {
+    if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
+      GPUError("Must not allocate direct memory while volatile chunks are allocated");
+      throw std::bad_alloc();
+    }
     void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
     void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
     char* retVal;
@@ -745,7 +757,6 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
   if (GetProcessingSettings().allocDebugLevel >= 2) {
     std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
   }
-
   return retVal;
 }

@@ -758,6 +769,30 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
   return GPUProcessor::alignPointer(mVolatileChunks.back().get());
 }

+void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile()
+{
+  mDeviceMemoryAsVolatile = true;
+  AllocateVolatileDeviceMemory(0);
+}
+
+void GPUReconstruction::ReturnVolatileDeviceMemory()
+{
+  mDeviceMemoryAsVolatile = false;
+  if (mVolatileMemoryStart) {
+    mDeviceMemoryPool = mVolatileMemoryStart;
+    mVolatileMemoryStart = nullptr;
+  }
+  if (GetProcessingSettings().allocDebugLevel >= 2) {
+    std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
+  }
+}
+
+void GPUReconstruction::ReturnVolatileMemory()
+{
+  ReturnVolatileDeviceMemory();
+  mVolatileChunks.clear();
+}
+
 void GPUReconstruction::ResetRegisteredMemoryPointers(GPUProcessor* proc)
 {
   for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
@@ -814,23 +849,6 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)
   res->mPtrDevice = nullptr;
 }

-void GPUReconstruction::ReturnVolatileDeviceMemory()
-{
-  if (mVolatileMemoryStart) {
-    mDeviceMemoryPool = mVolatileMemoryStart;
-    mVolatileMemoryStart = nullptr;
-  }
-  if (GetProcessingSettings().allocDebugLevel >= 2) {
-    std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
-  }
-}
-
-void GPUReconstruction::ReturnVolatileMemory()
-{
-  ReturnVolatileDeviceMemory();
-  mVolatileChunks.clear();
-}
-
 void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
 {
   mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag);
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index b6256f7f8ad82..396a007761fb7 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -166,9 +166,10 @@ class GPUReconstruction
   size_t AllocateRegisteredMemory(int16_t res, GPUOutputControl* control = nullptr);
   void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction* rec, GPUOutputControl* control = nullptr);
-  void* AllocateUnmanagedMemory(size_t size, int32_t type);
+  void* AllocateDirectMemory(size_t size, int32_t type);
   void* AllocateVolatileDeviceMemory(size_t size);
   void* AllocateVolatileMemory(size_t size, bool device);
+  void MakeFutureDeviceMemoryAllocationsVolatile();
   void FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom = false, bool freePermanent = false);
   void FreeRegisteredMemory(int16_t res);
   void ClearAllocatedMemory(bool clearOutputs = true);

@@ -326,14 +327,15 @@ class GPUReconstruction
   void* mHostMemoryPoolBlocked = nullptr;     // Ptr to end of pool
   size_t mHostMemorySize = 0;                 // Size of host memory buffer
   size_t mHostMemoryUsedMax = 0;              // Maximum host memory size used over time
-  void* mDeviceMemoryBase = nullptr;          //
-  void* mDeviceMemoryPermanent = nullptr;     //
-  void* mDeviceMemoryPool = nullptr;          //
-  void* mDeviceMemoryPoolEnd = nullptr;       //
-  void* mDeviceMemoryPoolBlocked = nullptr;   //
-  size_t mDeviceMemorySize = 0;               //
+  void* mDeviceMemoryBase = nullptr;          // Same for device ...
+  void* mDeviceMemoryPermanent = nullptr;     // ...
+  void* mDeviceMemoryPool = nullptr;          // ...
+  void* mDeviceMemoryPoolEnd = nullptr;       // ...
+  void* mDeviceMemoryPoolBlocked = nullptr;   // ...
+  size_t mDeviceMemorySize = 0;               // ...
+  size_t mDeviceMemoryUsedMax = 0;            // ...
   void* mVolatileMemoryStart = nullptr;       // Ptr to beginning of temporary volatile memory allocation, nullptr if uninitialized
-  size_t mDeviceMemoryUsedMax = 0;            //
+  bool mDeviceMemoryAsVolatile = false;       // If set, future device memory allocations are treated as volatile

   std::unordered_set<const void*> mRegisteredMemoryPtrs; // List of pointers registered for GPU

diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx
index eeead79b1840b..5d36dc63ca85d 100644
--- a/GPU/GPUTracking/Global/GPUChainITS.cxx
+++ b/GPU/GPUTracking/Global/GPUChainITS.cxx
@@ -28,7 +28,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
 public:
  void* allocate(size_t size) override
  {
-   return mFWReco->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU);
+   return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
  }

  void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
@@ -86,7 +86,7 @@ o2::its::TimeFrame* GPUChainITS::GetITSTimeframe()
 }
 #if !defined(GPUCA_STANDALONE)
   if (mITSTimeFrame->mIsGPU) {
-    auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU); };
+    auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); };

     mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator);
     mFrameworkAllocator->setReconstructionFramework(rec());
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index fc07a91004c5f..24c74a661f18e 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -43,7 +43,7 @@ int32_t GPUChainTracking::RunTPCCompression()
   }

   if (gatherMode == 3) {
-    mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
+    mRec->MakeFutureDeviceMemoryAllocationsVolatile();
   }
   SetupGPUProcessor(&Compressor, true);
   new (Compressor.mMemory) GPUTPCCompression::memory;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 5d05cd6a97776..53bdfbadd4b25 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -34,7 +34,7 @@ static inline uint32_t RGB(uint8_t r, uint8_t g, uint8_t b) { return (uint32_t)r
 int32_t GPUChainTracking::PrepareProfile()
 {
 #ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
-  char* tmpMem = (char*)mRec->AllocateUnmanagedMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
+  char* tmpMem = (char*)mRec->AllocateDirectMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
   processorsShadow()->tpcTrackers[0].mStageAtSync = tmpMem;
   runKernel<GPUMemClean16>({{BlockCount(), ThreadCount(), -1}}, tmpMem, PROFILE_MAX_SIZE);
 #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
index 163f08634ef86..84835a1695071 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
@@ -297,7 +297,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     SynchronizeEventAndRelease(mEvents->single, doGPU);

   if (GetProcessingSettings().clearO2OutputFromGPU) {
-    mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
+    mRec->MakeFutureDeviceMemoryAllocationsVolatile();
   }
   AllocateRegisteredMemory(Merger.MemoryResOutputO2(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2)]);
   AllocateRegisteredMemory(Merger.MemoryResOutputO2Clus(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2ClusRefs)]);
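
Usage sketch of the allocation discipline this patch enforces. The helper name exampleVolatileUsage and the two size parameters are hypothetical, not part of the patch; the call sequence mirrors the gatherMode == 3 path in GPUChainTrackingCompression.cxx and the clearO2OutputFromGPU path in GPUChainTrackingMerger.cxx above, and it assumes a GPU-enabled GPUReconstruction instance (on a host-only instance, AllocateDirectMemory with MEMORY_GPU throws std::runtime_error instead).

#include "GPUReconstruction.h"  // assumed include path for o2::gpu::GPUReconstruction
#include "GPUMemoryResource.h"  // for GPUMemoryResource::MEMORY_GPU

using namespace o2::gpu;

// Hypothetical helper demonstrating the guarded allocation pattern.
void exampleVolatileUsage(GPUReconstruction* rec, size_t tmpSize, size_t outSize)
{
  // Carve a volatile chunk from the end of the device memory pool;
  // this marks mVolatileMemoryStart internally.
  void* tmp = rec->AllocateVolatileDeviceMemory(tmpSize);
  (void)tmp;

  // With the new protections, a non-stacked device allocation at this point
  // would print a GPUError and throw std::bad_alloc:
  // rec->AllocateDirectMemory(outSize, GPUMemoryResource::MEMORY_GPU);

  // Opting in sets mDeviceMemoryAsVolatile, so subsequent device allocations
  // are permitted and are released together with the volatile chunks:
  rec->MakeFutureDeviceMemoryAllocationsVolatile();
  void* out = rec->AllocateDirectMemory(outSize, GPUMemoryResource::MEMORY_GPU);
  (void)out;

  // Rewind the device pool to mVolatileMemoryStart and leave volatile mode.
  rec->ReturnVolatileDeviceMemory();
}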