From a10834678d0bb47e6469fa31445ac5ab01d2d54a Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Mon, 21 Apr 2025 10:30:29 +0200
Subject: [PATCH] GPU: Add protections against invalid memory allocations
 while volatile memory is allocated

---
 GPU/GPUTracking/Base/GPUMemoryResource.h      | 34 +++++------
 GPU/GPUTracking/Base/GPUReconstruction.cxx    | 56 ++++++++++++-------
 GPU/GPUTracking/Base/GPUReconstruction.h      | 18 +++---
 GPU/GPUTracking/Global/GPUChainITS.cxx        |  4 +-
 .../Global/GPUChainTrackingCompression.cxx    |  2 +-
 .../GPUChainTrackingDebugAndProfiling.cxx     |  2 +-
 .../Global/GPUChainTrackingMerger.cxx         |  2 +-
 7 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUMemoryResource.h b/GPU/GPUTracking/Base/GPUMemoryResource.h
index 06e350db0bfc7..947bcac504733 100644
--- a/GPU/GPUTracking/Base/GPUMemoryResource.h
+++ b/GPU/GPUTracking/Base/GPUMemoryResource.h
@@ -56,24 +56,24 @@ class GPUMemoryResource
 public:
  enum MemoryType {
-   MEMORY_HOST = 1,
-   MEMORY_GPU = 2,
-   MEMORY_INPUT_FLAG = 4,
-   MEMORY_INPUT = 7,
-   MEMORY_OUTPUT_FLAG = 8,
-   MEMORY_OUTPUT = 11,
-   MEMORY_INOUT = 15,
-   MEMORY_SCRATCH = 16,
-   MEMORY_SCRATCH_HOST = 17,
-   MEMORY_EXTERNAL = 32,
-   MEMORY_PERMANENT = 64,
-   MEMORY_CUSTOM = 128,
-   MEMORY_CUSTOM_TRANSFER = 256,
-   MEMORY_STACK = 512
+   MEMORY_HOST = 1,              // Memory allocated on host (irrespective of other flags)
+   MEMORY_GPU = 2,               // Memory allocated on GPU (irrespective of other flags)
+   MEMORY_INPUT_FLAG = 4,        // Flag to signal this memory is copied to the GPU with TransferMemoryResourcesToGPU and the like
+   MEMORY_INPUT = 7,             // Input data for the GPU, has the MEMORY_INPUT_FLAG flag and is allocated on host and GPU
+   MEMORY_OUTPUT_FLAG = 8,       // Flag to signal this memory is copied to the host with TransferMemoryResourcesToHost and the like
+   MEMORY_OUTPUT = 11,           // Output data of the GPU, has the MEMORY_OUTPUT_FLAG flag and is allocated on host and GPU
+   MEMORY_INOUT = 15,            // Combination of MEMORY_INPUT and MEMORY_OUTPUT
+   MEMORY_SCRATCH = 16,          // Scratch memory, allocated by default only on the GPU when running on GPU and only on the host otherwise, if the MEMORY_HOST and MEMORY_GPU flags are not set
+   MEMORY_SCRATCH_HOST = 17,     // Scratch memory only on host
+   MEMORY_EXTERNAL = 32,         // Special flag to signal that memory on host shall not be allocated, but will be provided externally and manually
+   MEMORY_PERMANENT = 64,        // Permanent memory, registered once with AllocateRegisteredPermanentMemory, not per time frame. Only for small sizes!
+   MEMORY_CUSTOM = 128,          // Memory is not allocated automatically with AllocateRegisteredMemory(GPUProcessor), but must be allocated manually via AllocateRegisteredMemory(memoryId)
+   MEMORY_CUSTOM_TRANSFER = 256, // Memory is not transferred automatically with TransferMemoryResourcesTo..., but must be transferred manually with TransferMemoryTo...(memoryId)
+   MEMORY_STACK = 512            // Use memory from the non-persistent stack at the end of the global memory region. Not persistent for the full TF. Use PushNonPersistentMemory and PopNonPersistentMemory to release memory from the stack.
  };

-  enum AllocationType { ALLOCATION_AUTO = 0,
-                        ALLOCATION_INDIVIDUAL = 1,
-                        ALLOCATION_GLOBAL = 2 };
+  enum AllocationType { ALLOCATION_AUTO = 0,       // --> GLOBAL if GPU is used, INDIVIDUAL otherwise
+                        ALLOCATION_INDIVIDUAL = 1, // Individual memory allocations with malloc (host only)
+                        ALLOCATION_GLOBAL = 2 };   // Allocate memory blocks from large preallocated memory range with internal allocator (host and GPU)

  GPUMemoryResource(GPUProcessor* proc, void* (GPUProcessor::*setPtr)(void*), MemoryType type, const char* name = "") : mProcessor(proc), mPtr(nullptr), mPtrDevice(nullptr), mSetPointers(setPtr), mName(name), mSize(0), mOverrideSize(0), mReuse(-1), mType(type)
  {
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index c79c743e96ce5..ab2210e5dd555 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -538,6 +538,10 @@ size_t GPUReconstruction::AllocateRegisteredPermanentMemory()
   if (GetProcessingSettings().debugLevel >= 5) {
     GPUInfo("Allocating Permanent Memory");
   }
+  if (mVolatileMemoryStart) {
+    GPUError("Must not allocate permanent memory while volatile chunks are allocated");
+    throw std::bad_alloc();
+  }
   int32_t total = 0;
   for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
     if ((mMemoryResources[i].mType & GPUMemoryResource::MEMORY_PERMANENT) && mMemoryResources[i].mPtr == nullptr) {
@@ -669,6 +673,10 @@ void GPUReconstruction::AllocateRegisteredMemoryInternal(GPUMemoryResource* res,
       GPUError("Device Processor not set (%s)", res->mName);
       throw std::bad_alloc();
     }
+    if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && !(res->mType & GPUMemoryResource::MEMORY_STACK)) {
+      GPUError("Must not allocate non-stacked device memory while volatile chunks are allocated");
+      throw std::bad_alloc();
+    }
     size_t size = AllocateRegisteredMemoryHelper(res, res->mPtrDevice, recPool->mDeviceMemoryPool, recPool->mDeviceMemoryBase, recPool->mDeviceMemorySize, &GPUMemoryResource::SetDevicePointers, recPool->mDeviceMemoryPoolEnd, " gpu");

     if (!(res->mType & GPUMemoryResource::MEMORY_HOST) || (res->mType & GPUMemoryResource::MEMORY_EXTERNAL)) {
@@ -702,7 +710,7 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro
   return res->mReuse >= 0 ? 0 : res->mSize;
 }

-void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
+void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type)
 {
   if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) {
     throw std::runtime_error("Requested invalid memory typo for unmanaged allocation");
@@ -711,6 +719,10 @@ void* GPUReconstruction::AllocateUnmanagedMemory(size_t size, int32_t type)
     mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]);
     return GPUProcessor::alignPointer(mUnmanagedChunks.back().get());
   } else {
+    if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) {
+      GPUError("Must not allocate direct memory while volatile chunks are allocated");
+      throw std::bad_alloc();
+    }
     void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool;
     void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd;
     char* retVal;
@@ -745,7 +757,6 @@ void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size)
   if (GetProcessingSettings().allocDebugLevel >= 2) {
     std::cout << "Allocated (volatile GPU): " << size << " - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
   }
-
   return retVal;
 }

@@ -758,6 +769,30 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device)
   return GPUProcessor::alignPointer(mVolatileChunks.back().get());
 }

+void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile()
+{
+  mDeviceMemoryAsVolatile = true;
+  AllocateVolatileDeviceMemory(0);
+}
+
+void GPUReconstruction::ReturnVolatileDeviceMemory()
+{
+  mDeviceMemoryAsVolatile = false;
+  if (mVolatileMemoryStart) {
+    mDeviceMemoryPool = mVolatileMemoryStart;
+    mVolatileMemoryStart = nullptr;
+  }
+  if (GetProcessingSettings().allocDebugLevel >= 2) {
+    std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
+  }
+}
+
+void GPUReconstruction::ReturnVolatileMemory()
+{
+  ReturnVolatileDeviceMemory();
+  mVolatileChunks.clear();
+}
+
 void GPUReconstruction::ResetRegisteredMemoryPointers(GPUProcessor* proc)
 {
   for (uint32_t i = 0; i < mMemoryResources.size(); i++) {
@@ -814,23 +849,6 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res)
   res->mPtrDevice = nullptr;
 }

-void GPUReconstruction::ReturnVolatileDeviceMemory()
-{
-  if (mVolatileMemoryStart) {
-    mDeviceMemoryPool = mVolatileMemoryStart;
-    mVolatileMemoryStart = nullptr;
-  }
-  if (GetProcessingSettings().allocDebugLevel >= 2) {
-    std::cout << "Freed (volatile GPU) - available: " << ptrDiff(mDeviceMemoryPoolEnd, mDeviceMemoryPool) << "\n";
-  }
-}
-
-void GPUReconstruction::ReturnVolatileMemory()
-{
-  ReturnVolatileDeviceMemory();
-  mVolatileChunks.clear();
-}
-
 void GPUReconstruction::PushNonPersistentMemory(uint64_t tag)
 {
   mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag);
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index b6256f7f8ad82..396a007761fb7 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -166,9 +166,10 @@ class GPUReconstruction
   size_t AllocateRegisteredMemory(int16_t res, GPUOutputControl* control = nullptr);
   void AllocateRegisteredForeignMemory(int16_t res, GPUReconstruction* rec, GPUOutputControl* control = nullptr);
-  void* AllocateUnmanagedMemory(size_t size, int32_t type);
+  void* AllocateDirectMemory(size_t size, int32_t type);
   void* AllocateVolatileDeviceMemory(size_t size);
   void* AllocateVolatileMemory(size_t size, bool device);
+  void MakeFutureDeviceMemoryAllocationsVolatile();
   void FreeRegisteredMemory(GPUProcessor* proc, bool freeCustom = false, bool freePermanent = false);
   void FreeRegisteredMemory(int16_t res);
   void ClearAllocatedMemory(bool clearOutputs = true);

@@ -326,14 +327,15 @@ class GPUReconstruction
   void* mHostMemoryPoolBlocked = nullptr;     // Ptr to end of pool
   size_t mHostMemorySize = 0;                 // Size of host memory buffer
   size_t mHostMemoryUsedMax = 0;              // Maximum host memory size used over time
-  void* mDeviceMemoryBase = nullptr;          //
-  void* mDeviceMemoryPermanent = nullptr;     //
-  void* mDeviceMemoryPool = nullptr;          //
-  void* mDeviceMemoryPoolEnd = nullptr;       //
-  void* mDeviceMemoryPoolBlocked = nullptr;   //
-  size_t mDeviceMemorySize = 0;               //
+  void* mDeviceMemoryBase = nullptr;          // Same for device ...
+  void* mDeviceMemoryPermanent = nullptr;     // ...
+  void* mDeviceMemoryPool = nullptr;          // ...
+  void* mDeviceMemoryPoolEnd = nullptr;       // ...
+  void* mDeviceMemoryPoolBlocked = nullptr;   // ...
+  size_t mDeviceMemorySize = 0;               // ...
+  size_t mDeviceMemoryUsedMax = 0;            // ...
   void* mVolatileMemoryStart = nullptr;       // Ptr to beginning of temporary volatile memory allocation, nullptr if uninitialized
-  size_t mDeviceMemoryUsedMax = 0;            //
+  bool mDeviceMemoryAsVolatile = false;       // If set, future device memory allocations are treated as volatile

   std::unordered_set<const void*> mRegisteredMemoryPtrs; // List of pointers registered for GPU

diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx
index eeead79b1840b..5d36dc63ca85d 100644
--- a/GPU/GPUTracking/Global/GPUChainITS.cxx
+++ b/GPU/GPUTracking/Global/GPUChainITS.cxx
@@ -28,7 +28,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
 public:
  void* allocate(size_t size) override
  {
-   return mFWReco->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU);
+   return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU);
  }

  void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
@@ -86,7 +86,7 @@ o2::its::TimeFrame* GPUChainITS::GetITSTimeframe()
 }
 #if !defined(GPUCA_STANDALONE)
   if (mITSTimeFrame->mIsGPU) {
-    auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateUnmanagedMemory(size, GPUMemoryResource::MEMORY_GPU); };
+    auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); };

     mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator);
     mFrameworkAllocator->setReconstructionFramework(rec());
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index fc07a91004c5f..24c74a661f18e 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -43,7 +43,7 @@ int32_t GPUChainTracking::RunTPCCompression()
   }

   if (gatherMode == 3) {
-    mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
+    mRec->MakeFutureDeviceMemoryAllocationsVolatile();
   }
   SetupGPUProcessor(&Compressor, true);
   new (Compressor.mMemory) GPUTPCCompression::memory;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 5d05cd6a97776..53bdfbadd4b25 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -34,7 +34,7 @@ static inline uint32_t RGB(uint8_t r, uint8_t g, uint8_t b) { return (uint32_t)r
 int32_t GPUChainTracking::PrepareProfile()
 {
 #ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
-  char* tmpMem = (char*)mRec->AllocateUnmanagedMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
+  char* tmpMem = (char*)mRec->AllocateDirectMemory(PROFILE_MAX_SIZE, GPUMemoryResource::MEMORY_GPU);
   processorsShadow()->tpcTrackers[0].mStageAtSync = tmpMem;
   runKernel<GPUMemClean16>({{BlockCount(), ThreadCount(), -1}}, tmpMem, PROFILE_MAX_SIZE);
 #endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
index 163f08634ef86..84835a1695071 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
@@ -297,7 +297,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     SynchronizeEventAndRelease(mEvents->single, doGPU);

   if (GetProcessingSettings().clearO2OutputFromGPU) {
-    mRec->AllocateVolatileDeviceMemory(0); // make future device memory allocation volatile
+    mRec->MakeFutureDeviceMemoryAllocationsVolatile();
   }
   AllocateRegisteredMemory(Merger.MemoryResOutputO2(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2)]);
   AllocateRegisteredMemory(Merger.MemoryResOutputO2Clus(), mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::tpcTracksO2ClusRefs)]);
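
Usage sketch of the allocation discipline this patch enforces. The helper name exampleVolatileUsage and the two size parameters are hypothetical, not part of the patch; the call sequence mirrors the gatherMode == 3 path in GPUChainTrackingCompression.cxx and the clearO2OutputFromGPU path in GPUChainTrackingMerger.cxx above, and it assumes a GPU-enabled GPUReconstruction instance (on a host-only instance, AllocateDirectMemory with MEMORY_GPU throws std::runtime_error instead).

#include "GPUReconstruction.h"  // assumed include path for o2::gpu::GPUReconstruction
#include "GPUMemoryResource.h"  // for GPUMemoryResource::MEMORY_GPU

using namespace o2::gpu;

// Hypothetical helper demonstrating the guarded allocation pattern.
void exampleVolatileUsage(GPUReconstruction* rec, size_t tmpSize, size_t outSize)
{
  // Carve a volatile chunk from the end of the device memory pool;
  // this marks mVolatileMemoryStart internally.
  void* tmp = rec->AllocateVolatileDeviceMemory(tmpSize);
  (void)tmp;

  // With the new protections, a non-stacked device allocation at this point
  // would print a GPUError and throw std::bad_alloc:
  // rec->AllocateDirectMemory(outSize, GPUMemoryResource::MEMORY_GPU);

  // Opting in sets mDeviceMemoryAsVolatile, so subsequent device allocations
  // are permitted and are released together with the volatile chunks:
  rec->MakeFutureDeviceMemoryAllocationsVolatile();
  void* out = rec->AllocateDirectMemory(outSize, GPUMemoryResource::MEMORY_GPU);
  (void)out;

  // Rewind the device pool to mVolatileMemoryStart and leave volatile mode.
  rec->ReturnVolatileDeviceMemory();
}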