diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index ab2210e5dd555..ad2ee2e840d00 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -712,31 +712,43 @@ size_t GPUReconstruction::AllocateRegisteredMemory(int16_t ires, GPUOutputContro void* GPUReconstruction::AllocateDirectMemory(size_t size, int32_t type) { - if (type != GPUMemoryResource::MEMORY_HOST && (!IsGPU() || type != GPUMemoryResource::MEMORY_GPU)) { - throw std::runtime_error("Requested invalid memory typo for unmanaged allocation"); - } if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { - mUnmanagedChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]); - return GPUProcessor::alignPointer(mUnmanagedChunks.back().get()); - } else { - if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) { - GPUError("Must not allocate direct memory while volatile chunks are allocated"); - throw std::bad_alloc(); - } - void*& pool = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPool : mHostMemoryPool; - void*& poolend = type == GPUMemoryResource::MEMORY_GPU ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd; - char* retVal; - GPUProcessor::computePointerWithAlignment(pool, retVal, size); - if (pool > poolend) { - GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend)); - throw std::bad_alloc(); - } - UpdateMaxMemoryUsed(); - if (GetProcessingSettings().allocDebugLevel >= 2) { - std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? 
"gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n"; + char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]; + if ((type & GPUMemoryResource::MEMORY_STACK)) { + mNonPersistentIndividualDirectAllocations.emplace_back(retVal, alignedDeleter()); + } else { + mDirectMemoryChunks.emplace_back(retVal, alignedDeleter()); } return retVal; } + + if ((type & ~(GPUMemoryResource::MEMORY_HOST | GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK)) || ((type & GPUMemoryResource::MEMORY_HOST) && (type & GPUMemoryResource::MEMORY_GPU))) { + throw std::runtime_error("Requested invalid memory typo for direct allocation"); + } + if (mVolatileMemoryStart && !mDeviceMemoryAsVolatile && (type & GPUMemoryResource::MEMORY_GPU) && !(type & GPUMemoryResource::MEMORY_STACK)) { + GPUError("Must not allocate direct memory while volatile chunks are allocated"); + throw std::bad_alloc(); + } + + void*& pool = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPool : mHostMemoryPool; + void*& poolend = (type & GPUMemoryResource::MEMORY_GPU) ? mDeviceMemoryPoolEnd : mHostMemoryPoolEnd; + char* retVal; + if ((type & GPUMemoryResource::MEMORY_STACK)) { + poolend = (char*)poolend - size; + poolend = (char*)poolend - GPUProcessor::getAlignmentMod(poolend); + retVal = (char*)poolend; + } else { + GPUProcessor::computePointerWithAlignment(pool, retVal, size); + } + if (pool > poolend) { + GPUError("Insufficient unmanaged memory: missing %ld bytes", ptrDiff(pool, poolend)); + throw std::bad_alloc(); + } + UpdateMaxMemoryUsed(); + if (GetProcessingSettings().allocDebugLevel >= 2) { + std::cout << "Allocated (unmanaged " << (type == GPUMemoryResource::MEMORY_GPU ? 
"gpu" : "host") << "): " << size << " - available: " << ptrDiff(poolend, pool) << "\n"; + } + return retVal; } void* GPUReconstruction::AllocateVolatileDeviceMemory(size_t size) @@ -765,8 +777,9 @@ void* GPUReconstruction::AllocateVolatileMemory(size_t size, bool device) if (device) { return AllocateVolatileDeviceMemory(size); } - mVolatileChunks.emplace_back(new char[size + GPUCA_BUFFER_ALIGNMENT]); - return GPUProcessor::alignPointer(mVolatileChunks.back().get()); + char* retVal = new (std::align_val_t(GPUCA_BUFFER_ALIGNMENT)) char[size]; + mVolatileChunks.emplace_back(retVal, alignedDeleter()); + return retVal; } void GPUReconstruction::MakeFutureDeviceMemoryAllocationsVolatile() @@ -851,7 +864,7 @@ void GPUReconstruction::FreeRegisteredMemory(GPUMemoryResource* res) void GPUReconstruction::PushNonPersistentMemory(uint64_t tag) { - mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), tag); + mNonPersistentMemoryStack.emplace_back(mHostMemoryPoolEnd, mDeviceMemoryPoolEnd, mNonPersistentIndividualAllocations.size(), mNonPersistentIndividualDirectAllocations.size(), tag); } void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) @@ -862,11 +875,11 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) if (mNonPersistentMemoryStack.size() == 0) { GPUFatal("Trying to pop memory state from empty stack"); } - if (tag != 0 && std::get<3>(mNonPersistentMemoryStack.back()) != tag) { - GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str()); + if (tag != 0 && std::get<4>(mNonPersistentMemoryStack.back()) != tag) { + GPUFatal("Tag mismatch when popping non persistent memory from stack : pop %s vs on stack %s", qTag2Str(tag).c_str(), qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str()); } if 
((GetProcessingSettings().debugLevel >= 3 || GetProcessingSettings().allocDebugLevel) && (IsGPU() || GetProcessingSettings().forceHostMemoryPoolSize)) { - printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<3>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size()); + printf("Allocated memory after %30s (%8s) (Stack %zu): ", GPUDataTypes::RECO_STEP_NAMES[getRecoStepNum(step, true)], qTag2Str(std::get<4>(mNonPersistentMemoryStack.back())).c_str(), mNonPersistentMemoryStack.size()); PrintMemoryOverview(); printf("%76s", ""); PrintMemoryMax(); @@ -882,6 +895,7 @@ void GPUReconstruction::PopNonPersistentMemory(RecoStep step, uint64_t tag) res->mPtrDevice = nullptr; } mNonPersistentIndividualAllocations.resize(std::get<2>(mNonPersistentMemoryStack.back())); + mNonPersistentIndividualDirectAllocations.resize(std::get<3>(mNonPersistentMemoryStack.back())); mNonPersistentMemoryStack.pop_back(); } @@ -917,9 +931,11 @@ void GPUReconstruction::ClearAllocatedMemory(bool clearOutputs) FreeRegisteredMemory(i); } } - mUnmanagedChunks.clear(); mNonPersistentMemoryStack.clear(); mNonPersistentIndividualAllocations.clear(); + mDirectMemoryChunks.clear(); + mNonPersistentIndividualDirectAllocations.clear(); + mVolatileChunks.clear(); mVolatileMemoryStart = nullptr; if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_GLOBAL) { mHostMemoryPool = GPUProcessor::alignPointer(mHostMemoryPermanent); diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 396a007761fb7..f5b39cb370b9e 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -69,8 +69,6 @@ class GPUReconstruction class LibraryLoader; // These must be the first members to ensure correct destructor order! 
std::shared_ptr mMyLib = nullptr; std::vector mMemoryResources; - std::vector> mUnmanagedChunks; - std::vector> mVolatileChunks; std::vector> mChains; public: @@ -373,9 +371,15 @@ class GPUReconstruction GPUProcessor* proc = nullptr; std::vector res; }; + struct alignedDeleter { + void operator()(void* ptr) { ::operator delete(ptr, std::align_val_t(GPUCA_BUFFER_ALIGNMENT)); }; + }; std::unordered_map mMemoryReuse1to1; - std::vector> mNonPersistentMemoryStack; + std::vector> mNonPersistentMemoryStack; // hostPoolAddress, devicePoolAddress, individualAllocationCount, directIndividualAllocationCount, tag std::vector mNonPersistentIndividualAllocations; + std::vector> mNonPersistentIndividualDirectAllocations; + std::vector> mDirectMemoryChunks; + std::vector> mVolatileChunks; std::unique_ptr mPipelineContext; diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index a17c58ad1ba03..0859223187f00 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -125,10 +125,10 @@ find_package(O2GPU REQUIRED) if(GPUCA_CONFIG_ONNX) find_package(onnxruntime REQUIRED) - if(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD) - set(ORT_CUDA_BUILD ON) - elseif(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD) + if(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD) + set(ORT_ROCM_BUILD ON) + elseif(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD) + set(ORT_CUDA_BUILD ON) endif() else() set(onnxruntime_FOUND OFF) diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 0c5313c16af68..33925e8cf1341 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -104,7 +104,9 @@ endif() # ---------------------------------- CUDA ---------------------------------- if(ENABLE_CUDA) if(CUDA_COMPUTETARGET) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE) + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET}) + else() + set(CMAKE_CUDA_ARCHITECTURES 61-virtual) 
endif() set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) @@ -121,11 +123,6 @@ if(ENABLE_CUDA) message(STATUS "Using as CUDA GCC version: ${GPUCA_CUDA_GCCBIN}") set(CMAKE_CUDA_HOST_COMPILER "${GPUCA_CUDA_GCCBIN}") endif() - if(CUDA_COMPUTETARGET) - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_COMPUTETARGET} CACHE STRING "" FORCE) - else() - set(CMAKE_CUDA_ARCHITECTURES 61-virtual CACHE STRING "" FORCE) - endif() enable_language(CUDA) get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) if (ENABLE_CUDA STREQUAL "AUTO") @@ -231,7 +228,8 @@ endif() # ---------------------------------- HIP ---------------------------------- if(ENABLE_HIP) if(HIP_AMDGPUTARGET) - set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "" FORCE) + set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}") + set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}") endif() if(NOT "$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" AND NOT CMAKE_PREFIX_PATH MATCHES "rocm" AND EXISTS "/opt/rocm/lib/cmake/") list(APPEND CMAKE_PREFIX_PATH "/opt/rocm/lib/cmake") @@ -239,11 +237,6 @@ if(ENABLE_HIP) if("$ENV{CMAKE_PREFIX_PATH}" MATCHES "rocm" OR CMAKE_PREFIX_PATH MATCHES "rocm") set(CMAKE_HIP_STANDARD ${CMAKE_CXX_STANDARD}) set(CMAKE_HIP_STANDARD_REQUIRED TRUE) - if(HIP_AMDGPUTARGET) - set(AMDGPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE) - set(GPU_TARGETS "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE) - set(CMAKE_HIP_ARCHITECTURES "${HIP_AMDGPUTARGET}" CACHE STRING "AMD GPU targets to compile for" FORCE) - endif() set(TMP_ROCM_DIR_LIST "${CMAKE_PREFIX_PATH}:$ENV{CMAKE_PREFIX_PATH}") string(REPLACE ":" ";" TMP_ROCM_DIR_LIST "${TMP_ROCM_DIR_LIST}") list(FILTER TMP_ROCM_DIR_LIST INCLUDE REGEX rocm)