From 86986f63800c79a1ba216979e576c644894336f0 Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Wed, 12 Nov 2025 18:59:49 +0100
Subject: [PATCH] ITS: GPU: free artefacts from memory after iteration

Signed-off-by: Felix Schlepper
---
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |   6 +-
 .../ITS/tracking/GPU/ITStrackingGPU/Utils.h   |   2 +-
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 129 ++++++++++--------
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |  17 ++-
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  |  17 ---
 .../include/ITStracking/ExternalAllocator.h   |  28 +++-
 .../tracking/include/ITStracking/TimeFrame.h  |  26 ++--
 .../ITSMFT/ITS/tracking/src/TimeFrame.cxx     |  34 +++--
 GPU/GPUTracking/Global/GPUChainITS.cxx        |  24 ++--
 GPU/GPUTracking/Global/GPUChainITS.h          |   3 +-
 10 files changed, 158 insertions(+), 128 deletions(-)
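The mechanism this patch relies on, in isolation: buffers that are only needed within one tracking iteration are allocated with the MEMORY_STACK flag, a tag is pushed before the iteration allocates, and popping that tag releases everything allocated since the push. The toy model below illustrates that push/allocate/pop lifecycle; every name in it is illustrative, and none of this is the GPUReconstruction API, whose actual bookkeeping is more involved.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Toy model of a tag-delimited allocation stack: pushTag() marks a point,
    // popTag() frees everything allocated after the matching mark (LIFO).
    class StackModel
    {
     public:
      void pushTag(uint64_t tag) { mMarks.push_back({tag, mAllocs.size()}); }

      void* allocate(size_t bytes)
      {
        mAllocs.push_back(::operator new(bytes));
        return mAllocs.back();
      }

      void popTag(uint64_t tag)
      {
        while (!mMarks.empty()) {
          const auto mark = mMarks.back();
          mMarks.pop_back();
          while (mAllocs.size() > mark.firstAlloc) { // free back to the mark
            ::operator delete(mAllocs.back());
            mAllocs.pop_back();
          }
          if (mark.tag == tag) {
            break; // reached the matching push
          }
        }
      }

     private:
      struct Mark {
        uint64_t tag;
        size_t firstAlloc;
      };
      std::vector<Mark> mMarks;
      std::vector<void*> mAllocs;
    };

    int main()
    {
      StackModel pool;
      for (uint64_t iter = 0; iter < 3; ++iter) {
        pool.pushTag(iter);                     // cf. TimeFrameGPU::pushMemoryStack(iteration)
        void* scratch = pool.allocate(1 << 20); // per-iteration artefacts
        std::printf("iteration %llu scratch at %p\n", (unsigned long long)iter, scratch);
        pool.popTag(iter);                      // cf. TimeFrameGPU::popMemoryStack(iteration)
      }
    }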
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index 2bd1550e7c72b..8095d690bbcc8 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -35,6 +35,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
   ~TimeFrameGPU() = default;
 
   /// Most relevant operations
+  void pushMemoryStack(const int);
+  void popMemoryStack(const int);
   void registerHostMemory(const int);
   void unregisterHostMemory(const int);
   void initialise(const int, const TrackingParameters&, const int, IndexTableUtilsN* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
@@ -177,8 +179,8 @@ class TimeFrameGPU final : public TimeFrame<nLayers>
   int getNumberOfNeighbours() const final;
 
  private:
-  void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream
-  void allocMem(void**, size_t, bool);               // Abstract owned and unowned memory allocations on default stream
+  void allocMemAsync(void**, size_t, Stream&, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU); // Abstract owned and unowned memory allocations on specific stream
+  void allocMem(void**, size_t, bool, int32_t = o2::gpu::GPUMemoryResource::MEMORY_GPU);               // Abstract owned and unowned memory allocations on default stream
   TimeFrameGPUParameters mGpuParams;
 
   // Host-available device buffer sizes
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index e5b9253ca4090..ee0a203f32fda 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -313,7 +313,7 @@ struct TypedAllocator {
   pointer allocate(size_type n)
   {
-    void* raw = mInternalAllocator->allocate(n * sizeof(T));
+    void* raw = mInternalAllocator->allocateStack(n * sizeof(T));
     return thrust::device_pointer_cast(static_cast<T*>(raw));
   }
 
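The new trailing int32_t parameter on allocMem/allocMemAsync is a combination of GPUMemoryResource flags, and allocateStack simply requests MEMORY_GPU | MEMORY_STACK so thrust scratch buffers become discardable. A minimal sketch of the flag arithmetic, with made-up values since the real enum lives in o2::gpu::GPUMemoryResource:

    #include <cstdint>
    #include <cstdio>

    // Illustrative flag values only; GPUMemoryResource defines the real ones.
    enum MemoryType : int32_t {
      MEMORY_HOST = 1 << 0,
      MEMORY_GPU = 1 << 1,
      MEMORY_STACK = 1 << 2,
    };

    int main()
    {
      const int32_t persistent = MEMORY_GPU;                  // default for allocMem/allocMemAsync
      const int32_t perIteration = MEMORY_GPU | MEMORY_STACK; // what allocateStack requests
      std::printf("persistent is stack? %d\n", (persistent & MEMORY_STACK) != 0);
      std::printf("per-iteration is stack? %d\n", (perIteration & MEMORY_STACK) != 0);
    }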
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 27bcf04746da5..6532165d9628a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -11,6 +11,7 @@
 ///
 
 #include
+#include
 #include
 #include
 
@@ -24,15 +25,16 @@
 #include "GPUCommonMath.h"
 #include "GPUCommonLogger.h"
 #include "GPUCommonHelpers.h"
+#include "utils/strtag.h"
 
 namespace o2::its::gpu
 {
 template <int nLayers>
-void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator)
+void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator, int32_t type)
 {
   if (extAllocator) {
-    *ptr = this->mExtDeviceAllocator->allocate(size);
+    *ptr = (this->mExternalAllocator)->allocate(size, type);
   } else {
     GPULog("Calling default CUDA allocator");
     GPUChkErrS(cudaMallocAsync(reinterpret_cast<void**>(ptr), size, stream.get()));
@@ -40,10 +42,10 @@ void TimeFrameGPU<nLayers>::allocMemAsync(void** ptr, size_t size, Stream& strea
 }
 
 template <int nLayers>
-void TimeFrameGPU<nLayers>::allocMem(void** ptr, size_t size, bool extAllocator)
+void TimeFrameGPU<nLayers>::allocMem(void** ptr, size_t size, bool extAllocator, int32_t type)
 {
   if (extAllocator) {
-    *ptr = this->mExtDeviceAllocator->allocate(size);
+    *ptr = (this->mExternalAllocator)->allocate(size, type);
   } else {
     GPULog("Calling default CUDA allocator");
     GPUChkErrS(cudaMalloc(reinterpret_cast<void**>(ptr), size));
@@ -56,7 +58,7 @@ void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
   GPUTimer timer("loading indextable utils");
   if (!iteration) {
     GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtilsN) / constants::MB);
-    allocMem(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtilsN), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtilsN), this->hasFrameworkAllocator());
   }
   GPULog("gpu-transfer: loading IndexTableUtils object, for {:.2f} MB.", sizeof(IndexTableUtilsN) / constants::MB);
   GPUChkErrS(cudaMemcpy(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtilsN), cudaMemcpyHostToDevice));
@@ -67,10 +69,10 @@ void TimeFrameGPU<nLayers>::createUnsortedClustersDeviceArray(const int iteratio
 {
   if (!iteration) {
     GPUTimer timer("creating unsorted clusters array");
-    allocMem(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     mPinnedUnsortedClusters.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
         mPinnedUnsortedClusters.set(iLayer);
@@ -85,7 +87,7 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration, cons
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "loading unsorted clusters", layer);
     GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[layer].size(), layer, this->mUnsortedClusters[layer].size() * sizeof(Cluster) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[layer], this->mUnsortedClusters[layer].data(), this->mUnsortedClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mUnsortedClustersDeviceArray[layer], &mUnsortedClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -96,10 +98,10 @@ void TimeFrameGPU<nLayers>::createClustersDeviceArray(const int iteration, const
 {
   if (!iteration) {
     GPUTimer timer("creating sorted clusters array");
-    allocMem(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     mPinnedClusters.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable));
         mPinnedClusters.set(iLayer);
@@ -114,7 +116,7 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration, const int la
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer);
     GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[layer].size(), layer, this->mClusters[layer].size() * sizeof(Cluster) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(mClustersDevice[layer], this->mClusters[layer].data(), this->mClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mClustersDeviceArray[layer], &mClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -125,10 +127,10 @@ void TimeFrameGPU<nLayers>::createClustersIndexTablesArray(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer("creating clustersindextable array");
-    allocMem(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable));
     mPinnedClustersIndexTables.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
         mPinnedClustersIndexTables.set(iLayer);
@@ -143,7 +145,7 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration, const i
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer);
     GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", layer, this->mIndexTables[layer].size(), this->mIndexTables[layer].size() * sizeof(int) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[layer], this->mIndexTables[layer].data(), this->mIndexTables[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mClustersIndexTablesDeviceArray[layer], &mClustersIndexTablesDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -154,10 +156,10 @@ void TimeFrameGPU<nLayers>::createUsedClustersDeviceArray(const int iteration, c
 {
   if (!iteration) {
     GPUTimer timer("creating used clusters flags");
-    allocMem(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(uint8_t*), cudaHostRegisterPortable));
     mPinnedUsedClusters.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(uint8_t), cudaHostRegisterPortable));
         mPinnedUsedClusters.set(iLayer);
@@ -172,7 +174,7 @@ void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration, const
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "creating used clusters flags", layer);
     GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[layer].size(), layer, this->mUsedClusters[layer].size() * sizeof(unsigned char) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[layer], 0, this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mUsedClustersDeviceArray[layer], &mUsedClustersDevice[layer], sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -193,10 +195,10 @@ void TimeFrameGPU<nLayers>::createROFrameClustersDeviceArray(const int iteration
 {
   if (!iteration) {
     GPUTimer timer("creating ROFrame clusters array");
-    allocMem(reinterpret_cast<void**>(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable));
     mPinnedROFramesClusters.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
         mPinnedROFramesClusters.set(iLayer);
@@ -211,7 +213,7 @@ void TimeFrameGPU<nLayers>::loadROFrameClustersDevice(const int iteration, const
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "loading ROframe clusters", layer);
     GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[layer].size(), layer, this->mROFramesClusters[layer].size() * sizeof(int) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[layer], this->mROFramesClusters[layer].data(), this->mROFramesClusters[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mROFramesClustersDeviceArray[layer], &mROFramesClustersDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -222,10 +224,10 @@ void TimeFrameGPU<nLayers>::createTrackingFrameInfoDeviceArray(const int iterati
 {
   if (!iteration) {
     GPUTimer timer("creating trackingframeinfo array");
-    allocMem(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
     mPinnedTrackingFrameInfo.set(nLayers);
-    if (!this->hasExternalDeviceAllocator()) {
+    if (!this->hasFrameworkAllocator()) {
       for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
         GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable));
         mPinnedTrackingFrameInfo.set(iLayer);
@@ -240,7 +242,7 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration, con
   if (!iteration) {
     GPUTimer timer(mGpuStreams[layer], "loading trackingframeinfo", layer);
     GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[layer].size(), layer, this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[layer], this->mTrackingFrameInfo[layer].data(), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
     GPUChkErrS(cudaMemcpyAsync(&mTrackingFrameInfoDeviceArray[layer], &mTrackingFrameInfoDevice[layer], sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
@@ -253,7 +255,7 @@ void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
   GPUTimer timer("loading multiplicity cut mask");
   GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(uint8_t) / constants::MB);
   if (!iteration) { // only allocate on first call
-    allocMem(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->hasFrameworkAllocator());
   }
   GPUChkErrS(cudaMemcpy(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice));
 }
@@ -265,10 +267,10 @@ void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
   if (!iteration) {
     GPUTimer timer("loading seeding vertices");
     GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB);
-    allocMem(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice));
     GPULog("gpu-transfer: loading {} seeding vertices, for {:.2f} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / constants::MB);
-    allocMem(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice));
   }
 }
@@ -277,7 +279,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsLUTDeviceArray(const int iteration)
 {
   if (!iteration) {
-    allocMem(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->hasFrameworkAllocator());
   }
 }
 
@@ -288,7 +290,7 @@ void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration, const
   const int ncls = this->mClusters[layer].size() + 1;
   if (!iteration) {
     GPULog("gpu-allocation: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, layer, ncls * sizeof(int) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpyAsync(&mTrackletsLUTDeviceArray[layer], &mTrackletsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
   }
   GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[layer], 0, ncls * sizeof(int), mGpuStreams[layer].get()));
@@ -299,7 +301,7 @@ void TimeFrameGPU<nLayers>::createTrackletsBuffersArray(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer("creating tracklet buffers array");
-    allocMem(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->hasFrameworkAllocator());
   }
 }
@@ -311,7 +313,7 @@ void TimeFrameGPU<nLayers>::createTrackletsBuffers(const int layer)
   GPUChkErrS(cudaMemcpyAsync(&mNTracklets[layer], mTrackletsLUTDevice[layer] + this->mClusters[layer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
   mGpuStreams[layer].sync(); // ensure number of tracklets is correct
   GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
 }
 
@@ -343,7 +345,7 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice(const int layer)
 {
   GPUTimer timer(mGpuStreams[layer], "creating cells neighbours", layer);
   GPULog("gpu-transfer: reserving neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[layer] + 1, layer, (mNCells[layer] + 1) * sizeof(int) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[layer], 0, (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer].get()));
 }
 
@@ -352,7 +354,7 @@ void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const uns
 {
   GPUTimer timer(mGpuStreams[layer], "reserving neighboursLUT");
   GPULog("gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {:.2f} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator()); // We need one element more to move exc -> inc
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK)); // We need one element more to move exc -> inc
   GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[layer].get()));
 }
 
@@ -362,8 +364,8 @@ void TimeFrameGPU<nLayers>::loadCellsDevice()
   GPUTimer timer(mGpuStreams, "loading cell seeds", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     GPULog("gpu-transfer: loading {} cell seeds on layer {}, for {:.2f} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeedN) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeedN), mGpuStreams[iLayer], this->hasExternalDeviceAllocator());
-    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->hasExternalDeviceAllocator()); // accessory for the neigh. finding.
+    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeedN), mGpuStreams[iLayer], this->hasFrameworkAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->hasFrameworkAllocator()); // accessory for the neigh. finding.
     GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
     GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeedN), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
@@ -374,7 +376,7 @@ void TimeFrameGPU<nLayers>::createCellsLUTDeviceArray(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer("creating cells LUTs array");
-    allocMem(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->hasFrameworkAllocator());
   }
 }
 
@@ -383,7 +385,7 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice(const int layer)
 {
   GPUTimer timer(mGpuStreams[layer], "creating cells LUTs", layer);
   GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer] + 1, layer, (mNTracklets[layer] + 1) * sizeof(int) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[layer], 0, (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer].get()));
   GPUChkErrS(cudaMemcpyAsync(&mCellsLUTDeviceArray[layer], &mCellsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
 }
@@ -393,7 +395,7 @@ void TimeFrameGPU<nLayers>::createCellsBuffersArray(const int iteration)
 {
   if (!iteration) {
     GPUTimer timer("creating cells buffers array");
-    allocMem(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeedN*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeedN*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mCellsDeviceArray, mCellsDevice.data(), mCellsDevice.size() * sizeof(CellSeedN*), cudaMemcpyHostToDevice));
   }
 }
@@ -406,7 +408,7 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
   GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
   mGpuStreams[layer].sync(); // ensure number of cells is correct
   GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeedN) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeedN), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeedN*), cudaMemcpyHostToDevice, mGpuStreams[layer].get()));
 }
@@ -426,7 +428,7 @@ void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
   GPUTimer timer("loading roads device");
   GPULog("gpu-transfer: loading {} roads, for {:.2f} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->hasFrameworkAllocator());
   GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpy(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice));
 }
@@ -436,7 +438,7 @@ void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeedN>& seed
 {
   GPUTimer timer("loading track seeds");
   GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeedN) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeedN), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeedN), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeedN), cudaMemcpyHostToDevice));
 }
@@ -449,10 +451,10 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer)
   GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get()));
   mGpuStreams[layer].sync(); // ensure number of neighbours is correct
   GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>), mGpuStreams[layer].get()));
   GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair<int, int>) / constants::MB);
-  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->hasExternalDeviceAllocator());
+  allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
 }
 
@@ -461,7 +463,7 @@ void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeedN>& s
 {
   GPUTimer timer("reserving tracks");
   mTrackITSExt = bounded_vector<TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
   GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->hasFrameworkAllocator(), (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
   GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt)));
   GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
 }
@@ -474,13 +476,13 @@ void TimeFrameGPU<nLayers>::createVtxTrackletsLUTDevice(const int32_t iteration)
   for (int32_t iMode{0}; iMode < 2; ++iMode) {
     if (!iteration) {
       GPULog("gpu-transfer: creating vertexer tracklets per cluster for {} elements for mode {}, for {:.2f} MB.", ncls, iMode, ncls * sizeof(int32_t) / constants::MB);
-      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator());
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->hasFrameworkAllocator());
       GPULog("gpu-transfer: creating vertexer tracklets per cluster sum for {} elements for mode {}, for {:.2f} MB.", ncls + 1, iMode, (ncls + 1) * sizeof(int32_t) / constants::MB);
-      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator());
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasFrameworkAllocator());
       GPULog("gpu-transfer: creating vertexer tracklets per ROF for {} elements for mode {}, for {:.2f} MB.", this->mNrof + 1, iMode, (this->mNrof + 1) * sizeof(int32_t) / constants::MB);
-      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasExternalDeviceAllocator());
+      allocMemAsync(reinterpret_cast<void**>(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->hasFrameworkAllocator());
     }
     GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[iMode], 0, ncls * sizeof(int32_t), mGpuStreams[iMode].get()));
     GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterSumDevice[iMode], 0, (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode].get()));
@@ -489,13 +491,13 @@ void TimeFrameGPU<nLayers>::createVtxTrackletsLUTDevice(const int32_t iteration)
   mGpuStreams[0].sync();
   mGpuStreams[1].sync();
   if (!iteration) {
-    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterDeviceArray, mNTrackletsPerClusterDevice.data(), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
-    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterSumDeviceArray, mNTrackletsPerClusterSumDevice.data(), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
-    allocMem(reinterpret_cast<void**>(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->hasExternalDeviceAllocator());
+    allocMem(reinterpret_cast<void**>(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->hasFrameworkAllocator());
     GPUChkErrS(cudaMemcpy(mNTrackletsPerROFDeviceArray, mNTrackletsPerROFDevice.data(), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice));
   }
 }
@@ -508,11 +510,11 @@ void TimeFrameGPU<nLayers>::createVtxTrackletsBuffers(const int32_t iteration)
     this->mTotalTracklets[iMode] = 0;
     GPUChkErrS(cudaMemcpyAsync(&(this->mTotalTracklets[iMode]), mNTrackletsPerClusterSumDevice[iMode] + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost, mGpuStreams[iMode].get()));
     GPULog("gpu-transfer: creating vertexer tracklets buffer for {} elements on layer {}, for {:.2f} MB.", this->mTotalTracklets[iMode], iMode, this->mTotalTracklets[iMode] * sizeof(Tracklet) / constants::MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->hasExternalDeviceAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->hasFrameworkAllocator());
   }
   mGpuStreams[0].sync();
   mGpuStreams[1].sync();
-  allocMem(reinterpret_cast<void**>(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->hasFrameworkAllocator());
   GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpy(mTrackletsDeviceArray, mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaMemcpyHostToDevice));
 }
@@ -524,14 +526,14 @@ void TimeFrameGPU<nLayers>::createVtxLinesLUTDevice(const int32_t iteration)
   const int32_t ncls = this->mClusters[1].size();
 
   GPULog("gpu-transfer: creating vertexer lines per cluster for {} elements , for {:.2f} MB.", ncls, ncls * sizeof(int32_t) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->hasFrameworkAllocator());
 
   GPULog("gpu-transfer: creating vertexer lines per cluster sum for {} elements , for {:.2f} MB.", ncls + 1, (ncls + 1) * sizeof(int32_t) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->hasFrameworkAllocator());
 
   const int32_t ntrkls = this->mTotalTracklets[0];
   GPULog("gpu-transfer: creating vertexer used tracklets for {} elements , for {:.2f} MB.", ntrkls, ntrkls * sizeof(uint8_t) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->hasFrameworkAllocator());
 }
 
@@ -542,7 +544,7 @@ void TimeFrameGPU<nLayers>::createVtxLinesBuffer(const int32_t iteration)
   GPUChkErrS(cudaMemcpy(&nlines, mNLinesPerClusterDevice + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost));
   this->mTotalLines = nlines;
   GPULog("gpu-transfer: creating vertexer lines for {} elements , for {:.2f} MB.", nlines, nlines * sizeof(Line) / constants::MB);
-  allocMem(reinterpret_cast<void**>(&mLinesDevice), nlines * sizeof(Line), this->hasExternalDeviceAllocator());
+  allocMem(reinterpret_cast<void**>(&mLinesDevice), nlines * sizeof(Line), this->hasFrameworkAllocator());
   // reset used tracklets
   GPUChkErrS(cudaMemset(mUsedTrackletsDevice, 0, this->mTotalTracklets[0] * sizeof(uint8_t)));
 }
@@ -630,6 +632,23 @@ void TimeFrameGPU<nLayers>::unregisterHostMemory(const int maxLayers)
   checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice);
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::pushMemoryStack(const int iteration)
+{
+  // mark the beginning of the MEMORY_STACK allocations that can be discarded
+  // once this iteration is done
+  const auto name = fmt::format("ITSITER{}", iteration);
+  (this->mExternalAllocator)->pushTagOnStack(qStr2Tag(name.c_str()));
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::popMemoryStack(const int iteration)
+{
+  // pop all memory on the stack from this iteration
+  const auto name = fmt::format("ITSITER{}", iteration);
+  (this->mExternalAllocator)->popTagOffStack(qStr2Tag(name.c_str()));
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::initialise(const int iteration, const TrackingParameters& trkParam,
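pushMemoryStack and popMemoryStack derive the stack tag from the iteration number through qStr2Tag from utils/strtag.h. Assuming that helper behaves like the usual strtag packing, i.e. it stores up to eight characters of the string in a uint64_t, the tags come out as below; toTag is a hypothetical stand-in, the authoritative implementation is the header itself.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Assumed behaviour of a strtag-style helper: pack up to 8 chars of the
    // string into a uint64_t. The real helper lives in utils/strtag.h.
    static uint64_t toTag(const char* s)
    {
      uint64_t tag = 0;
      std::memcpy(&tag, s, std::min<size_t>(8, std::strlen(s)));
      return tag;
    }

    int main()
    {
      char name[16];
      for (int iteration = 0; iteration < 3; ++iteration) {
        std::snprintf(name, sizeof(name), "ITSITER%d", iteration); // as in pushMemoryStack
        std::printf("%s -> 0x%016llx\n", name, (unsigned long long)toTag(name));
      }
    }

One thing worth keeping in mind: with this kind of 8-byte packing, "ITSITER9" is the last name that fits whole, so iterations beyond nine would truncate to the same tag as a single-digit iteration. That is harmless for the handful of iterations ITS tracking runs, but it is a property of the tag scheme, not of the stack itself.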
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index cca4283c9b77f..05810f0074811 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -28,7 +28,6 @@ template <int nLayers>
 void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
 {
   mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers);
-  // on default stream
   mTimeFrameGPU->loadVertices(iteration);
   mTimeFrameGPU->loadIndexTableUtils(iteration);
 
@@ -45,6 +44,8 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
   mTimeFrameGPU->createTrackletsBuffersArray(iteration);
   mTimeFrameGPU->createCellsBuffersArray(iteration);
   mTimeFrameGPU->createCellsLUTDeviceArray(iteration);
+  // push every created artefact onto the stack
+  mTimeFrameGPU->pushMemoryStack(iteration);
 }
 
 template <int nLayers>
@@ -108,7 +109,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
       mTimeFrameGPU->getPositionResolutions(),
       this->mTrkParams[iteration].LayerRadii,
       mTimeFrameGPU->getMSangles(),
-      mTimeFrameGPU->getExternalDeviceAllocator(),
+      mTimeFrameGPU->getFrameworkAllocator(),
       conf.nBlocksLayerTracklets[iteration],
       conf.nThreadsLayerTracklets[iteration],
       mTimeFrameGPU->getStreams());
@@ -146,7 +147,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int i
       mTimeFrameGPU->getPositionResolutions(),
       this->mTrkParams[iteration].LayerRadii,
       mTimeFrameGPU->getMSangles(),
-      mTimeFrameGPU->getExternalDeviceAllocator(),
+      mTimeFrameGPU->getFrameworkAllocator(),
       conf.nBlocksLayerTracklets[iteration],
       conf.nThreadsLayerTracklets[iteration],
       mTimeFrameGPU->getStreams());
@@ -197,7 +198,7 @@ void TrackerTraitsGPU<nLayers>::computeLayerCells(const int iteration)
       this->mTrkParams[iteration].MaxChi2ClusterAttachment,
       this->mTrkParams[iteration].CellDeltaTanLambdaSigma,
       this->mTrkParams[iteration].NSigmaCut,
-      mTimeFrameGPU->getExternalDeviceAllocator(),
+      mTimeFrameGPU->getFrameworkAllocator(),
      conf.nBlocksLayerCells[iteration],
       conf.nThreadsLayerCells[iteration],
       mTimeFrameGPU->getStreams());
@@ -253,7 +254,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
         currentLayerCellsNum,
         nextLayerCellsNum,
         1e2,
-        mTimeFrameGPU->getExternalDeviceAllocator(),
+        mTimeFrameGPU->getFrameworkAllocator(),
         conf.nBlocksFindNeighbours[iteration],
         conf.nThreadsFindNeighbours[iteration],
         mTimeFrameGPU->getStream(iLayer));
@@ -281,7 +282,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighbours(const int iteration)
       mTimeFrameGPU->getDeviceNeighbours(iLayer),
       mTimeFrameGPU->getArrayNNeighbours()[iLayer],
       mTimeFrameGPU->getStream(iLayer),
-      mTimeFrameGPU->getExternalDeviceAllocator());
+      mTimeFrameGPU->getFrameworkAllocator());
   }
   mTimeFrameGPU->syncStreams(false);
 }
@@ -312,7 +313,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       this->mTrkParams[0].MaxChi2NDF,
       mTimeFrameGPU->getDevicePropagator(),
       this->mTrkParams[0].CorrType,
-      mTimeFrameGPU->getExternalDeviceAllocator(),
+      mTimeFrameGPU->getFrameworkAllocator(),
       conf.nBlocksProcessNeighbours[iteration],
       conf.nThreadsProcessNeighbours[iteration]);
   }
@@ -386,6 +387,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
     }
     mTimeFrameGPU->loadUsedClustersDevice();
   }
+  // wipe the artefact memory
+  mTimeFrameGPU->popMemoryStack(iteration);
 };
 
 template <int nLayers>
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 2c43ccd3bb81c..71f1281401e9d 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -644,23 +644,6 @@ GPUg() void __launch_bounds__(256, 1) processNeighboursKernel(
   }
 }
 
-GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
-{
-  if (alloc) {
-    *p = alloc->allocate(bytes);
-  } else {
-    GPUChkErrS(cudaMallocAsync(p, bytes, stream));
-  }
-}
-
-GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr)
-{
-  if (alloc) {
-    alloc->deallocate(reinterpret_cast<char*>(p), bytes);
-  } else {
-    GPUChkErrS(cudaFreeAsync(p, stream));
-  }
-}
 } // namespace gpu
 
 template
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h
index 36e78ef24020c..7d1e98736db2c 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h
@@ -17,15 +17,39 @@
 #define TRACKINGITSU_INCLUDE_EXTERNALALLOCATOR_H_
 
 #include
+#include "GPUO2ExternalUser.h"
+#include "Base/GPUMemoryResource.h"
 
 namespace o2::its
 {
 
 class ExternalAllocator
 {
+  using Type = std::underlying_type_t<o2::gpu::GPUMemoryResource::MemoryType>;
+
 public:
-  virtual void* allocate(size_t) = 0;
   virtual void deallocate(char*, size_t) = 0;
+  virtual void* allocate(size_t) = 0;
+  void* allocate(size_t s, Type type)
+  {
+    auto old = mType;
+    mType = type;
+    void* p = allocate(s);
+    mType = old;
+    return p;
+  }
+  void* allocateStack(size_t s)
+  {
+    return allocate(s, (o2::gpu::GPUMemoryResource::MEMORY_GPU | o2::gpu::GPUMemoryResource::MEMORY_STACK));
+  }
+  virtual void pushTagOnStack(uint64_t) = 0;
+  virtual void popTagOffStack(uint64_t) = 0;
+
+  void setType(Type t) noexcept { mType = t; }
+  Type getType() const noexcept { return mType; }
+
+ protected:
+  Type mType;
 };
 
 class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
@@ -36,7 +60,7 @@ class ExternalAllocatorAdaptor final : public std::pmr::memory_resource
 protected:
   void* do_allocate(size_t bytes, size_t alignment) override
   {
-    void* p = mAlloc->allocate(bytes);
+    void* p = mAlloc->allocate(bytes, o2::gpu::GPUMemoryResource::MemoryType::MEMORY_HOST);
     if (!p) {
       throw std::bad_alloc();
     }
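The non-virtual allocate(size_t, Type) overload saves mType, overrides it for the duration of the virtual allocate(size_t) call, and restores it afterwards, so a stateful subclass sees the requested memory type without every implementation having to accept a second parameter. A self-contained sketch of that contract with a toy subclass; Allocator and ToyAllocator are hypothetical stand-ins, not the O2 classes:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Standalone copy of the interface shape; the real base class is
    // o2::its::ExternalAllocator with Type from GPUMemoryResource::MemoryType.
    class Allocator
    {
      using Type = int32_t;

     public:
      virtual ~Allocator() = default;
      virtual void* allocate(size_t) = 0;
      void* allocate(size_t s, Type type)
      {
        auto old = mType; // save, override, call, restore
        mType = type;
        void* p = allocate(s);
        mType = old;
        return p;
      }

     protected:
      Type mType{0};
    };

    class ToyAllocator final : public Allocator
    {
     public:
      void* allocate(size_t s) override
      {
        std::printf("allocating %zu bytes with type %d\n", s, mType);
        return std::malloc(s);
      }
    };

    int main()
    {
      ToyAllocator impl;
      Allocator* a = &impl;          // calls go through the base, as in TimeFrameGPU
      void* p = a->allocate(64, 42); // the virtual overrider sees mType == 42
      std::free(p);
    }

Note that the save/override/restore on a member makes the two-argument overload non-reentrant: two threads allocating through the same allocator concurrently would race on mType.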
"ITStracking/BoundedAllocator.h" - #include "SimulationDataFormat/MCCompLabel.h" #include "SimulationDataFormat/MCTruthContainer.h" @@ -235,23 +234,14 @@ struct TimeFrame { void setBz(float bz) { mBz = bz; } float getBz() const { return mBz; } - /// State if memory will be externally managed. - // device - ExternalAllocator* mExtDeviceAllocator{nullptr}; - void setExternalDeviceAllocator(ExternalAllocator* allocator) { mExtDeviceAllocator = allocator; } - ExternalAllocator* getExternalDeviceAllocator() { return mExtDeviceAllocator; } - bool hasExternalDeviceAllocator() const noexcept { return mExtDeviceAllocator != nullptr; } - // host - ExternalAllocator* mExtHostAllocator{nullptr}; - void setExternalHostAllocator(ExternalAllocator* allocator) - { - mExtHostAllocator = allocator; - mExtMemoryPool = std::make_shared(mExtHostAllocator); - } - ExternalAllocator* getExternalHostAllocator() { return mExtHostAllocator; } - bool hasExternalHostAllocator() const noexcept { return mExtHostAllocator != nullptr; } - std::shared_ptr mExtMemoryPool; - std::pmr::memory_resource* getMaybeExternalHostResource(bool forceHost = false) { return (hasExternalHostAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); } + /// State if memory will be externally managed by the GPU framework + ExternalAllocator* mExternalAllocator{nullptr}; + std::shared_ptr mExtMemoryPool; // host memory pool managed by the framework + auto getFrameworkAllocator() { return mExternalAllocator; }; + void setFrameworkAllocator(ExternalAllocator* ext); + bool hasFrameworkAllocator() const noexcept { return mExternalAllocator != nullptr; } + std::pmr::memory_resource* getMaybeFrameworkHostResource(bool forceHost = false) { return (hasFrameworkAllocator() && !forceHost) ? mExtMemoryPool.get() : mMemoryPool.get(); } + // Propagator const o2::base::PropagatorImpl* getDevicePropagator() const { return mPropagatorDevice; } virtual void setDevicePropagator(const o2::base::PropagatorImpl*) {}; diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx index ca28ee227df56..0d8b461181741 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx @@ -177,9 +177,9 @@ template void TimeFrame::resetROFrameData(size_t nRofs) { for (int iLayer{0}; iLayer < nLayers; ++iLayer) { - deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource()); - deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource()); - clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource()); + deepVectorClear(mUnsortedClusters[iLayer], getMaybeFrameworkHostResource()); + deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeFrameworkHostResource()); + clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeFrameworkHostResource()); deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get()); if (iLayer < 2) { @@ -302,11 +302,11 @@ void TimeFrame::initialise(const int iteration, const TrackingParameter clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get()); deepVectorClear(mTrackletClusters); for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) { - clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers)); - clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers)); 
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
index ca28ee227df56..0d8b461181741 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TimeFrame.cxx
@@ -177,9 +177,9 @@ template <int nLayers>
 void TimeFrame<nLayers>::resetROFrameData(size_t nRofs)
 {
   for (int iLayer{0}; iLayer < nLayers; ++iLayer) {
-    deepVectorClear(mUnsortedClusters[iLayer], getMaybeExternalHostResource());
-    deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeExternalHostResource());
-    clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeExternalHostResource());
+    deepVectorClear(mUnsortedClusters[iLayer], getMaybeFrameworkHostResource());
+    deepVectorClear(mTrackingFrameInfo[iLayer], getMaybeFrameworkHostResource());
+    clearResizeBoundedVector(mROFramesClusters[iLayer], nRofs + 1, getMaybeFrameworkHostResource());
     deepVectorClear(mClusterExternalIndices[iLayer], mMemoryPool.get());
 
     if (iLayer < 2) {
@@ -302,11 +302,11 @@ void TimeFrame<nLayers>::initialise(const int iteration, const TrackingParameter
   clearResizeBoundedVector(mBogusClusters, trkParam.NLayers, mMemoryPool.get());
   deepVectorClear(mTrackletClusters);
   for (unsigned int iLayer{0}; iLayer < std::min((int)mClusters.size(), maxLayers); ++iLayer) {
-    clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
-    clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeExternalHostResource(maxLayers != nLayers));
+    clearResizeBoundedVector(mClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
+    clearResizeBoundedVector(mUsedClusters[iLayer], mUnsortedClusters[iLayer].size(), getMaybeFrameworkHostResource(maxLayers != nLayers));
     mPositionResolution[iLayer] = o2::gpu::CAMath::Sqrt(0.5f * (trkParam.SystErrorZ2[iLayer] + trkParam.SystErrorY2[iLayer]) + trkParam.LayerResolution[iLayer] * trkParam.LayerResolution[iLayer]);
   }
-  clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeExternalHostResource(maxLayers != nLayers));
+  clearResizeBoundedArray(mIndexTables, mNrof * (trkParam.ZBins * trkParam.PhiBins + 1), getMaybeFrameworkHostResource(maxLayers != nLayers));
   clearResizeBoundedVector(mLines, mNrof, mMemoryPool.get());
   clearResizeBoundedVector(mTrackletClusters, mNrof, mMemoryPool.get());
 
@@ -574,6 +574,7 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
       initVector(v, useExternal);
     }
   };
+  // these will only reside on the host for the CPU part
 
   initVector(mTotVertPerIteration);
   initContainers(mClusterExternalIndices);
@@ -603,12 +604,19 @@ void TimeFrame<nLayers>::setMemoryPool(std::shared_ptr<BoundedMemoryResource> po
   initVector(mRoadLabels);
   initContainers(mTracksLabel);
   // these will use possibly an externally provided allocator
-  initContainers(mClusters, hasExternalHostAllocator());
-  initContainers(mUsedClusters, hasExternalHostAllocator());
-  initContainers(mUnsortedClusters, hasExternalHostAllocator());
-  initContainers(mIndexTables, hasExternalHostAllocator());
-  initContainers(mTrackingFrameInfo, hasExternalHostAllocator());
-  initContainers(mROFramesClusters, hasExternalHostAllocator());
+  initContainers(mClusters, hasFrameworkAllocator());
+  initContainers(mUsedClusters, hasFrameworkAllocator());
+  initContainers(mUnsortedClusters, hasFrameworkAllocator());
+  initContainers(mIndexTables, hasFrameworkAllocator());
+  initContainers(mTrackingFrameInfo, hasFrameworkAllocator());
+  initContainers(mROFramesClusters, hasFrameworkAllocator());
+}
+
+template <int nLayers>
+void TimeFrame<nLayers>::setFrameworkAllocator(ExternalAllocator* ext)
+{
+  mExternalAllocator = ext;
+  mExtMemoryPool = std::make_shared<ExternalAllocatorAdaptor>(mExternalAllocator);
 }
 
 template <int nLayers>
@@ -639,7 +647,7 @@ void TimeFrame<nLayers>::wipe()
   deepVectorClear(mLines);
   // if we use the external host allocator then the assumption is that we
   // don't clear the memory ourself
-  if (!hasExternalHostAllocator()) {
+  if (!hasFrameworkAllocator()) {
     deepVectorClear(mClusters);
     deepVectorClear(mUsedClusters);
     deepVectorClear(mUnsortedClusters);
diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx
index a85cdb48c4d1c..9be553de27f95 100644
--- a/GPU/GPUTracking/Global/GPUChainITS.cxx
+++ b/GPU/GPUTracking/Global/GPUChainITS.cxx
@@ -25,18 +25,23 @@ namespace o2::its
 class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator
 {
  public:
-  GPUFrameworkExternalAllocator(GPUMemoryResource::MemoryType type) : mType(type) {}
-
-  void* allocate(size_t size) override
+  void* allocate(size_t size) final
   {
     return mFWReco->AllocateDirectMemory(size, mType);
   }
 
-  void deallocate(char* ptr, size_t size) override {}
+  void deallocate(char* ptr, size_t size) final {} // this is a simple no-op
+  void pushTagOnStack(uint64_t tag)
+  {
+    mFWReco->PushNonPersistentMemory(tag);
+  }
+  void popTagOffStack(uint64_t tag)
+  {
+    mFWReco->PopNonPersistentMemory(GPUDataTypes::RecoStep::ITSTracking, tag);
+  }
 
   void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; }
 
  private:
   o2::gpu::GPUReconstruction* mFWReco;
-  GPUMemoryResource::MemoryType mType;
 };
 } // namespace o2::its
@@ -73,12 +78,9 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe()
   }
 #if !defined(GPUCA_STANDALONE)
   if (mITSTimeFrame->isGPU()) {
-    mFrameworkDeviceAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_GPU));
-    mFrameworkDeviceAllocator->setReconstructionFramework(rec());
-    mITSTimeFrame->setExternalDeviceAllocator(mFrameworkDeviceAllocator.get());
-    mFrameworkHostAllocator.reset(new o2::its::GPUFrameworkExternalAllocator(GPUMemoryResource::MEMORY_HOST));
-    mFrameworkHostAllocator->setReconstructionFramework(rec());
-    mITSTimeFrame->setExternalHostAllocator(mFrameworkHostAllocator.get());
+    mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator());
+    mFrameworkAllocator->setReconstructionFramework(rec());
+    mITSTimeFrame->setFrameworkAllocator(mFrameworkAllocator.get());
   }
 #endif
   return mITSTimeFrame.get();
diff --git a/GPU/GPUTracking/Global/GPUChainITS.h b/GPU/GPUTracking/Global/GPUChainITS.h
index ab693bcef3f8b..a607f66322bab 100644
--- a/GPU/GPUTracking/Global/GPUChainITS.h
+++ b/GPU/GPUTracking/Global/GPUChainITS.h
@@ -53,8 +53,7 @@ class GPUChainITS final : public GPUChain
   std::unique_ptr<o2::its::TrackerTraits<7>> mITSTrackerTraits;
   std::unique_ptr<o2::its::VertexerTraits<7>> mITSVertexerTraits;
   std::unique_ptr<o2::its::TimeFrame<7>> mITSTimeFrame;
-  std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkDeviceAllocator;
-  std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkHostAllocator;
+  std::unique_ptr<o2::its::GPUFrameworkExternalAllocator> mFrameworkAllocator;
 };
 } // namespace o2::gpu
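For reference, the wiring in GetITSTimeframe reduces to three calls: create the single framework-backed allocator, point it at the reconstruction instance, hand it to the timeframe. A stub-typed sketch of that sequence; the Stub types below are hypothetical stand-ins for GPUReconstruction, GPUFrameworkExternalAllocator and TimeFrame<7>, and only the call order mirrors the code above.

    #include <cstdio>
    #include <memory>

    struct StubReco {}; // stands in for o2::gpu::GPUReconstruction

    struct StubFrameworkAllocator { // stands in for GPUFrameworkExternalAllocator
      void setReconstructionFramework(StubReco* r) { mFWReco = r; }
      StubReco* mFWReco{nullptr};
    };

    struct StubTimeFrame { // stands in for o2::its::TimeFrame<7>
      void setFrameworkAllocator(StubFrameworkAllocator* a) { mAllocator = a; }
      StubFrameworkAllocator* mAllocator{nullptr};
    };

    int main()
    {
      StubReco reco;
      auto allocator = std::make_unique<StubFrameworkAllocator>();
      allocator->setReconstructionFramework(&reco); // as in GPUChainITS::GetITSTimeframe
      StubTimeFrame timeframe;
      timeframe.setFrameworkAllocator(allocator.get());
      std::printf("wired: %d\n", timeframe.mAllocator != nullptr);
    }

Folding the previous device and host allocators into one object works because the memory type is now carried per call through allocate(size, type) rather than per allocator instance, which is what made the mType constructor argument and the second unique_ptr redundant.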