From 851dc67da0de57d6a1f2bf33636189886644b603 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Mon, 31 Mar 2025 17:36:41 +0200 Subject: [PATCH] Use typed external allocator for some thrust items --- .../GPU/ITStrackingGPU/TrackingKernels.h | 5 +- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 5 +- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 60 +++++++++++++++---- .../include/ITStracking/ExternalAllocator.h | 2 +- .../tracking/include/ITStracking/TimeFrame.h | 29 +++++---- GPU/GPUTracking/Global/GPUChainITS.cxx | 2 +- 6 files changed, 74 insertions(+), 29 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 54aa0e01c8a78..09c8c39725efa 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -19,6 +19,7 @@ namespace o2::its { class CellSeed; +class ExternalAllocator; namespace gpu { #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler @@ -178,7 +179,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int filterCellNeighboursHandler(gpuPair*, int*, - unsigned int); + unsigned int, + o2::its::ExternalAllocator* = nullptr); template void processNeighboursHandler(const int startLayer, @@ -191,6 +193,7 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator*, const float bz, const float MaxChi2ClusterAttachment, const float maxChi2NDF, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 871fd7a95f890..89d2b5aeffe63 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -18,6 +18,7 @@ #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" #include "ITStracking/TrackingConfigParam.h" + namespace o2::its { constexpr int UnusedIndex{-1}; @@ -209,7 +210,8 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), - nNeigh); + nNeigh, + mTimeFrameGPU->getExternalAllocator()); } mTimeFrameGPU->createNeighboursDeviceArray(); mTimeFrameGPU->unregisterRest(); @@ -236,6 +238,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), trackSeeds, + mTimeFrameGPU->getExternalAllocator(), this->mBz, this->mTrkParams[0].MaxChi2ClusterAttachment, this->mTrkParams[0].MaxChi2NDF, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 301f37767c160..18c89d39adda0 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -28,6 +28,7 @@ #include "ITStracking/Constants.h" #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" +#include "ITStracking/ExternalAllocator.h" #include "DataFormatsITS/TrackITS.h" #include "ReconstructionDataFormats/Vertex.h" @@ -35,8 +36,6 @@ #include "ITStrackingGPU/TrackingKernels.h" #include "ITStrackingGPU/Utils.h" -#include "GPUCommonHelpers.h" - #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda #else @@ -64,6 +63,37 @@ GPUdii() float Sq(float v) namespace gpu { +template +class TypedAllocator : public thrust::device_allocator +{ + public: + using value_type = T; + using pointer = T*; + + template + struct rebind { + using other = TypedAllocator; + }; + + explicit TypedAllocator(ExternalAllocator* allocPtr) + : mInternalAllocator(allocPtr) {} + + T* allocate(size_t n) + { + return reinterpret_cast(mInternalAllocator->allocate(n * sizeof(T))); + } + + void deallocate(T* p, size_t n) + { + char* raw_ptr = reinterpret_cast(p); + size_t bytes = n * sizeof(T); + mInternalAllocator->deallocate(raw_ptr, bytes); // redundant as internal dealloc is no-op. + } + + private: + ExternalAllocator* mInternalAllocator; +}; + GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, const o2::its::IndexTableUtils& utils, const float z1, const float z2, float maxdeltaz, float maxdeltaphi) @@ -1117,7 +1147,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, int* cellNeighbours, - unsigned int nNeigh) + unsigned int nNeigh, + o2::its::ExternalAllocator* allocator) { thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); @@ -1140,6 +1171,7 @@ void processNeighboursHandler(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator* allocator, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, @@ -1148,8 +1180,10 @@ void processNeighboursHandler(const int startLayer, const int nBlocks, const int nThreads) { - thrust::device_vector foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. - // TODO: fix this. + auto allocInt = gpu::TypedAllocator(allocator); + auto allocCellSeed = gpu::TypedAllocator(allocator); + thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); // Shortcut: device_vector skips central memory management, we are relying on the contingency. + // TODO: fix this. gpu::processNeighboursKernel<<>>( @@ -1172,8 +1206,8 @@ void processNeighboursHandler(const int startLayer, matCorrType); gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1); - thrust::device_vector updatedCellId(foundSeedsTable.back()); - thrust::device_vector updatedCellSeed(foundSeedsTable.back()); + thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); + thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); gpu::processNeighboursKernel<<>>( startLayer, @@ -1195,13 +1229,13 @@ void processNeighboursHandler(const int startLayer, matCorrType); int level = startLevel; - thrust::device_vector lastCellId; - thrust::device_vector lastCellSeed; + thrust::device_vector> lastCellId(allocInt); + thrust::device_vector> lastCellSeed(allocCellSeed); for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { lastCellSeed.swap(updatedCellSeed); lastCellId.swap(updatedCellId); - thrust::device_vector().swap(updatedCellSeed); - thrust::device_vector().swap(updatedCellId); + thrust::device_vector>(allocCellSeed).swap(updatedCellSeed); + thrust::device_vector>(allocInt).swap(updatedCellId); auto lastCellSeedSize{lastCellSeed.size()}; foundSeedsTable.resize(lastCellSeedSize + 1); thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); @@ -1253,8 +1287,7 @@ void processNeighboursHandler(const int startLayer, propagator, matCorrType); } - - thrust::device_vector outSeeds(updatedCellSeed.size()); + thrust::device_vector> outSeeds(updatedCellSeed.size(), allocCellSeed); auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); auto s{end - outSeeds.begin()}; seedsHost.reserve(seedsHost.size() + s); @@ -1367,6 +1400,7 @@ template void processNeighboursHandler<7>(const int startLayer, gsl::span neighboursDeviceLUTs, const TrackingFrameInfo** foundTrackingFrameInfo, bounded_vector& seedsHost, + o2::its::ExternalAllocator*, const float bz, const float maxChi2ClusterAttachment, const float maxChi2NDF, diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h index 9bdb2905ba9ba..1628bbc52776b 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/ExternalAllocator.h @@ -23,8 +23,8 @@ class ExternalAllocator { public: virtual void* allocate(size_t) = 0; + virtual void deallocate(char*, size_t) = 0; }; - } // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index f6bb9a9b11e66..c3eb25f8d0699 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -230,7 +230,23 @@ struct TimeFrame { void setBz(float bz) { mBz = bz; } float getBz() const { return mBz; } - virtual void setDevicePropagator(const o2::base::PropagatorImpl*) { return; } + void setExternalAllocator(ExternalAllocator* allocator) + { + if (mIsGPU) { + LOGP(debug, "Setting timeFrame allocator to external"); + mAllocator = allocator; + mExtAllocator = true; // to be removed + } else { + LOGP(fatal, "External allocator is currently only supported for GPU"); + } + } + + ExternalAllocator* getExternalAllocator() { return mAllocator; } + + virtual void setDevicePropagator(const o2::base::PropagatorImpl*) + { + return; + }; const o2::base::PropagatorImpl* getDevicePropagator() const { return mPropagatorDevice; } template @@ -277,17 +293,6 @@ struct TimeFrame { // State if memory will be externally managed. bool mExtAllocator = false; ExternalAllocator* mAllocator = nullptr; - void setExternalAllocator(ExternalAllocator* allocator) - { - if (mIsGPU) { - LOGP(debug, "Setting timeFrame allocator to external"); - mAllocator = allocator; - mExtAllocator = true; // to be removed - } else { - LOGP(fatal, "External allocator is currently only supported for GPU"); - } - } - void setExtAllocator(bool ext) { mExtAllocator = ext; } bool getExtAllocator() const { return mExtAllocator; } std::array, nLayers> mUnsortedClusters; diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index bcb99fff87a64..cbc19100fe4fa 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -30,7 +30,7 @@ class GPUFrameworkExternalAllocator final : public o2::its::ExternalAllocator { return mFWReco->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); } - + void deallocate(char* ptr, size_t) override {} void setReconstructionFramework(o2::gpu::GPUReconstruction* fwr) { mFWReco = fwr; } private: