From 2df7dca9a7c806dd552bf719c5db539d07b40ec3 Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Sat, 17 May 2025 15:47:16 +0200
Subject: [PATCH 1/3] ITS: GPU: print kernel params

prints gpu kernel params

Signed-off-by: Felix Schlepper
---
 Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
index 09d9cee06d9f9..5c3a43540f833 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx
@@ -39,6 +39,9 @@ Tracker::Tracker(TrackerTraits* traits) : mTraits(traits)
 {
   /// Initialise standard configuration with 1 iteration
   mTrkParams.resize(1);
+  if (traits->isGPU()) {
+    ITSGpuTrackingParamConfig::Instance().printKeyValues(true, true);
+  }
 }

 void Tracker::clustersToTracks(const LogFunc& logger, const LogFunc& error)

From 180148c0acb320128b45cf1acf5252b41508bbac Mon Sep 17 00:00:00 2001
From: Felix Schlepper
Date: Mon, 30 Jun 2025 14:55:45 +0200
Subject: [PATCH 2/3] ITS: GPU use ms for tracklets

uses multiple streams for trackleting

Signed-off-by: Felix Schlepper
---
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |  11 +-
 .../GPU/ITStrackingGPU/TrackingKernels.h      |   6 +-
 .../ITS/tracking/GPU/ITStrackingGPU/Utils.h   |  90 ++++--
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 303 ++++++++----------
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    |   8 +-
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  |  88 ++---
 6 files changed, 269 insertions(+), 237 deletions(-)

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index d41591e6ff25c..8b3e9bddd18d6 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -23,8 +23,6 @@
 namespace o2::its::gpu
 {
-class Stream;
-
 class DefaultGPUAllocator : public ExternalAllocator
 {
   void* allocate(size_t size) override;
@@ -81,10 +79,11 @@ class TimeFrameGPU : public TimeFrame
   void downloadCellsLUTDevice();
   void unregisterRest();
   template
-  Stream& getStream(const size_t stream)
+  auto& getStream(const size_t stream)
   {
-    return *mGpuStreams[stream];
+    return mGpuStreams[stream];
   }
+  auto& getStreams() { return mGpuStreams; }
   void wipe(const int);

   /// interface
@@ -146,7 +145,7 @@
   int getNumberOfNeighbours() const final;

  private:
-  void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
+  void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations

   bool mHostRegistered = false;
   TimeFrameGPUParameters mGpuParams;
@@ -200,7 +199,7 @@
   const TrackingFrameInfo** mTrackingFrameInfoDeviceArray;

   // State
-  std::vector mGpuStreams;
+  Streams mGpuStreams;
   size_t mAvailMemGB;
   bool mFirstInit = true;

diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index b847aacd9bba5..a058f7e5fab0c 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -84,7 +84,8 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils,
                                  std::vector& radii,
                                  bounded_vector& mulScatAng,
                                  const int nBlocks,
-                                 const int nThreads);
+                                 const int nThreads,
+                                 gpu::Streams& streams);

 template
 void
computeTrackletsInROFsHandler(const IndexTableUtils* utils, @@ -117,7 +118,8 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 74c118009d67d..454e39e04a661 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -16,12 +16,14 @@ #ifndef ITSTRACKINGGPU_UTILS_H_ #define ITSTRACKINGGPU_UTILS_H_ +#include + #include "GPUCommonDef.h" +#include "GPUCommonHelpers.h" -namespace o2 -{ -namespace its +namespace o2::its { + template struct gpuPair { T1 first; @@ -31,11 +33,6 @@ struct gpuPair { namespace gpu { -template -void discardResult(const T&) -{ -} - // Poor man implementation of a span-like struct. It is very limited. template struct gpuSpan { @@ -85,19 +82,74 @@ enum class Task { Vertexer = 1 }; -template -GPUhd() T* getPtrFromRuler(int index, T* src, const int* ruler, const int stride = 1) +// Abstract stream class +class Stream { - return src + ruler[index] * stride; -} + public: +#if defined(__HIPCC__) + using Handle = hipStream_t; + static constexpr Handle Default = 0; +#elif defined(__CUDACC__) + using Handle = cudaStream_t; + static constexpr Handle Default = 0; +#else + using Handle = void*; + static constexpr Handle Default = nullptr; +#endif + + Stream(unsigned int flags = 0) + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamCreateWithFlags(&mHandle, flags)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamCreateWithFlags(&mHandle, flags)); +#endif + } -template -GPUhd() const T* getPtrFromRuler(int index, const T* src, const int* ruler, const int stride = 1) + Stream(Handle h) : mHandle(h) {} + ~Stream() + { + if (mHandle != Default) { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamDestroy(mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamDestroy(mHandle)); +#endif + } + } + + operator bool() const { return mHandle != Default; } + const Handle& get() { return mHandle; } + void sync() const + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamSynchronize(mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamSynchronize(mHandle)); +#endif + } + + private: + Handle mHandle{Default}; +}; +static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); + +// Abstract vector for streams. +// Handles specifically wrap around. 
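Side note on the wrap-around behaviour flagged in the comment above: operator[] of the container defined next maps any index i to i % size(), so per-layer code can ask for "the stream of layer i" even when the pool holds fewer streams than layers. A minimal stand-alone sketch of that indexing (illustrative only, hypothetical names, not part of the patch):

  // pool_demo.cxx -- modulo indexing into a fixed "stream" pool
  #include <cstddef>
  #include <cstdio>
  #include <vector>

  struct Pool {
    std::vector<int> handles; // stand-ins for real stream handles
    int operator[](std::size_t i) const { return handles[i % handles.size()]; }
  };

  int main()
  {
    Pool pool{{0, 1, 2}};                             // a pool of 3 "streams"
    for (std::size_t layer = 0; layer < 7; ++layer) { // 7 ITS layers
      std::printf("layer %zu -> stream %d\n", layer, pool[layer]);
    }
    return 0;
  }

The same mapping also means that several layers can share one stream and therefore serialize on it, which appears to be the intended trade-off of keeping the pool bounded.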
+class Streams { - return src + ruler[index] * stride; -} + public: + size_t size() const noexcept { return mStreams.size(); } + void resize(size_t n) { mStreams.resize(n); } + void clear() { mStreams.clear(); } + auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } + void push_back(const Stream& stream) { mStreams.push_back(stream); } + + private: + std::vector mStreams; +}; + } // namespace gpu -} // namespace its -} // namespace o2 +} // namespace o2::its -#endif \ No newline at end of file +#endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index b336073604b62..13851b4cdc1aa 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -58,38 +58,6 @@ using constants::MB; namespace gpu { -class Stream final -{ - public: - Stream(); - ~Stream(); - - [[nodiscard]] const cudaStream_t& get() const; - - private: - cudaStream_t mStream; -}; - -Stream::Stream() -{ - GPUChkErrS(cudaStreamCreate(&mStream)); -} - -Stream::~Stream() -{ - GPUChkErrS(cudaStreamDestroy(mStream)); -} - -const cudaStream_t& Stream::get() const -{ - return mStream; -} - -void* DefaultGPUAllocator::allocate(size_t size) -{ - LOGP(fatal, "Called DefaultGPUAllocator::allocate with size {}", size); - return nullptr; // to be implemented -} template TimeFrameGPU::TimeFrameGPU() @@ -101,13 +69,13 @@ template TimeFrameGPU::~TimeFrameGPU() = default; template -void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream* strPtr, bool extAllocator) +void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator) { if (extAllocator) { *ptr = this->mAllocator->allocate(size); } else { LOGP(debug, "Calling default CUDA allocator"); - GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, strPtr->get())); + GPUChkErrS(cudaMallocAsync(reinterpret_cast(ptr), size, stream.get())); } } @@ -120,31 +88,31 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl void TimeFrameGPU::loadIndexTableUtils(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading indextable utils"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); if (!iteration) { LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); - allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator()); } LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); - GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading unsorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading unsorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", 
this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -152,17 +120,17 @@ template void TimeFrameGPU::loadClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading sorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * 
sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -170,15 +138,15 @@ template void TimeFrameGPU::loadClustersIndexTables(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading sorted clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -186,72 +154,72 @@ template void TimeFrameGPU::createUsedClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating used clusters flags"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, this->getExtAllocator()); - 
GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::loadUsedClustersDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading used clusters flags"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / MB); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadROframeClustersDevice(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading ROframe clusters"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), 
"loading trackingframeinfo"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading trackingframeinfo"); if (!iteration) { for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading multiplicity cut mask"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / MB); - allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } @@ -259,67 +227,67 @@ template void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading 
seeding vertices"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / MB); - allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } } template void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating tracklets LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { if (!iteration) { LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", this->mClusters[iLayer].size() + 1, iLayer, (this->mClusters[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); } - GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); } if (!iteration) { - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void 
TimeFrameGPU::createTrackletsBuffers() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells buffers"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { mNTracklets[iLayer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[0], this->getExtAllocator()); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadTrackletsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading tracklets"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / MB); GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadTrackletsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading tracklets"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); @@ -327,90 +295,90 @@ void TimeFrameGPU::loadTrackletsLUTDevice() } GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); 
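// NB (reviewer note, not part of the patch): cudaHostRegister pins the host
// buffer so the surrounding cudaMemcpyAsync calls can use DMA and genuinely
// overlap with other GPU work; async copies from unpinned memory fall back
// to a staged, effectively synchronous transfer. Note also that the copy
// below passes no stream argument, so it runs on the default (legacy) stream.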
GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursIndexTablesDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours"); // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); if (iLayer < nLayers - 3) { mNNeighbours[iLayer] = 0; } } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursLUTDevice(const int layer, const unsigned int nCells) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighboursLUT"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT"); LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), nullptr, this->getExtAllocator()); // We need one element more to move exc -> inc - GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc + GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadCellsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading cell seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} cell seeds 
on layer {}, for {} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), nullptr, this->getExtAllocator()); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), nullptr, this->getExtAllocator()); // accessory for the neigh. finding. - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0]->get())); - GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // accessory for the neigh. finding. + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0]->get())); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, 
mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createCellsBuffers(const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "creating cells buffers"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); - allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, this->getExtAllocator()); + allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::loadCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading cells LUTs"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / MB); GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template @@ -419,128 +387,128 @@ void TimeFrameGPU::loadRoadsDevice() LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / MB); allocMemAsync(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "loading track seeds"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds"); LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, 
mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer, std::vector>& neighbours) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); this->mCellsNeighbours[layer].clear(); this->mCellsNeighbours[layer].resize(neighbours.size()); LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), neighbours.size() * sizeof(gpuPair), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair), mGpuStreams[0].get())); LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair) / MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator()); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createNeighboursDeviceArray() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving neighbours"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours"); allocMemAsync(reinterpret_cast(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "reserving tracks"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving tracks"); 
mTrackITSExt = bounded_vector(seeds.size(), {}, this->getMemoryPool().get()); LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB); allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0]->get())); + GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading cells"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cells"); for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB); this->mCells[iLayer].resize(mNCells[iLayer]); - GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsLUTDevice() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading cell luts"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); - GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector>>& neighbours, const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), fmt::format("downloading neighbours from layer {}", layer)); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / MB); // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? 
:) - GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } template void TimeFrameGPU::downloadNeighboursLUTDevice(bounded_vector& lut, const int layer) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), fmt::format("downloading neighbours LUT from layer {}", layer)); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer)); LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB); - GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& seeds) { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "downloading tracks"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks"); LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB); - GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0]->get())); + GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); GPUChkErrS(cudaHostUnregister(mTrackITSExt.data())); GPUChkErrS(cudaHostUnregister(seeds.data())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template void TimeFrameGPU::unregisterRest() { - START_GPU_STREAM_TIMER(mGpuStreams[0]->get(), "unregistering rest of the host memory"); + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory"); LOGP(debug, "unregistering rest of the host memory..."); GPUChkErrS(cudaHostUnregister(mCellsDevice.data())); GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data())); - STOP_GPU_STREAM_TIMER(mGpuStreams[0]->get()); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } template @@ -563,10 +531,7 @@ void TimeFrameGPU::initialise(const int iteration, IndexTableUtils* utils, const TimeFrameGPUParameters* gpuParam) { - mGpuStreams.resize(mGpuParams.nTimeFrameChunks); - for (auto& str : mGpuStreams) { - str = new Stream(); - } + mGpuStreams.resize(nLayers); o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index b32189f3fabe3..d804d0062764f 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -49,13 +49,13 @@ template void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->createTrackletsLUTDevice(iteration); const Vertex 
diamondVert({this->mTrkParams[iteration].Diamond[0], this->mTrkParams[iteration].Diamond[1], this->mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); gsl::span diamondSpan(&diamondVert, 1); int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; + mTimeFrameGPU->createTrackletsLUTDevice(iteration); countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), startROF, @@ -83,7 +83,8 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, - conf.nThreads); + conf.nThreads, + mTimeFrameGPU->getStreams()); mTimeFrameGPU->createTrackletsBuffers(); computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), @@ -115,7 +116,8 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i this->mTrkParams[iteration].LayerRadii, mTimeFrameGPU->getMSangles(), conf.nBlocks, - conf.nThreads); + conf.nThreads, + mTimeFrameGPU->getStreams()); } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 8c6367c221583..8245aee33718c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -890,11 +890,14 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { gpu::computeLayerTrackletsMultiROFKernel<<>>( + o2::gpu::CAMath::Min(nThreads, GPU_THREADS), + 0, + streams[iLayer].get()>>>( utils, multMask, iLayer, @@ -921,7 +924,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); } } @@ -956,45 +959,52 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads) + const int nThreads, + gpu::Streams& streams) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - tracklets, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + 
resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); - thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); - auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(streams[iLayer].get()); + thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); nTracklets[iLayer] = unique_end - tracklets_ptr; if (iLayer > 0) { - GPUChkErrS(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); gpu::compileTrackletsLookupTableKernel<<>>( - spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); + o2::gpu::CAMath::Min(nThreads, GPU_THREADS), + 0, + streams[iLayer].get()>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get()); } } } @@ -1350,7 +1360,8 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, @@ -1382,7 +1393,8 @@ template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, std::vector& radii, bounded_vector& mulScatAng, const int nBlocks, - const int nThreads); + const int nThreads, + gpu::Streams& streams); template void processNeighboursHandler<7>(const int startLayer, const int startLevel, From 26a6670f0fff8c560b0f64fc8ff03d26c6475a19 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Mon, 9 Jun 2025 11:35:44 +0200 Subject: [PATCH 3/3] ITS: add GPUTimer and use streams Signed-off-by: Felix Schlepper --- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 211 ++++++++++-------- 1 file changed, 114 insertions(+), 97 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 13851b4cdc1aa..d5ea573a2f0e8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -14,6 +14,7 @@ #include #include "ITStracking/Constants.h" +#include "ITStracking/BoundedAllocator.h" #include "ITStrackingGPU/Utils.h" #include "ITStrackingGPU/TimeFrameGPU.h" @@ -21,6 +22,8 @@ #include #include +#include +#include #include #include "GPUCommonDef.h" @@ -28,27 +31,6 @@ #include "GPUCommonLogger.h" #include "GPUCommonHelpers.h" -#ifdef ITS_MEASURE_GPU_TIME -#define START_GPU_STREAM_TIMER(stream, name) \ - cudaEvent_t event_start, event_stop; \ - GPUChkErrS(cudaEventCreate(&event_start)); \ - GPUChkErrS(cudaEventCreate(&event_stop)); \ - GPUChkErrS(cudaEventRecord(event_start, stream)); \ - const std::string task_name = name; - -#define STOP_GPU_STREAM_TIMER(stream) \ - GPUChkErrS(cudaEventRecord(event_stop, stream)); \ - GPUChkErrS(cudaEventSynchronize(event_stop)); 
-  float ms;                                                        \
-  GPUChkErrS(cudaEventElapsedTime(&ms, event_start, event_stop));  \
-  std::cout << "Elapsed time for " << task_name << ": " << ms << " ms" << std::endl; \
-  GPUChkErrS(cudaEventDestroy(event_start));                       \
-  GPUChkErrS(cudaEventDestroy(event_stop));
-#else
-#define START_GPU_STREAM_TIMER(stream, name)
-#define STOP_GPU_STREAM_TIMER(stream)
-#endif
-
 namespace o2
 {
 namespace its
 {
@@ -59,6 +41,71 @@ using constants::MB;
 namespace gpu
 {
+#ifdef ITS_MEASURE_GPU_TIME
+class GPUTimer
+{
+ public:
+  GPUTimer(Streams& streams, const std::string& name)
+    : mName(name)
+  {
+    for (size_t i{0}; i < streams.size(); ++i) {
+      mStreams.push_back(streams[i].get());
+    }
+    startTimers();
+  }
+  GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0)
+    : mName(name)
+  {
+    for (size_t sta{start}; sta < end; ++sta) {
+      mStreams.push_back(streams[sta].get());
+    }
+    startTimers();
+  }
+  GPUTimer(Stream& stream, const std::string& name)
+    : mName(name)
+  {
+    mStreams.push_back(stream.get());
+    startTimers();
+  }
+  ~GPUTimer()
+  {
+    for (size_t i{0}; i < mStreams.size(); ++i) {
+      GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i]));
+      GPUChkErrS(cudaEventSynchronize(mStops[i]));
+      float ms = 0.0f;
+      GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i]));
+      LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms);
+      GPUChkErrS(cudaEventDestroy(mStarts[i]));
+      GPUChkErrS(cudaEventDestroy(mStops[i]));
+    }
+  }
+
+  void startTimers()
+  {
+    mStarts.resize(mStreams.size());
+    mStops.resize(mStreams.size());
+    for (size_t i{0}; i < mStreams.size(); ++i) {
+      GPUChkErrS(cudaEventCreate(&mStarts[i]));
+      GPUChkErrS(cudaEventCreate(&mStops[i]));
+      GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i]));
+    }
+  }
+
+ private:
+  std::string mName;
+  std::vector<cudaEvent_t> mStarts, mStops;
+  std::vector<cudaStream_t> mStreams;
+};
+#else // ITS_MEASURE_GPU_TIME not defined
+class GPUTimer
+{
+ public:
+  GPUTimer(Stream&, const std::string&) {}
+  GPUTimer(Streams&, const std::string&) {}
+  GPUTimer(Streams&, const std::string&, int, int = 0) {}
+};
+#endif
+
 template <int nLayers>
 TimeFrameGPU<nLayers>::TimeFrameGPU()
 {
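GPUTimer replaces the START/STOP macro pair with an RAII type: events are recorded on construction, and the destructor records, synchronizes and reports at scope exit, so an early return can no longer leave an unmatched STOP. A minimal single-stream sketch of the same idiom (illustrative names, plain cudaError_t handling instead of GPUChkErrS):

// Minimal standalone sketch of the RAII event-timing pattern.
#include <cuda_runtime.h>
#include <cstdio>

class ScopedGpuTimer { // stand-in name, not the class from the patch
 public:
  ScopedGpuTimer(cudaStream_t stream, const char* name) : mStream(stream), mName(name)
  {
    cudaEventCreate(&mStart);
    cudaEventCreate(&mStop);
    cudaEventRecord(mStart, mStream); // enqueue start timestamp, host continues
  }
  ~ScopedGpuTimer()
  {
    cudaEventRecord(mStop, mStream);
    cudaEventSynchronize(mStop); // wait for all work recorded between the events
    float ms = 0.f;
    cudaEventElapsedTime(&ms, mStart, mStop);
    std::printf("Elapsed time for %s: %f ms\n", mName, ms);
    cudaEventDestroy(mStart);
    cudaEventDestroy(mStop);
  }

 private:
  cudaStream_t mStream;
  const char* mName;
  cudaEvent_t mStart, mStop;
};

int main()
{
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  float* dBuf;
  cudaMalloc(&dBuf, 1 << 22);
  {
    ScopedGpuTimer timer(stream, "memset"); // destructor reports at scope exit
    cudaMemsetAsync(dBuf, 0, 1 << 22, stream);
  }
  cudaFree(dBuf);
  cudaStreamDestroy(stream);
  return 0;
}

The multi-stream constructors in the patch generalize this by keeping one start/stop event pair per stream and reporting each stream's elapsed time separately.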
@@ -88,21 +135,20 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
+  GPUTimer timer(mGpuStreams[0], "loading indextable utils");
   if (!iteration) {
     LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), mGpuStreams[0], this->getExtAllocator());
   }
   LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
   GPUChkErrS(cudaMemcpyAsync(mIndexTableUtilsDevice, &(this->mIndexTableUtils), sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading unsorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading unsorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} unsorted clusters on layer {}, for {} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
@@ -112,7 +158,6 @@ void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
     allocMemAsync(reinterpret_cast<void**>(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -120,7 +165,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} clusters on layer {}, for {} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[0], this->getExtAllocator());
@@ -130,7 +175,6 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
     allocMemAsync(reinterpret_cast<void**>(&mClustersDeviceArray), nLayers * sizeof(Cluster*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -138,7 +182,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    GPUTimer timer(mGpuStreams[0], "loading sorted clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
@@ -146,7 +190,6 @@ void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
@@ -154,7 +197,7 @@ template <int nLayers>
 void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+    GPUTimer timer(mGpuStreams[0], "creating used clusters flags");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0], this->getExtAllocator());
@@ -162,26 +205,24 @@ void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags");
+  GPUTimer timer(mGpuStreams[0], "loading used clusters flags");
   for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / MB);
     GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
+    GPUTimer timer(mGpuStreams[0], "loading ROframe clusters");
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / MB);
       allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
@@ -189,14 +230,13 @@ void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
     }
     allocMemAsync(reinterpret_cast<void**>(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading trackingframeinfo");
+  GPUTimer timer(mGpuStreams[0], "loading trackingframeinfo");
   if (!iteration) {
     for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
       LOGP(debug, "gpu-transfer: loading {} tfinfo on layer {}, for {} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / MB);
@@ -208,18 +248,16 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
     GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable));
     GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
+    GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask");
     LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mMultMaskDevice, this->mMultiplicityCutMask.data(), this->mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
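The multiplicity cut mask is allocated and copied as uint8_t even though the log line counts sizeof(bool). A plausible reason for the byte-per-flag layout, sketched below under that assumption, is that a std::vector<bool> would be bit-packed and expose no contiguous .data() buffer to hand to cudaMemcpyAsync (standalone sketch, not the O2 container):

// Why a byte-sized mask is device-friendly: std::vector<bool> is bit-packed
// and cannot feed cudaMemcpy directly, so flags travel as one uint8_t each.
#include <cuda_runtime.h>
#include <cstdint>
#include <vector>

int main()
{
  std::vector<bool> packed = {true, false, true};               // no usable .data()
  std::vector<std::uint8_t> mask(packed.begin(), packed.end()); // one byte per flag

  std::uint8_t* dMask;
  cudaMalloc(&dMask, mask.size() * sizeof(std::uint8_t));
  cudaMemcpy(dMask, mask.data(), mask.size() * sizeof(std::uint8_t), cudaMemcpyHostToDevice);
  cudaFree(dMask);
  return 0;
}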
 
@@ -227,81 +265,76 @@
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
 {
   if (!iteration) {
-    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
+    GPUTimer timer(mGpuStreams[0], "loading seeding vertices");
     LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
     LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", this->mPrimaryVertices.size(), this->mPrimaryVertices.size() * sizeof(Vertex) / MB);
     allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), this->mPrimaryVertices.size() * sizeof(Vertex), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mPrimaryVerticesDevice, this->mPrimaryVertices.data(), this->mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
   }
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs");
+  GPUTimer timer(mGpuStreams, "creating tracklets LUTs", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     if (!iteration) {
       LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", this->mClusters[iLayer].size() + 1, iLayer, (this->mClusters[iLayer].size() + 1) * sizeof(int) / MB);
-      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
+      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
     }
-    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (this->mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
   if (!iteration) {
     allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
     GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
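The tracklet LUTs are sized nClusters + 1 on purpose: counting kernels write per-cluster counts, an exclusive prefix scan turns them in place into start offsets, and the extra trailing entry then holds the grand total, which createTrackletsBuffers below copies back to size the tracklet buffers. A compact sketch of the idiom (the patch goes through the CUB-based gpu::cubExclusiveScanInPlace; thrust is used here only for brevity):

// count -> exclusive-scan LUT idiom: counts[i] = tracklets starting at cluster i;
// after the in-place scan, lut[i] is the first tracklet index of cluster i and
// the trailing entry is the total.
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
  const int counts[] = {2, 0, 3, 1, 0}; // per-cluster counts + one trailing slot
  thrust::device_vector<int> lut(counts, counts + 5);
  thrust::exclusive_scan(lut.begin(), lut.end(), lut.begin()); // in place
  // lut is now {0, 2, 2, 5, 6}: per-cluster offsets, total in the last entry
  std::printf("total tracklets: %d\n", static_cast<int>(lut.back()));
  return 0;
}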
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackletsBuffers()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
+  GPUTimer timer(mGpuStreams, "creating cells buffers", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     mNTracklets[iLayer] = 0;
     GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
     LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[0], this->getExtAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator());
   }
   allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackletsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
+  GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 1);
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", this->mTracklets[iLayer].size(), iLayer, this->mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
     GPUChkErrS(cudaHostRegister(this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsDevice[iLayer], this->mTracklets[iLayer].data(), this->mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
+  GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
     GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+    GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
   GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
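Both load functions pin the existing host vectors with cudaHostRegister before the asynchronous copies: cudaMemcpyAsync only runs truly asynchronously from page-locked memory, and from pageable memory it degrades to a staged, mostly synchronous transfer. A standalone sketch of the register/copy/unregister sequence:

// Pin-then-copy idiom: register the vector's existing pages instead of
// reallocating a pinned buffer, then the async copy can overlap other work.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  std::vector<float> host(1 << 20, 1.f);
  float* dBuf;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&dBuf, host.size() * sizeof(float));

  // pin the existing pages in place; no copy, no reallocation
  cudaHostRegister(host.data(), host.size() * sizeof(float), cudaHostRegisterPortable);
  cudaMemcpyAsync(dBuf, host.data(), host.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream); // the buffer must stay pinned until the copy finishes
  cudaHostUnregister(host.data());

  cudaFree(dBuf);
  cudaStreamDestroy(stream);
  return 0;
}

This is also why unregisterRest and downloadTrackITSExtDevice later undo the registrations: pinned pages are a limited resource and must be released once the transfers are done.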
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells neighbours");
+  GPUTimer timer(mGpuStreams[0], "creating cells neighbours");
   // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps.
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaHostRegisterPortable));
@@ -314,76 +347,70 @@ void TimeFrameGPU<nLayers>::createNeighboursIndexTablesDevice()
       mNNeighbours[iLayer] = 0;
     }
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursLUTDevice(const int layer, const unsigned int nCells)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighboursLUT");
+  GPUTimer timer(mGpuStreams[0], "reserving neighboursLUT");
   LOGP(debug, "gpu-allocation: reserving neighbours LUT for {} elements on layer {} , for {} MB.", nCells + 1, layer, (nCells + 1) * sizeof(int) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursLUTDevice[layer]), (nCells + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // We need one element more to move exc -> inc
   GPUChkErrS(cudaMemsetAsync(mNeighboursLUTDevice[layer], 0, (nCells + 1) * sizeof(int), mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadCellsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cell seeds");
+  GPUTimer timer(mGpuStreams, "loading cell seeds", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} cell seeds on layer {}, for {} MB.", this->mCells[iLayer].size(), iLayer, this->mCells[iLayer].size() * sizeof(CellSeed) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
-    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator()); // accessory for the neigh. finding.
-    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
-    GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[iLayer]), this->mCells[iLayer].size() * sizeof(CellSeed), mGpuStreams[iLayer], this->getExtAllocator());
+    allocMemAsync(reinterpret_cast<void**>(&mNeighboursIndexTablesDevice[iLayer]), (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); // accessory for the neigh. finding.
+    GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (this->mCells[iLayer].size() + 1) * sizeof(int), mGpuStreams[iLayer].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsDevice[iLayer], this->mCells[iLayer].data(), this->mCells[iLayer].size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
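loadCellsDevice shows the per-layer stream layout used throughout this patch: the allocation, memset and upload of one layer are ordered on that layer's stream only, so independent layers can overlap on the device, while per-frame single buffers (like the pointer array at the end) stay on stream 0. A reduced standalone sketch with illustrative sizes:

// Per-layer streams: each layer's memset + H2D copy chain is ordered on its
// own stream, so the chains of different layers may overlap on the device.
#include <cuda_runtime.h>
#include <vector>

int main()
{
  constexpr int kNLayers = 5;    // illustrative
  constexpr size_t kN = 1 << 20; // illustrative
  std::vector<float> host(kN, 1.f);
  cudaHostRegister(host.data(), kN * sizeof(float), cudaHostRegisterPortable); // pinned source

  cudaStream_t streams[kNLayers];
  float* dBuf[kNLayers];
  for (int l = 0; l < kNLayers; ++l) {
    cudaStreamCreate(&streams[l]);
    cudaMalloc(&dBuf[l], kN * sizeof(float));
    // both operations of layer l are ordered on streams[l] only
    cudaMemsetAsync(dBuf[l], 0, kN * sizeof(float), streams[l]);
    cudaMemcpyAsync(dBuf[l], host.data(), kN * sizeof(float), cudaMemcpyHostToDevice, streams[l]);
  }
  for (int l = 0; l < kNLayers; ++l) {
    cudaStreamSynchronize(streams[l]);
    cudaFree(dBuf[l]);
    cudaStreamDestroy(streams[l]);
  }
  cudaHostUnregister(host.data());
  return 0;
}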
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs");
+  GPUTimer timer(mGpuStreams, "creating cells LUTs", nLayers - 2);
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-    GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
+    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator());
+    GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
+  GPUTimer timer(mGpuStreams[0], "creating cells buffers");
   mNCells[layer] = 0;
   GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
   LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
-
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading cells LUTs");
+  GPUTimer timer(mGpuStreams, "loading cells LUTs", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading cell LUT for {} elements on layer {}, for {} MB.", this->mCellsLookupTable[iLayer].size(), iLayer, this->mCellsLookupTable[iLayer].size() * sizeof(int) / MB);
     GPUChkErrS(cudaHostRegister(this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(mCellsLUTDevice[iLayer + 1], this->mCellsLookupTable[iLayer].data(), this->mCellsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadRoadsDevice()
 {
+  GPUTimer timer(mGpuStreams[0], "loading roads device");
   LOGP(debug, "gpu-transfer: loading {} roads, for {} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable));
@@ -393,30 +420,28 @@ void TimeFrameGPU<nLayers>::loadRoadsDevice()
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackSeedsDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading track seeds");
+  GPUTimer timer(mGpuStreams[0], "loading track seeds");
   LOGP(debug, "gpu-transfer: loading {} track seeds, for {} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable));
   GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, const unsigned int nNeighbours)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighbourPairsDevice[layer]), nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, nNeighbours * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", nNeighbours, nNeighbours * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), nNeighbours * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   this->mCellsNeighbours[layer].clear();
   this->mCellsNeighbours[layer].resize(neighbours.size());
   LOGP(debug, "gpu-allocation: reserving {} neighbours (pairs), for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
@@ -424,58 +449,53 @@ void TimeFrameGPU<nLayers>::createNeighboursDevice(const unsigned int layer, std::vector<std::pair<int, int>>& neighbours)
   GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, neighbours.size() * sizeof(gpuPair<int, int>), mGpuStreams[0].get()));
   LOGP(debug, "gpu-allocation: reserving {} neighbours, for {} MB.", neighbours.size(), neighbours.size() * sizeof(gpuPair<int, int>) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDevice[layer]), neighbours.size() * sizeof(int), mGpuStreams[0], this->getExtAllocator());
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createNeighboursDeviceArray()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving neighbours");
+  GPUTimer timer(mGpuStreams[0], "reserving neighbours");
   allocMemAsync(reinterpret_cast<void**>(&mNeighboursDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemcpyAsync(mNeighboursDeviceArray, mNeighboursDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::createTrackITSExtDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "reserving tracks");
+  GPUTimer timer(mGpuStreams[0], "reserving tracks");
   mTrackITSExt = bounded_vector<o2::its::TrackITSExt>(seeds.size(), {}, this->getMemoryPool().get());
   LOGP(debug, "gpu-allocation: reserving {} tracks, for {} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator());
   GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get()));
   GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cells");
+  GPUTimer timer(mGpuStreams, "downloading cells", nLayers - 2);
   for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading {} cells on layer: {}, for {} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / MB);
     this->mCells[iLayer].resize(mNCells[iLayer]);
-    GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(this->mCells[iLayer].data(), this->mCellsDevice[iLayer], mNCells[iLayer] * sizeof(CellSeed), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts");
+  GPUTimer timer(mGpuStreams, "downloading cell luts", nLayers - 3);
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
     LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
     this->mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
-    GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    GPUChkErrS(cudaMemcpyAsync(this->mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get()));
   }
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours from layer {}", layer));
+  GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading {} neighbours, for {} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair<int, int>) / MB);
   // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? :)
   GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair<int, int>), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
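The TODO above flags the assumption that std::pair and gpuPair share a memory layout. One hedged way to pin that down at compile time (a sketch, not the patch's code): mirror the device pair locally and static_assert size, alignment and member offsets, so a diverging std::pair implementation breaks the build instead of corrupting the download.

// Compile-time layout checks for the pair-aliasing assumption.
#include <cstddef>
#include <type_traits>
#include <utility>

template <typename T1, typename T2>
struct gpuPairMirror { // stand-in for the gpuPair used in the patch
  T1 first;
  T2 second;
};

using HostPair = std::pair<int, int>;
using DevicePair = gpuPairMirror<int, int>;

static_assert(sizeof(HostPair) == sizeof(DevicePair), "size mismatch");
static_assert(alignof(HostPair) == alignof(DevicePair), "alignment mismatch");
static_assert(std::is_standard_layout_v<DevicePair>, "device pair must be standard layout");
// offsetof is only well-defined for standard-layout types, so check the mirror
static_assert(offsetof(DevicePair, first) == 0, "unexpected offset of first");
static_assert(offsetof(DevicePair, second) == sizeof(int), "unexpected offset of second");

int main() { return 0; }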
@@ -484,31 +504,28 @@ void TimeFrameGPU<nLayers>::downloadCellsNeighboursDevice(std::vector<std::vector<std::pair<int, int>>>& neighbours, const int layer)
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadNeighboursLUTDevice(bounded_vector<int>& lut, const int layer)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), fmt::format("downloading neighbours LUT from layer {}", layer));
+  GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours LUT from layer {}", layer));
   LOGP(debug, "gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {} MB.", lut.size(), layer, lut.size() * sizeof(int) / MB);
   GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::downloadTrackITSExtDevice(bounded_vector<CellSeed>& seeds)
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading tracks");
+  GPUTimer timer(mGpuStreams[0], "downloading tracks");
   LOGP(debug, "gpu-transfer: downloading {} tracks, for {} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / MB);
   GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   GPUChkErrS(cudaHostUnregister(mTrackITSExt.data()));
   GPUChkErrS(cudaHostUnregister(seeds.data()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template <int nLayers>
 void TimeFrameGPU<nLayers>::unregisterRest()
 {
-  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "unregistering rest of the host memory");
+  GPUTimer timer(mGpuStreams[0], "unregistering rest of the host memory");
   LOGP(debug, "unregistering rest of the host memory...");
   GPUChkErrS(cudaHostUnregister(mCellsDevice.data()));
   GPUChkErrS(cudaHostUnregister(mTrackletsDevice.data()));
-  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
 template