From c9c881887de08e577102288a589e042e0416a6b8 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Thu, 14 Aug 2025 13:46:19 +0200 Subject: [PATCH] ITS: GPU: prepare for lazy loading of data Signed-off-by: Felix Schlepper --- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 79 ++- .../GPU/ITStrackingGPU/TrackerTraitsGPU.h | 2 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 55 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 150 +++++- .../ITS/tracking/GPU/cuda/CMakeLists.txt | 48 +- .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 470 ++++++++++-------- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 151 +++--- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 445 +++++------------ 8 files changed, 708 insertions(+), 692 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 5c10b01412b4e..27b987fb9a84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -13,13 +13,14 @@ #ifndef TRACKINGITSGPU_INCLUDE_TIMEFRAMEGPU_H #define TRACKINGITSGPU_INCLUDE_TIMEFRAMEGPU_H +#include +#include + #include "ITStracking/BoundedAllocator.h" #include "ITStracking/TimeFrame.h" #include "ITStracking/Configuration.h" #include "ITStrackingGPU/Utils.h" -#include - namespace o2::its::gpu { @@ -28,7 +29,7 @@ class TimeFrameGPU : public TimeFrame { public: TimeFrameGPU(); - ~TimeFrameGPU(); + ~TimeFrameGPU() = default; /// Most relevant operations void registerHostMemory(const int); @@ -37,18 +38,25 @@ class TimeFrameGPU : public TimeFrame void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); void loadIndexTableUtils(const int); - void loadTrackingFrameInfoDevice(const int); - void loadUnsortedClustersDevice(const int); - void loadClustersDevice(const int); - void loadClustersIndexTables(const int iteration); - void createUsedClustersDevice(const int); + void loadTrackingFrameInfoDevice(const int, const int); + void createTrackingFrameInfoDeviceArray(const int); + void loadUnsortedClustersDevice(const int, const int); + void createUnsortedClustersDeviceArray(const int); + void loadClustersDevice(const int, const int); + void createClustersDeviceArray(const int); + void loadClustersIndexTables(const int, const int); + void createClustersIndexTablesArray(const int iteration); + void createUsedClustersDevice(const int, const int); + void createUsedClustersDeviceArray(const int); void loadUsedClustersDevice(); - void loadROframeClustersDevice(const int); + void loadROFrameClustersDevice(const int, const int); + void createROFrameClustersDeviceArray(const int); void loadMultiplicityCutMask(const int); void loadVertices(const int); /// - void createTrackletsLUTDevice(const int); + void createTrackletsLUTDevice(const int, const int); + void createTrackletsLUTDeviceArray(const int); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -57,11 +65,14 @@ class TimeFrameGPU : public TimeFrame void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(bounded_vector&); - void createTrackletsBuffers(); + void createTrackletsBuffers(const int); + void createTrackletsBuffersArray(const int); void createCellsBuffers(const int); + void createCellsBuffersArray(const int); void createCellsDevice(); - void createCellsLUTDevice(); - void createNeighboursIndexTablesDevice(); + void 
createCellsLUTDevice(const int); + void createCellsLUTDeviceArray(const int); + void createNeighboursIndexTablesDevice(const int); void createNeighboursDevice(const unsigned int layer); void createNeighboursLUTDevice(const int, const unsigned int); void createTrackITSExtDevice(bounded_vector&); @@ -70,10 +81,17 @@ class TimeFrameGPU : public TimeFrame void downloadNeighboursLUTDevice(bounded_vector&, const int); void downloadCellsDevice(); void downloadCellsLUTDevice(); + + /// synchronization auto& getStream(const size_t stream) { return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } void syncStream(const size_t stream); - void syncStreams(); + void syncStreams(const bool = true); + void waitEvent(const int, const int); + void recordEvent(const int); + void recordEvents(const int = 0, const int = nLayers); + + /// cleanup virtual void wipe() final; /// interface @@ -102,19 +120,19 @@ class TimeFrameGPU : public TimeFrame const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } std::vector getClusterSizes(); const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } - const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } - Tracklet** getDeviceArrayTracklets() { return mTrackletsDevice.data(); } + const int** getDeviceROFrameClusters() const { return mROFramesClustersDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } - CellSeed** getDeviceArrayCells() { return mCellsDevice.data(); } + CellSeed** getDeviceArrayCells() { return mCellsDeviceArray; } CellSeed* getDeviceTrackSeeds() { return mTrackSeedsDevice; } o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } - void setDevicePropagator(const o2::base::PropagatorImpl*) override; + void setDevicePropagator(const o2::base::PropagatorImpl* p) final { this->mPropagatorDevice = p; } // Host-specific getters gsl::span getNTracklets() { return mNTracklets; } @@ -126,7 +144,7 @@ class TimeFrameGPU : public TimeFrame // Host-available device getters gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } - gsl::span getDeviceTracklet() { return mTrackletsDevice; } + gsl::span getDeviceTracklets() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } // Overridden getters @@ -137,7 +155,6 @@ class TimeFrameGPU : public TimeFrame private: void allocMemAsync(void**, size_t, Stream&, bool); // Abstract owned and unowned memory allocations on specific stream void allocMem(void**, size_t, bool); // Abstract owned and unowned memory allocations on default stream - bool mHostRegistered = false; TimeFrameGPUParameters mGpuParams; // Host-available device buffer sizes @@ -161,19 +178,21 @@ class TimeFrameGPU : public TimeFrame const Cluster** mUnsortedClustersDeviceArray; const int** mClustersIndexTablesDeviceArray; const unsigned char** mUsedClustersDeviceArray; - const int** mROFrameClustersDeviceArray; + 
const int** mROFramesClustersDeviceArray; std::array mTrackletsDevice; std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; - int** mCellsLUTDeviceArray; - int** mNeighboursCellDeviceArray; - int** mNeighboursCellLUTDeviceArray; - int** mTrackletsLUTDeviceArray; + Tracklet** mTrackletsDeviceArray{nullptr}; + int** mCellsLUTDeviceArray{nullptr}; + int** mNeighboursCellDeviceArray{nullptr}; + int** mNeighboursCellLUTDeviceArray{nullptr}; + int** mTrackletsLUTDeviceArray{nullptr}; std::array mCellsDevice; - std::array mNeighboursIndexTablesDevice; - CellSeed* mTrackSeedsDevice; + CellSeed** mCellsDeviceArray; + std::array mNeighboursIndexTablesDevice; + CellSeed* mTrackSeedsDevice{nullptr}; std::array mCellSeedsDevice; o2::track::TrackParCovF** mCellSeedsDeviceArray; std::array mCellSeedsChi2Device; @@ -188,6 +207,12 @@ class TimeFrameGPU : public TimeFrame // State Streams mGpuStreams; + std::bitset mPinnedUnsortedClusters{0}; + std::bitset mPinnedClusters{0}; + std::bitset mPinnedClustersIndexTables{0}; + std::bitset mPinnedUsedClusters{0}; + std::bitset mPinnedROFramesClusters{0}; + std::bitset mPinnedTrackingFrameInfo{0}; // Temporary buffer for storing output tracks from GPU tracking bounded_vector mTrackITSExt; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h index 1654f8cc8cf94..d5c3e8ac74925 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h @@ -24,7 +24,7 @@ class TrackerTraitsGPU final : public TrackerTraits { public: TrackerTraitsGPU() = default; - ~TrackerTraitsGPU() override = default; + ~TrackerTraitsGPU() final = default; void adoptTimeFrame(TimeFrame* tf) final; void initialiseTimeFrame(const int iteration) final; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index a7bf4c70bc5c2..23b23d2b3f3ab 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -13,6 +13,10 @@ #ifndef ITSTRACKINGGPU_TRACKINGKERNELS_H_ #define ITSTRACKINGGPU_TRACKINGKERNELS_H_ +#include + +#include "ITStracking/BoundedAllocator.h" +#include "ITStrackingGPU/Utils.h" #include "DetectorsBase/Propagator.h" #include "GPUCommonDef.h" @@ -25,43 +29,43 @@ namespace gpu #ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler -GPUdi() int4 getEmptyBinsRect() +GPUdii() int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; } -GPUd() bool fitTrack(TrackITSExt& track, - int start, - int end, - int step, - float chi2clcut, - float chi2ndfcut, - float maxQoverPt, - int nCl, - float Bz, - TrackingFrameInfo** tfInfos, - const o2::base::Propagator* prop, - o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorImpl::MatCorrType::USEMatCorrNONE); +GPUdii() bool fitTrack(TrackITSExt& track, + int start, + int end, + int step, + float chi2clcut, + float chi2ndfcut, + float maxQoverPt, + int nCl, + float Bz, + TrackingFrameInfo** tfInfos, + const o2::base::Propagator* prop, + o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorImpl::MatCorrType::USEMatCorrNONE); template -GPUg() void fitTrackSeedsKernel( - CellSeed* trackSeeds, - const TrackingFrameInfo** foundTrackingFrameInfo, - o2::its::TrackITSExt* tracks, - const float* minPts, 
- const unsigned int nSeeds, - const float Bz, - const int startLevel, - float maxChi2ClusterAttachment, - float maxChi2NDF, - const o2::base::Propagator* propagator, - const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); +GPUg() void fitTrackSeedsKernel(CellSeed* trackSeeds, + const TrackingFrameInfo** foundTrackingFrameInfo, + o2::its::TrackITSExt* tracks, + const float* minPts, + const unsigned int nSeeds, + const float Bz, + const int startLevel, + float maxChi2ClusterAttachment, + float maxChi2NDF, + const o2::base::Propagator* propagator, + const o2::base::PropagatorF::MatCorrType matCorrType = o2::base::PropagatorF::MatCorrType::USEMatCorrLUT); #endif } // namespace gpu template void countTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -94,6 +98,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index cd860c47ebd9c..15fe6f05f7850 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -17,9 +17,12 @@ #define ITSTRACKINGGPU_UTILS_H_ #include +#include +#include #include "GPUCommonDef.h" #include "GPUCommonHelpers.h" +#include "GPUCommonLogger.h" #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda @@ -27,19 +30,21 @@ #define THRUST_NAMESPACE thrust::hip #endif +#ifdef ITS_GPU_LOG +#define GPULog(...) LOGP(info, __VA_ARGS__) +#else +#define GPULog(...) +#endif + namespace o2::its { template -struct gpuPair { - T1 first; - T2 second; -}; +using gpuPair = std::pair; namespace gpu { -// Poor man implementation of a span-like struct. It is very limited. 
template struct gpuSpan { using value_type = T; @@ -83,11 +88,6 @@ struct gpuSpan { unsigned int _size; }; -enum class Task { - Tracker = 0, - Vertexer = 1 -}; - // Abstract stream class class Stream { @@ -96,22 +96,27 @@ class Stream using Handle = hipStream_t; static constexpr Handle DefaultStream = 0; static constexpr unsigned int DefaultFlag = hipStreamNonBlocking; + using Event = hipEvent_t; #elif defined(__CUDACC__) using Handle = cudaStream_t; static constexpr Handle DefaultStream = 0; static constexpr unsigned int DefaultFlag = cudaStreamNonBlocking; + using Event = cudaEvent_t; #else using Handle = void*; static constexpr Handle DefaultStream = nullptr; static constexpr unsigned int DefaultFlag = 0; + using Event = void*; #endif Stream(unsigned int flags = DefaultFlag) { #if defined(__HIPCC__) GPUChkErrS(hipStreamCreateWithFlags(&mHandle, flags)); + GPUChkErrS(hipEventCreateWithFlags(&mEvent, hipEventDisableTiming)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamCreateWithFlags(&mHandle, flags)); + GPUChkErrS(cudaEventCreateWithFlags(&mEvent, cudaEventDisableTiming)); #endif } @@ -121,49 +126,162 @@ class Stream if (mHandle != DefaultStream) { #if defined(__HIPCC__) GPUChkErrS(hipStreamDestroy(mHandle)); + GPUChkErrS(hipEventDestroy(mEvent)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamDestroy(mHandle)); + GPUChkErrS(cudaEventDestroy(mEvent)); #endif } } operator bool() const { return mHandle != DefaultStream; } const Handle& get() { return mHandle; } + const Handle& getStream() { return mHandle; } + const Event& getEvent() { return mEvent; } void sync() const { #if defined(__HIPCC__) GPUChkErrS(hipStreamSynchronize(mHandle)); #elif defined(__CUDACC__) GPUChkErrS(cudaStreamSynchronize(mHandle)); +#endif + } + void record() + { +#if defined(__HIPCC__) + GPUChkErrS(hipEventRecord(mEvent, mHandle)); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventRecord(mEvent, mHandle)); #endif } private: Handle mHandle{DefaultStream}; + Event mEvent{nullptr}; }; -static_assert(sizeof(Stream) == sizeof(void*), "Stream type must match pointer type!"); // Abstract vector for streams. -// Handles specifically wrap around. 
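// Indexing no longer wraps around, so callers must keep i < size().
// Each stream now owns one event, which enables cross-stream ordering during the
// lazy per-layer loads; a minimal usage sketch ('consumer' is an illustrative index):
//   streams[layer].record();             // mark the end of this layer's H2D copies
//   streams.waitEvent(consumer, layer);  // stream 'consumer' waits on that event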
class Streams { public: size_t size() const noexcept { return mStreams.size(); } void resize(size_t n) { mStreams.resize(n); } void clear() { mStreams.clear(); } - auto& operator[](size_t i) { return mStreams[i % mStreams.size()]; } + auto& operator[](size_t i) { return mStreams[i]; } void push_back(const Stream& stream) { mStreams.push_back(stream); } - void sync() + void sync(bool device = true) { - for (auto& s : mStreams) { - s.sync(); + if (device) { +#if defined(__HIPCC__) + GPUChkErrS(hipDeviceSynchronize()); +#elif defined(__CUDACC__) + GPUChkErrS(cudaDeviceSynchronize()); +#endif + } else { + for (auto& s : mStreams) { + s.sync(); + } } } + void waitEvent(size_t iStream, size_t iEvent) + { +#if defined(__HIPCC__) + GPUChkErrS(hipStreamWaitEvent(mStreams[iStream].get(), mStreams[iEvent].getEvent())); +#elif defined(__CUDACC__) + GPUChkErrS(cudaStreamWaitEvent(mStreams[iStream].get(), mStreams[iEvent].getEvent())); +#endif + } private: std::vector mStreams; }; +#ifdef ITS_MEASURE_GPU_TIME +class GPUTimer +{ + public: + GPUTimer(const std::string& name) + : mName(name) + { + mStreams.emplace_back(Stream::DefaultStream); + startTimers(); + } + GPUTimer(Streams& streams, const std::string& name) + : mName(name) + { + for (size_t i{0}; i < streams.size(); ++i) { + mStreams.push_back(streams[i].get()); + } + startTimers(); + } + GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0) + : mName(name) + { + for (size_t sta{start}; sta < end; ++sta) { + mStreams.push_back(streams[sta].get()); + } + startTimers(); + } + GPUTimer(Stream& stream, const std::string& name, const int id = 0) + : mName(name) + { + mStreams.push_back(stream.get()); + mName += ":id" + std::to_string(id); + startTimers(); + } + ~GPUTimer() + { + for (size_t i{0}; i < mStreams.size(); ++i) { + float ms = 0.0f; +#if defined(__HIPCC__) + GPUChkErrS(hipEventRecord(mStops[i], mStreams[i])); + GPUChkErrS(hipEventSynchronize(mStops[i])); + GPUChkErrS(hipEventElapsedTime(&ms, mStarts[i], mStops[i])); + GPUChkErrS(hipEventDestroy(mStarts[i])); + GPUChkErrS(hipEventDestroy(mStops[i])); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i])); + GPUChkErrS(cudaEventSynchronize(mStops[i])); + GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i])); + GPUChkErrS(cudaEventDestroy(mStarts[i])); + GPUChkErrS(cudaEventDestroy(mStops[i])); +#endif + LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms); + } + } + + void startTimers() + { + mStarts.resize(mStreams.size()); + mStops.resize(mStreams.size()); + for (size_t i{0}; i < mStreams.size(); ++i) { +#if defined(__HIPCC__) + GPUChkErrS(hipEventCreate(&mStarts[i])); + GPUChkErrS(hipEventCreate(&mStops[i])); + GPUChkErrS(hipEventRecord(mStarts[i], mStreams[i])); +#elif defined(__CUDACC__) + GPUChkErrS(cudaEventCreate(&mStarts[i])); + GPUChkErrS(cudaEventCreate(&mStops[i])); + GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i])); +#endif + } + } + + private: + std::string mName; + std::vector mStarts, mStops; + std::vector mStreams; +}; +#else // ITS_MEASURE_GPU_TIME not defined +class GPUTimer +{ + public: + template + GPUTimer(Args&&...) 
+ { + } +}; +#endif } // namespace gpu } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 9769930504f29..1f6a046a81350 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -11,29 +11,29 @@ # CUDA if(CUDA_ENABLED) -find_package(CUDAToolkit) -message(STATUS "Building ITS CUDA tracker") -# add_compile_options(-O0 -g -lineinfo -fPIC) -# add_compile_definitions(ITS_MEASURE_GPU_TIME) -o2_add_library(ITStrackingCUDA - SOURCES ClusterLinesGPU.cu - TrackerTraitsGPU.cxx - TimeFrameGPU.cu - TracerGPU.cu - TrackingKernels.cu - VertexingKernels.cu - VertexerTraitsGPU.cxx - PUBLIC_INCLUDE_DIRECTORIES ../ - PUBLIC_LINK_LIBRARIES O2::ITStracking - O2::SimConfig - O2::SimulationDataFormat - O2::ReconstructionDataFormats - O2::GPUCommon - PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider - TARGETVARNAME targetName) - -set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) -target_compile_definitions(${targetName} PRIVATE $) -set_target_cuda_arch(${targetName}) + find_package(CUDAToolkit) + message(STATUS "Building ITS CUDA tracker") + # add_compile_options(-O0 -g -lineinfo -fPIC -DGPU_FORCE_DEVICE_ASSERTS=ON) + # add_compile_definitions(ITS_MEASURE_GPU_TIME) + # add_compile_definitions(ITS_GPU_LOG) + o2_add_library(ITStrackingCUDA + SOURCES ClusterLinesGPU.cu + TrackerTraitsGPU.cxx + TimeFrameGPU.cu + TracerGPU.cu + TrackingKernels.cu + VertexingKernels.cu + VertexerTraitsGPU.cxx + PUBLIC_INCLUDE_DIRECTORIES ../ + PUBLIC_LINK_LIBRARIES O2::ITStracking + O2::SimConfig + O2::SimulationDataFormat + O2::ReconstructionDataFormats + O2::GPUCommon + PRIVATE_LINK_LIBRARIES O2::GPUTrackingCUDAExternalProvider + TARGETVARNAME targetName) + set_property(TARGET ${targetName} PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_compile_definitions(${targetName} PRIVATE $) + set_target_cuda_arch(${targetName}) endif() diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 72a1f98d1b78b..4f3b52d56a793 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -9,20 +9,16 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
/// + #include -#include -#include +#include +#include + +#include "ITStrackingGPU/TimeFrameGPU.h" #include "ITStracking/Constants.h" #include "ITStracking/BoundedAllocator.h" - #include "ITStrackingGPU/Utils.h" -#include "ITStrackingGPU/TimeFrameGPU.h" -#include "ITStrackingGPU/TracerGPU.h" - -#include -#include -#include #include "GPUCommonDef.h" #include "GPUCommonMath.h" @@ -32,85 +28,12 @@ namespace o2::its::gpu { -#ifdef ITS_MEASURE_GPU_TIME -class GPUTimer -{ - public: - GPUTimer(Streams& streams, const std::string& name) - : mName(name) - { - for (size_t i{0}; i < streams.size(); ++i) { - mStreams.push_back(streams[i].get()); - } - startTimers(); - } - GPUTimer(Streams& streams, const std::string& name, size_t end, size_t start = 0) - : mName(name) - { - for (size_t sta{start}; sta < end; ++sta) { - mStreams.push_back(streams[sta].get()); - } - startTimers(); - } - GPUTimer(Stream& stream, const std::string& name) - : mName(name) - { - mStreams.push_back(stream.get()); - startTimers(); - } - ~GPUTimer() - { - for (size_t i{0}; i < mStreams.size(); ++i) { - GPUChkErrS(cudaEventRecord(mStops[i], mStreams[i])); - GPUChkErrS(cudaEventSynchronize(mStops[i])); - float ms = 0.0f; - GPUChkErrS(cudaEventElapsedTime(&ms, mStarts[i], mStops[i])); - LOGP(info, "Elapsed time for {}:{} {} ms", mName, i, ms); - GPUChkErrS(cudaEventDestroy(mStarts[i])); - GPUChkErrS(cudaEventDestroy(mStops[i])); - } - } - - void startTimers() - { - mStarts.resize(mStreams.size()); - mStops.resize(mStreams.size()); - for (size_t i{0}; i < mStreams.size(); ++i) { - GPUChkErrS(cudaEventCreate(&mStarts[i])); - GPUChkErrS(cudaEventCreate(&mStops[i])); - GPUChkErrS(cudaEventRecord(mStarts[i], mStreams[i])); - } - } - - private: - std::string mName; - std::vector mStarts, mStops; - std::vector mStreams; -}; - -#define GPULog(...) LOGP(info, __VA_ARGS__) -#else // ITS_MEASURE_GPU_TIME not defined -class GPUTimer -{ - public: - template - GPUTimer(Args&&...) - { - } -}; - -#define GPULog(...) 
-#endif - template TimeFrameGPU::TimeFrameGPU() { this->mIsGPU = true; } -template -TimeFrameGPU::~TimeFrameGPU() = default; - template void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator) { @@ -133,16 +56,10 @@ void TimeFrameGPU::allocMem(void** ptr, size_t size, bool extAllocator) } } -template -void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl* propagator) -{ - this->mPropagatorDevice = propagator; -} - template void TimeFrameGPU::loadIndexTableUtils(const int iteration) { - GPUTimer timer(mGpuStreams[0], "loading indextable utils"); + GPUTimer timer("loading indextable utils"); if (!iteration) { GPULog("gpu-allocation: allocating IndexTableUtils buffer, for {:.2f} MB.", sizeof(IndexTableUtils) / constants::MB); allocMem(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), this->getExtAllocator()); @@ -152,114 +69,174 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) } template -void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) +void TimeFrameGPU::createUnsortedClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading unsorted clusters"); - for (int iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[iLayer].size(), iLayer, this->mUnsortedClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[iLayer]), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[iLayer], this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); - } - mGpuStreams.sync(); + GPUTimer timer("creating unsorted clusters array"); allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpy(mUnsortedClustersDeviceArray, mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); + mPinnedUnsortedClusters.set(nLayers); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); + mPinnedUnsortedClusters.set(iLayer); + } + } +} + +template +void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading unsorted clusters", layer); + GPULog("gpu-transfer: loading {} unsorted clusters on layer {}, for {:.2f} MB.", this->mUnsortedClusters[layer].size(), layer, this->mUnsortedClusters[layer].size() * sizeof(Cluster) / constants::MB); + allocMemAsync(reinterpret_cast(&mUnsortedClustersDevice[layer]), this->mUnsortedClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mUnsortedClustersDevice[layer], this->mUnsortedClusters[layer].data(), this->mUnsortedClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + 
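// Patch only this layer's slot in the device pointer array, on the same stream,
// so the table of per-layer buffers fills in as layers are loaded lazily.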
GPUChkErrS(cudaMemcpyAsync(&mUnsortedClustersDeviceArray[layer], &mUnsortedClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } template -void TimeFrameGPU::loadClustersDevice(const int iteration) +void TimeFrameGPU::createClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); + GPUTimer timer("creating sorted clusters array"); + allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); + mPinnedClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(Cluster) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersDevice[iLayer]), this->mClusters[iLayer].size() * sizeof(Cluster), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mClustersDevice[iLayer], this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + mPinnedClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpy(mClustersDeviceArray, mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::loadClustersIndexTables(const int iteration) +void TimeFrameGPU::loadClustersDevice(const int iteration, const int layer) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading sorted clusters"); + GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); + GPULog("gpu-transfer: loading {} clusters on layer {}, for {:.2f} MB.", this->mClusters[layer].size(), layer, this->mClusters[layer].size() * sizeof(Cluster) / constants::MB); + allocMemAsync(reinterpret_cast(&mClustersDevice[layer]), this->mClusters[layer].size() * sizeof(Cluster), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersDevice[layer], this->mClusters[layer].data(), this->mClusters[layer].size() * sizeof(Cluster), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mClustersDeviceArray[layer], &mClustersDevice[layer], sizeof(Cluster*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createClustersIndexTablesArray(const int iteration) +{ + if (!iteration) { + GPUTimer timer("creating clustersindextable array"); + allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); + mPinnedClustersIndexTables.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", iLayer, this->mIndexTables[iLayer].size(), this->mIndexTables[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), 
this->mIndexTables[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + GPUChkErrS(cudaHostRegister(this->mIndexTables[iLayer].data(), this->mIndexTables[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedClustersIndexTables.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::createUsedClustersDevice(const int iteration) +void TimeFrameGPU::loadClustersIndexTables(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading sorted clusters", layer); + GPULog("gpu-transfer: loading clusters indextable for layer {} with {} elements, for {:.2f} MB.", layer, this->mIndexTables[layer].size(), this->mIndexTables[layer].size() * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[layer]), this->mIndexTables[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mClustersIndexTablesDevice[layer], this->mIndexTables[layer].data(), this->mIndexTables[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mClustersIndexTablesDeviceArray[layer], &mClustersIndexTablesDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createUsedClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "creating used clusters flags"); + GPUTimer timer("creating used clusters flags"); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaHostRegisterPortable)); + mPinnedUsedClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, this->mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[iLayer].get())); + GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaHostRegisterPortable)); + mPinnedUsedClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "creating used clusters flags", layer); + 
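// First iteration only: allocate and zero the per-layer flags on the device; the
// host-side flags are uploaded separately via loadUsedClustersDevice().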
GPULog("gpu-transfer: creating {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[layer].size(), layer, this->mUsedClusters[layer].size() * sizeof(unsigned char) / constants::MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[layer]), this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mUsedClustersDevice[layer], 0, this->mUsedClusters[layer].size() * sizeof(unsigned char), mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mUsedClustersDeviceArray[layer], &mUsedClustersDevice[layer], sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } template void TimeFrameGPU::loadUsedClustersDevice() { - GPUTimer timer(mGpuStreams[0], "loading used clusters flags"); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUTimer timer(mGpuStreams[iLayer], "loading used clusters flags", iLayer); + GPULog("gpu-transfer: loading {} used clusters flags on layer {}, for {:.2f} MB.", this->mUsedClusters[iLayer].size(), iLayer, this->mUsedClusters[iLayer].size() * sizeof(unsigned char) / constants::MB); + GPUChkErrS(cudaMemcpyAsync(mUsedClustersDevice[iLayer], this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } } template -void TimeFrameGPU::loadROframeClustersDevice(const int iteration) +void TimeFrameGPU::createROFrameClustersDeviceArray(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading ROframe clusters"); + GPUTimer timer("creating ROFrame clusters array"); + allocMem(reinterpret_cast(&mROFramesClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaHostRegisterPortable)); + mPinnedROFramesClusters.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[iLayer].size(), iLayer, this->mROFramesClusters[iLayer].size() * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), this->mROFramesClusters[iLayer].size() * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaHostRegister(this->mROFramesClusters[iLayer].data(), this->mROFramesClusters[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); + mPinnedROFramesClusters.set(iLayer); } - mGpuStreams.sync(); - allocMem(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), this->getExtAllocator()); - GPUChkErrS(cudaMemcpy(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice)); } } template -void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) +void TimeFrameGPU::loadROFrameClustersDevice(const int iteration, const 
int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading ROframe clusters", layer); + GPULog("gpu-transfer: loading {} ROframe clusters info on layer {}, for {:.2f} MB.", this->mROFramesClusters[layer].size(), layer, this->mROFramesClusters[layer].size() * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[layer]), this->mROFramesClusters[layer].size() * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mROFramesClustersDevice[layer], this->mROFramesClusters[layer].data(), this->mROFramesClusters[layer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mROFramesClustersDeviceArray[layer], &mROFramesClustersDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + } +} + +template +void TimeFrameGPU::createTrackingFrameInfoDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams[0], "loading trackingframeinfo"); if (!iteration) { + GPUTimer timer("creating trackingframeinfo array"); + allocMem(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); + mPinnedTrackingFrameInfo.set(nLayers); for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[iLayer].size(), iLayer, this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[iLayer]), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), mGpuStreams[iLayer], this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[iLayer], this->mTrackingFrameInfo[iLayer].data(), this->mTrackingFrameInfo[iLayer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); + mPinnedTrackingFrameInfo.set(iLayer); } - mGpuStreams.sync(); - allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDeviceArray), nLayers * sizeof(TrackingFrameInfo*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDeviceArray, mTrackingFrameInfoDevice.data(), nLayers * sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } +} + +template +void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration, const int layer) +{ + if (!iteration) { + GPUTimer timer(mGpuStreams[layer], "loading trackingframeinfo", layer); + GPULog("gpu-transfer: loading {} tfinfo on layer {}, for {:.2f} MB.", this->mTrackingFrameInfo[layer].size(), layer, this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackingFrameInfoDevice[layer]), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(mTrackingFrameInfoDevice[layer], this->mTrackingFrameInfo[layer].data(), this->mTrackingFrameInfo[layer].size() * sizeof(TrackingFrameInfo), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); + 
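// The host buffer was already pinned in createTrackingFrameInfoDeviceArray(), so
// this copy runs asynchronously and can overlap with the other per-layer streams.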
GPUChkErrS(cudaMemcpyAsync(&mTrackingFrameInfoDeviceArray[layer], &mTrackingFrameInfoDevice[layer], sizeof(TrackingFrameInfo*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } } @@ -267,8 +244,8 @@ template void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) { if (!iteration || iteration == 3) { // we need to re-load the swapped mult-mask in upc iteration - GPUTimer timer(mGpuStreams[0], "loading multiplicity cut mask"); - GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(bool) / constants::MB); + GPUTimer timer("loading multiplicity cut mask"); + GPULog("gpu-transfer: iteration {} loading multiplicity cut mask with {} elements, for {:.2f} MB.", iteration, this->mMultiplicityCutMask.size(), this->mMultiplicityCutMask.size() * sizeof(uint8_t) / constants::MB); if (!iteration) { // only allocate on first call allocMem(reinterpret_cast(&mMultMaskDevice), this->mMultiplicityCutMask.size() * sizeof(uint8_t), this->getExtAllocator()); } @@ -280,7 +257,7 @@ template void TimeFrameGPU::loadVertices(const int iteration) { if (!iteration) { - GPUTimer timer(mGpuStreams[0], "loading seeding vertices"); + GPUTimer timer("loading seeding vertices"); GPULog("gpu-transfer: loading {} ROframes vertices, for {:.2f} MB.", this->mROFramesPV.size(), this->mROFramesPV.size() * sizeof(int) / constants::MB); allocMem(reinterpret_cast(&mROFramesPVDevice), this->mROFramesPV.size() * sizeof(int), this->getExtAllocator()); GPUChkErrS(cudaMemcpy(mROFramesPVDevice, this->mROFramesPV.data(), this->mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice)); @@ -291,35 +268,47 @@ void TimeFrameGPU::loadVertices(const int iteration) } template -void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) +void TimeFrameGPU::createTrackletsLUTDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams[0], "creating tracklets LUTs"); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - const int ncls = this->mClusters[iLayer].size() + 1; - if (!iteration) { - GPULog("gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, iLayer, ncls * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), ncls * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - } - GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, ncls * sizeof(int), mGpuStreams[iLayer].get())); + if (!iteration) { + allocMem(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), this->getExtAllocator()); } +} + +template +void TimeFrameGPU::createTrackletsLUTDevice(const int iteration, const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating tracklets LUTs", layer); + const int ncls = this->mClusters[layer].size() + 1; if (!iteration) { - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPULog("gpu-allocation: creating tracklets LUT for {} elements on layer {}, for {:.2f} MB.", ncls, layer, ncls * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[layer]), ncls * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mTrackletsLUTDeviceArray[layer], 
&mTrackletsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } + GPUChkErrS(cudaMemsetAsync(mTrackletsLUTDevice[layer], 0, ncls * sizeof(int), mGpuStreams[layer].get())); } template -void TimeFrameGPU::createTrackletsBuffers() +void TimeFrameGPU::createTrackletsBuffersArray(const int iteration) { - for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - GPUTimer timer(mGpuStreams[iLayer], "creating tracklet buffers"); - mNTracklets[iLayer] = 0; - GPUChkErrS(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + this->mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[iLayer].get())); - GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), mGpuStreams[iLayer], this->getExtAllocator()); + if (!iteration) { + GPUTimer timer("creating tracklet buffers array"); + allocMem(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), this->getExtAllocator()); } } +template +void TimeFrameGPU::createTrackletsBuffers(const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating tracklet buffers", layer); + mNTracklets[layer] = 0; + GPUChkErrS(cudaMemcpyAsync(&mNTracklets[layer], mTrackletsLUTDevice[layer] + this->mClusters[layer].size(), sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); + mGpuStreams[layer].sync(); // ensure number of tracklets is correct + GPULog("gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer], layer, mNTracklets[layer] * sizeof(Tracklet) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[layer]), mNTracklets[layer] * sizeof(Tracklet), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mTrackletsDeviceArray[layer], &mTrackletsDevice[layer], sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); +} + template void TimeFrameGPU::loadTrackletsDevice() { @@ -334,26 +323,22 @@ void TimeFrameGPU::loadTrackletsDevice() template void TimeFrameGPU::loadTrackletsLUTDevice() { - GPUTimer timer(mGpuStreams, "loading tracklets", nLayers - 2); + GPUTimer timer("loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { GPULog("gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {:.2f} MB", this->mTrackletsLookupTable[iLayer].size(), iLayer + 1, this->mTrackletsLookupTable[iLayer].size() * sizeof(int) / constants::MB); - GPUChkErrS(cudaHostRegister(this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], this->mTrackletsLookupTable[iLayer].data(), this->mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[iLayer].get())); } - GPUChkErrS(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + mGpuStreams.sync(); + GPUChkErrS(cudaMemcpy(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); } template -void TimeFrameGPU::createNeighboursIndexTablesDevice() +void 
TimeFrameGPU::createNeighboursIndexTablesDevice(const int layer) { - GPUTimer timer(mGpuStreams[0], "creating cells neighbours"); - // Here we do also the creation of the CellsDeviceArray, as the cells buffers are populated separately in the previous steps. - for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - GPULog("gpu-transfer: loading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[iLayer], iLayer, mNCells[iLayer] * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[iLayer]), (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[iLayer], 0, (mNCells[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); - } + GPUTimer timer(mGpuStreams[layer], "creating cells neighbours", layer); + GPULog("gpu-transfer: reserving neighbours LUT for {} elements on layer {}, for {:.2f} MB.", mNCells[layer] + 1, layer, (mNCells[layer] + 1) * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mNeighboursIndexTablesDevice[layer]), (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mNeighboursIndexTablesDevice[layer], 0, (mNCells[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); } template @@ -379,16 +364,32 @@ void TimeFrameGPU::loadCellsDevice() } template -void TimeFrameGPU::createCellsLUTDevice() +void TimeFrameGPU::createCellsLUTDeviceArray(const int iteration) { - GPUTimer timer(mGpuStreams, "creating cells LUTs", nLayers - 2); - for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / constants::MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[iLayer].get())); + if (!iteration) { + GPUTimer timer("creating cells LUTs array"); + allocMem(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), this->getExtAllocator()); + } +} + +template +void TimeFrameGPU::createCellsLUTDevice(const int layer) +{ + GPUTimer timer(mGpuStreams[layer], "creating cells LUTs", layer); + GPULog("gpu-transfer: creating cell LUT for {} elements on layer {}, for {:.2f} MB.", mNTracklets[layer] + 1, layer, (mNTracklets[layer] + 1) * sizeof(int) / constants::MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[layer]), (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemsetAsync(mCellsLUTDevice[layer], 0, (mNTracklets[layer] + 1) * sizeof(int), mGpuStreams[layer].get())); + GPUChkErrS(cudaMemcpyAsync(&mCellsLUTDeviceArray[layer], &mCellsLUTDevice[layer], sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); +} + +template +void TimeFrameGPU::createCellsBuffersArray(const int iteration) +{ + if (!iteration) { + GPUTimer timer("creating cells buffers array"); + allocMem(reinterpret_cast(&mCellsDeviceArray), (nLayers - 2) * sizeof(CellSeed*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mCellsDeviceArray, mCellsDevice.data(), mCellsDevice.size() * sizeof(CellSeed*), cudaMemcpyHostToDevice)); } - allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), mGpuStreams[0], this->getExtAllocator()); - 
GPUChkErrS(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } template @@ -397,8 +398,10 @@ void TimeFrameGPU::createCellsBuffers(const int layer) GPUTimer timer(mGpuStreams[layer], "creating cells buffers"); mNCells[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); + mGpuStreams[layer].sync(); // ensure number of cells is correct GPULog("gpu-transfer: creating cell buffer for {} elements on layer {}, for {:.2f} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / constants::MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), mGpuStreams[layer], this->getExtAllocator()); + GPUChkErrS(cudaMemcpyAsync(&mCellsDeviceArray[layer], &mCellsDevice[layer], sizeof(CellSeed*), cudaMemcpyHostToDevice, mGpuStreams[layer].get())); } template @@ -415,43 +418,45 @@ void TimeFrameGPU::loadCellsLUTDevice() template void TimeFrameGPU::loadRoadsDevice() { - GPUTimer timer(mGpuStreams[0], "loading roads device"); + GPUTimer timer("loading roads device"); GPULog("gpu-transfer: loading {} roads, for {:.2f} MB.", this->mRoads.size(), this->mRoads.size() * sizeof(Road) / constants::MB); - allocMemAsync(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mRoadsDevice), this->mRoads.size() * sizeof(Road), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mRoadsDevice, this->mRoads.data(), this->mRoads.size() * sizeof(Road), cudaMemcpyHostToDevice)); } template void TimeFrameGPU::loadTrackSeedsDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "loading track seeds"); + GPUTimer timer("loading track seeds"); GPULog("gpu-transfer: loading {} track seeds, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(CellSeed) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), mGpuStreams[0], this->getExtAllocator()); + allocMem(reinterpret_cast(&mTrackSeedsDevice), seeds.size() * sizeof(CellSeed), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(seeds.data(), seeds.size() * sizeof(CellSeed), cudaHostRegisterPortable)); - GPUChkErrS(cudaMemcpyAsync(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mTrackSeedsDevice, seeds.data(), seeds.size() * sizeof(CellSeed), cudaMemcpyHostToDevice)); } template void TimeFrameGPU::createNeighboursDevice(const unsigned int layer) { - GPUTimer timer(mGpuStreams[layer], "reserving neighbours"); + GPUTimer timer(mGpuStreams[layer], "reserving neighbours", layer); + this->mNNeighbours[layer] = 0; GPUChkErrS(cudaMemcpyAsync(&(this->mNNeighbours[layer]), &(mNeighboursLUTDevice[layer][this->mNCells[layer + 1] - 1]), sizeof(unsigned int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); - GPULog("gpu-allocation: reserving {} neighbours (pairs), for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); + mGpuStreams[layer].sync(); // ensure number of neighbours is correct + GPULog("gpu-allocation: reserving {} 
neighbours (pairs), for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); allocMemAsync(reinterpret_cast(&mNeighbourPairsDevice[layer]), (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer], this->getExtAllocator()); GPUChkErrS(cudaMemsetAsync(mNeighbourPairsDevice[layer], -1, (this->mNNeighbours[layer]) * sizeof(gpuPair), mGpuStreams[layer].get())); - GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", nNeighbours, nNeighbours * sizeof(gpuPair) / constants::MB); + GPULog("gpu-allocation: reserving {} neighbours, for {:.2f} MB.", this->mNNeighbours[layer], (this->mNNeighbours[layer]) * sizeof(gpuPair) / constants::MB); allocMemAsync(reinterpret_cast(&mNeighboursDevice[layer]), (this->mNNeighbours[layer]) * sizeof(int), mGpuStreams[layer], this->getExtAllocator()); } template void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "reserving tracks"); + GPUTimer timer("reserving tracks"); mTrackITSExt = bounded_vector(seeds.size(), {}, this->getMemoryPool().get()); GPULog("gpu-allocation: reserving {} tracks, for {:.2f} MB.", seeds.size(), seeds.size() * sizeof(o2::its::TrackITSExt) / constants::MB); - allocMemAsync(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0], this->getExtAllocator()); - GPUChkErrS(cudaMemsetAsync(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt), mGpuStreams[0].get())); + allocMem(reinterpret_cast(&mTrackITSExtDevice), seeds.size() * sizeof(o2::its::TrackITSExt), this->getExtAllocator()); + GPUChkErrS(cudaMemset(mTrackITSExtDevice, 0, seeds.size() * sizeof(o2::its::TrackITSExt))); GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } @@ -480,26 +485,25 @@ void TimeFrameGPU::downloadCellsLUTDevice() template void TimeFrameGPU::downloadCellsNeighboursDevice(std::vector>>& neighbours, const int layer) { - GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours from layer {}", layer)); + GPUTimer timer(mGpuStreams[layer], "downloading neighbours from layer", layer); GPULog("gpu-transfer: downloading {} neighbours, for {:.2f} MB.", neighbours[layer].size(), neighbours[layer].size() * sizeof(std::pair) / constants::MB); - // TODO: something less dangerous than assuming the same memory layout of std::pair and gpuPair... or not? 
:) - GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(neighbours[layer].data(), mNeighbourPairsDevice[layer], neighbours[layer].size() * sizeof(gpuPair), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); } template void TimeFrameGPU::downloadNeighboursLUTDevice(bounded_vector& lut, const int layer) { - GPUTimer timer(mGpuStreams[0], fmt::format("downloading neighbours LUT from layer {}", layer)); + GPUTimer timer(mGpuStreams[layer], "downloading neighbours LUT from layer", layer); GPULog("gpu-transfer: downloading neighbours LUT for {} elements on layer {}, for {:.2f} MB.", lut.size(), layer, lut.size() * sizeof(int) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpyAsync(lut.data(), mNeighboursLUTDevice[layer], lut.size() * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[layer].get())); } template void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& seeds) { - GPUTimer timer(mGpuStreams[0], "downloading tracks"); + GPUTimer timer("downloading tracks"); GPULog("gpu-transfer: downloading {} tracks, for {:.2f} MB.", mTrackITSExt.size(), mTrackITSExt.size() * sizeof(o2::its::TrackITSExt) / constants::MB); - GPUChkErrS(cudaMemcpyAsync(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + GPUChkErrS(cudaMemcpy(mTrackITSExt.data(), mTrackITSExtDevice, seeds.size() * sizeof(o2::its::TrackITSExt), cudaMemcpyDeviceToHost)); GPUChkErrS(cudaHostUnregister(mTrackITSExt.data())); GPUChkErrS(cudaHostUnregister(seeds.data())); } @@ -507,16 +511,36 @@ void TimeFrameGPU::downloadTrackITSExtDevice(bounded_vector& template void TimeFrameGPU::unregisterHostMemory(const int maxLayers) { - GPUTimer timer(mGpuStreams[0], "unregistering host memory"); + GPUTimer timer("unregistering host memory"); GPULog("unregistering host memory"); + + auto checkedUnregisterEntry = [](auto& bits, auto& vec, int layer) { + if (bits.test(layer)) { + GPUChkErrS(cudaHostUnregister(vec[layer].data())); + bits.reset(layer); + } + }; + auto checkedUnregisterArray = [](auto& bits, auto& vec) { + if (bits.test(nLayers)) { + GPUChkErrS(cudaHostUnregister(vec.data())); + bits.reset(nLayers); + } + }; + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostUnregister(this->mUnsortedClusters[iLayer].data())); - GPUChkErrS(cudaHostUnregister(this->mClusters[iLayer].data())); - GPUChkErrS(cudaHostUnregister(this->mTrackingFrameInfo[iLayer].data())); + checkedUnregisterEntry(mPinnedUsedClusters, this->mUsedClusters, iLayer); + checkedUnregisterEntry(mPinnedUnsortedClusters, this->mUnsortedClusters, iLayer); + checkedUnregisterEntry(mPinnedClusters, this->mClusters, iLayer); + checkedUnregisterEntry(mPinnedClustersIndexTables, this->mIndexTables, iLayer); + checkedUnregisterEntry(mPinnedTrackingFrameInfo, this->mTrackingFrameInfo, iLayer); + checkedUnregisterEntry(mPinnedROFramesClusters, this->mROFramesClusters, iLayer); } - GPUChkErrS(cudaHostUnregister(mTrackingFrameInfoDevice.data())); - GPUChkErrS(cudaHostUnregister(mUnsortedClustersDevice.data())); - GPUChkErrS(cudaHostUnregister(mClustersDevice.data())); + checkedUnregisterArray(mPinnedUsedClusters, mUsedClustersDevice); + checkedUnregisterArray(mPinnedUnsortedClusters, 
mUnsortedClustersDevice); + checkedUnregisterArray(mPinnedClusters, mClustersDevice); + checkedUnregisterArray(mPinnedClustersIndexTables, mClustersIndexTablesDevice); + checkedUnregisterArray(mPinnedTrackingFrameInfo, mTrackingFrameInfoDevice); + checkedUnregisterArray(mPinnedROFramesClusters, mROFramesClustersDevice); } template @@ -537,9 +561,29 @@ void TimeFrameGPU::syncStream(const size_t stream) } template -void TimeFrameGPU::syncStreams() +void TimeFrameGPU::syncStreams(const bool device) { - mGpuStreams.sync(); + mGpuStreams.sync(device); +} + +template +void TimeFrameGPU::waitEvent(const int stream, const int event) +{ + mGpuStreams.waitEvent(stream, event); +} + +template +void TimeFrameGPU::recordEvent(const int event) +{ + mGpuStreams[event].record(); +} + +template +void TimeFrameGPU::recordEvents(const int start, const int end) +{ + for (int i{start}; i < end; ++i) { + recordEvent(i); + } } template diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index bef271a1b0129..a6dfc041e4c71 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -11,6 +11,7 @@ /// #include +#include #include #include "DataFormatsITS/TrackITS.h" @@ -27,15 +28,21 @@ template void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) { mTimeFrameGPU->initialise(iteration, this->mTrkParams[iteration], nLayers); - mTimeFrameGPU->loadClustersDevice(iteration); - mTimeFrameGPU->loadUnsortedClustersDevice(iteration); - mTimeFrameGPU->loadClustersIndexTables(iteration); - mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); - mTimeFrameGPU->loadMultiplicityCutMask(iteration); + + // on default stream mTimeFrameGPU->loadVertices(iteration); - mTimeFrameGPU->loadROframeClustersDevice(iteration); - mTimeFrameGPU->createUsedClustersDevice(iteration); mTimeFrameGPU->loadIndexTableUtils(iteration); + mTimeFrameGPU->loadMultiplicityCutMask(iteration); + mTimeFrameGPU->createUsedClustersDeviceArray(iteration); + mTimeFrameGPU->createClustersDeviceArray(iteration); + mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration); + mTimeFrameGPU->createClustersIndexTablesArray(iteration); + mTimeFrameGPU->createTrackingFrameInfoDeviceArray(iteration); + mTimeFrameGPU->createROFrameClustersDeviceArray(iteration); + mTimeFrameGPU->createTrackletsLUTDeviceArray(iteration); + mTimeFrameGPU->createTrackletsBuffersArray(iteration); + mTimeFrameGPU->createCellsBuffersArray(iteration); + mTimeFrameGPU->createCellsLUTDeviceArray(iteration); } template @@ -53,40 +60,23 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i int startROF{this->mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * this->mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{o2::gpu::CAMath::Min(this->mTrkParams[iteration].nROFsPerIterations > 0 ? 
(iROFslice + 1) * this->mTrkParams[iteration].nROFsPerIterations + this->mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof(), mTimeFrameGPU->getNrof())}; - mTimeFrameGPU->createTrackletsLUTDevice(iteration); - countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), - mTimeFrameGPU->getDeviceMultCutMask(), - startROF, - endROF, - mTimeFrameGPU->getNrof(), - this->mTrkParams[iteration].DeltaROF, - iVertex, - mTimeFrameGPU->getDeviceVertices(), - mTimeFrameGPU->getDeviceROFramesPV(), - mTimeFrameGPU->getPrimaryVerticesNum(), - mTimeFrameGPU->getDeviceArrayClusters(), - mTimeFrameGPU->getClusterSizes(), - mTimeFrameGPU->getDeviceROframeClusters(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), - mTimeFrameGPU->getDeviceArrayClustersIndexTables(), - mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums - iteration, - this->mTrkParams[iteration].NSigmaCut, - mTimeFrameGPU->getPhiCuts(), - this->mTrkParams[iteration].PVres, - mTimeFrameGPU->getMinRs(), - mTimeFrameGPU->getMaxRs(), - mTimeFrameGPU->getPositionResolutions(), - this->mTrkParams[iteration].LayerRadii, - mTimeFrameGPU->getMSangles(), - mTimeFrameGPU->getExternalAllocator(), - conf.nBlocksLayerTracklets[iteration], - conf.nThreadsLayerTracklets[iteration], - mTimeFrameGPU->getStreams()); - mTimeFrameGPU->createTrackletsBuffers(); - computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + for (int iLayer = 0; iLayer < nLayers; ++iLayer) { + // TODO lazy loading of essential data on separate streams + mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer); + mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer); + mTimeFrameGPU->recordEvent(iLayer); + } + + // processing starts here + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { + mTimeFrameGPU->createTrackletsLUTDevice(iteration, iLayer); + mTimeFrameGPU->waitEvent(iLayer, iLayer + 1); // wait stream until all data is available + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), mTimeFrameGPU->getDeviceMultCutMask(), + iLayer, startROF, endROF, mTimeFrameGPU->getNrof(), @@ -97,14 +87,11 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getPrimaryVerticesNum(), mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getClusterSizes(), - mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceROFrameClusters(), mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), - mTimeFrameGPU->getDeviceArrayTracklets(), - mTimeFrameGPU->getDeviceTracklet(), - mTimeFrameGPU->getNTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getDeviceTrackletsLUTs(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums iteration, this->mTrkParams[iteration].NSigmaCut, mTimeFrameGPU->getPhiCuts(), @@ -118,24 +105,69 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i conf.nBlocksLayerTracklets[iteration], conf.nThreadsLayerTracklets[iteration], mTimeFrameGPU->getStreams()); + mTimeFrameGPU->createTrackletsBuffers(iLayer); + if (mTimeFrameGPU->getNTracklets()[iLayer] == 0) { + return; + } + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + 
mTimeFrameGPU->getDeviceMultCutMask(), + iLayer, + startROF, + endROF, + mTimeFrameGPU->getNrof(), + this->mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROFrameClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklets(), + mTimeFrameGPU->getNTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), + iteration, + this->mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + this->mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + this->mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + mTimeFrameGPU->getExternalAllocator(), + conf.nBlocksLayerTracklets[iteration], + conf.nThreadsLayerTracklets[iteration], + mTimeFrameGPU->getStreams()); + } } template void TrackerTraitsGPU::computeLayerCells(const int iteration) { - mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->syncStream(0); - for (int iLayer = 0; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - mTimeFrameGPU->syncStream(iLayer + 1); + for (int iLayer = 0; iLayer < nLayers; ++iLayer) { + // TODO lazy loading of essential data on separate streams + mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration, iLayer); + mTimeFrameGPU->recordEvent(iLayer); + } + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad(); ++iLayer) { // if there are no tracklets skip entirely const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; if (!currentLayerTrackletsNum || !mTimeFrameGPU->getNTracklets()[iLayer + 1]) { mTimeFrameGPU->getNCells()[iLayer] = 0; - continue; + return; } + + mTimeFrameGPU->createCellsLUTDevice(iLayer); + mTimeFrameGPU->waitEvent(iLayer, iLayer + 1); // wait stream until all data is available + mTimeFrameGPU->waitEvent(iLayer, iLayer + 2); // wait stream until all data is available countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), @@ -156,6 +188,9 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) conf.nThreadsLayerCells[iteration], mTimeFrameGPU->getStreams()); mTimeFrameGPU->createCellsBuffers(iLayer); + if (mTimeFrameGPU->getNCells()[iLayer] == 0) { + return; + } computeCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), @@ -180,21 +215,16 @@ void TrackerTraitsGPU::computeLayerCells(const int iteration) template void TrackerTraitsGPU::findCellsNeighbours(const int iteration) { - mTimeFrameGPU->createNeighboursIndexTablesDevice(); const auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - mTimeFrameGPU->syncStream(0); - for (int iLayer{0}; iLayer < this->mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - // ensure that celling is done for iLayer and iLayer+1 is done - mTimeFrameGPU->syncStream(iLayer + 1); - + for (int iLayer{0}; iLayer < this->mTrkParams[iteration].NeighboursPerRoad(); ++iLayer) { const int 
currentLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer])}; const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; if (!nextLayerCellsNum || !currentLayerCellsNum) { mTimeFrameGPU->getNNeighbours()[iLayer] = 0; continue; } - + mTimeFrameGPU->createNeighboursIndexTablesDevice(iLayer); mTimeFrameGPU->createNeighboursLUTDevice(iLayer, nextLayerCellsNum); countCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), // LUT is initialised here. @@ -214,6 +244,9 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) conf.nThreadsFindNeighbours[iteration], mTimeFrameGPU->getStream(iLayer)); mTimeFrameGPU->createNeighboursDevice(iLayer); + if (mTimeFrameGPU->getNNeighbours()[iLayer] == 0) { + continue; + } computeCellNeighboursHandler(mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceNeighboursLUT(iLayer), mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -236,8 +269,8 @@ void TrackerTraitsGPU::findCellsNeighbours(const int iteration) mTimeFrameGPU->getStream(iLayer), mTimeFrameGPU->getExternalAllocator()); } - mTimeFrameGPU->syncStreams(); // TODO evaluate if this can be removed -}; + mTimeFrameGPU->syncStreams(false); +} template void TrackerTraitsGPU::findRoads(const int iteration) diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 401d98ad63560..9b3df193abe34 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -23,16 +22,17 @@ #include #include #include -#include #include "ITStracking/Constants.h" +#include "ITStracking/Definitions.h" #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "ITStracking/ExternalAllocator.h" +#include "ITStracking/Tracklet.h" +#include "ITStracking/Cluster.h" +#include "ITStracking/Cell.h" #include "DataFormatsITS/TrackITS.h" -#include "ReconstructionDataFormats/Vertex.h" -#include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" #include "ITStrackingGPU/Utils.h" @@ -43,8 +43,6 @@ using namespace o2::track; namespace o2::its { -using Vertex = o2::dataformats::Vertex>; - namespace gpu { @@ -92,9 +90,9 @@ struct TypedAllocator { ExternalAllocator* mInternalAllocator; }; -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) { const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 
0.f : currentCluster.phi - maxdeltaphi; @@ -112,18 +110,18 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; } -GPUd() bool fitTrack(TrackITSExt& track, - int start, - int end, - int step, - float chi2clcut, - float chi2ndfcut, - float maxQoverPt, - int nCl, - float bz, - const TrackingFrameInfo** tfInfos, - const o2::base::Propagator* prop, - o2::base::PropagatorF::MatCorrType matCorrType) +GPUdii() bool fitTrack(TrackITSExt& track, + int start, + int end, + int step, + float chi2clcut, + float chi2ndfcut, + float maxQoverPt, + int nCl, + float bz, + const TrackingFrameInfo** tfInfos, + const o2::base::Propagator* prop, + o2::base::PropagatorF::MatCorrType matCorrType) { for (int iLayer{start}; iLayer != end; iLayer += step) { if (track.getClusterIndex(iLayer) == constants::UnusedIndex) { @@ -164,10 +162,10 @@ GPUd() bool fitTrack(TrackITSExt& track, return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } -GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, - const Cluster& cluster2, - const TrackingFrameInfo& tf3, - const float bz) +GPUdii() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, + const Cluster& cluster2, + const TrackingFrameInfo& tf3, + const float bz) { const float ca = o2::gpu::CAMath::Cos(tf3.alphaTrackingFrame), sa = o2::gpu::CAMath::Sin(tf3.alphaTrackingFrame); const float x1 = cluster1.xCoordinate * ca + cluster1.yCoordinate * sa; @@ -368,7 +366,7 @@ GPUg() void fitTrackSeedsKernel( } } -template // Version for new tracker to supersede the old one +template GPUg() void computeLayerCellNeighboursKernel( CellSeed** cellSeedArray, int* neighboursLUT, @@ -519,7 +517,7 @@ GPUg() void computeLayerCellsKernel( } } -template +template GPUg() void computeLayerTrackletsMultiROFKernel( const IndexTableUtils* utils, const uint8_t* multMask, @@ -578,6 +576,11 @@ GPUg() void computeLayerTrackletsMultiROFKernel( if (usedClusters[layerIndex][currentCluster.clusterId]) { continue; } + if constexpr (!initRun) { + if (trackletsLUT[layerIndex][currentSortedIndex] == trackletsLUT[layerIndex][currentSortedIndex + 1]) { + continue; + } + } const float inverseR0{1.f / currentCluster.radius}; for (int iV{startVtx}; iV < endVtx; ++iV) { @@ -642,7 +645,6 @@ GPUg() void computeLayerTrackletsMultiROFKernel( } } -template GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, int* trackletsLookUpTable, const int nTracklets) @@ -741,133 +743,6 @@ GPUg() void processNeighboursKernel(const int layer, } } -///////////////////////////////////////// -// Debug Kernels -///////////////////////////////////////// - -template -GPUd() void pPointer(T* ptr) -{ - printf("[%p]\t", ptr); -} - -template -GPUg() void printPointersKernel(std::tuple args) -{ - auto print_all = [&](auto... 
ptrs) { - (pPointer(ptrs), ...); - }; - std::apply(print_all, args); -} - -template -struct trackletSortEmptyFunctor { - GPUhd() bool operator()(const T& lhs, const T& rhs) const - { - return lhs.firstClusterIndex > rhs.firstClusterIndex; - } -}; - -template -struct trackletSortIndexFunctor { - GPUhd() bool operator()(const T& lhs, const T& rhs) const - { - return lhs.firstClusterIndex < rhs.firstClusterIndex || (lhs.firstClusterIndex == rhs.firstClusterIndex && lhs.secondClusterIndex < rhs.secondClusterIndex); - } -}; - -GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < size; ++i) { - if (!(i % len)) { - printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); - } - printf("%d\t", v[i]); - } - printf("\n"); - } -} - -GPUg() void printMatrixRow(const int row, int** mat, const unsigned int rowLength, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < rowLength; ++i) { - if (!(i % len)) { - printf("\n row %d: ===> %d/%d\t", row, i, (int)rowLength); - } - printf("%d\t", mat[row][i]); - } - printf("\n"); - } -} - -GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned int size, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < size; ++i) { - if (!(i % len)) { - printf("\n layer %d: ===> %d/%d\t", layer, i, (int)size); - } - printf("%p\t", (void*)v[i]); - } - printf("\n"); - } -} - -GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vertices: \n"); - for (int i{0}; i < size; ++i) { - printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); - } - } -} - -GPUg() void printNeighbours(const gpuPair* neighbours, - const int* nNeighboursIndexTable, - const unsigned int nCells, - const unsigned int tId = 0) -{ - for (unsigned int iNeighbour{0}; iNeighbour < nNeighboursIndexTable[nCells]; ++iNeighbour) { - if (threadIdx.x == tId) { - printf("%d -> %d\n", neighbours[iNeighbour].first, neighbours[iNeighbour].second); - } - } -} - -GPUg() void printTrackletsLUTPerROF(const int layerId, - const int** ROFClusters, - int** luts, - const int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (auto rofId{0}; rofId < 2304; ++rofId) { - int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId]; - if (!nClus) { - continue; - } - printf("rof: %d (%d) ==> ", rofId, nClus); - - for (int iC{0}; iC < nClus; ++iC) { - int nT = luts[layerId][ROFClusters[layerId][rofId] + iC]; - printf("%d\t", nT); - } - printf("\n"); - } - } -} - -GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = 0) -{ - for (unsigned int iCell{0}; iCell < nCells; ++iCell) { - if (threadIdx.x == tId) { - seed[iCell].printCell(); - } - } -} - GPUhi() void allocateMemory(void** p, size_t bytes, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) { if (alloc) { @@ -885,45 +760,12 @@ GPUhi() void deallocateMemory(void* p, size_t bytes, cudaStream_t stream = nullp GPUChkErrS(cudaFreeAsync(p, stream)); } } - -template -GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - 
GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); -} - -template -GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); -} - -template -GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr, ExternalAllocator* alloc = nullptr) -{ - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - allocateMemory(&d_temp_storage, temp_storage_bytes, stream, alloc); - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, in_out, num_items, stream)); - deallocateMemory(d_temp_storage, temp_storage_bytes, stream, alloc); -} - -template -GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr, o2::its::ExternalAllocator* alloc = nullptr) -{ - cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream, alloc); -} } // namespace gpu template void countTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -953,41 +795,41 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads, gpu::Streams& streams) { - for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( - utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - nullptr, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); - } + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + layer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + nullptr, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[layer], + resolutionPV, + minRs[layer + 1], + maxRs[layer + 1], + resolutions[layer], + radii[layer + 1] - radii[layer], + mulScatAng[layer]); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::exclusive_scan(nosync_policy, trackletsLUTsHost[layer], trackletsLUTsHost[layer] + nClusters[layer] + 1, trackletsLUTsHost[layer]); } template void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -1020,66 +862,45 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads, gpu::Streams& streams) { - for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( - utils, - multMask, - iLayer, - 
startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - tracklets, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); - if (nTracklets[iLayer]) { - Tracklet *tracklets_in = spanTracklets[iLayer], *tracklets_out{nullptr}; - size_t n = nTracklets[iLayer]; - size_t sort_temp_bytes = 0; - GPUChkErrS(cub::DeviceMergeSort::SortKeys(nullptr, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); - void* sort_temp_storage = nullptr; - gpu::allocateMemory(&sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceMergeSort::SortKeys(sort_temp_storage, sort_temp_bytes, tracklets_in, n, gpu::sort_tracklets{}, streams[iLayer].get())); - gpu::allocateMemory(reinterpret_cast(&tracklets_out), n * sizeof(Tracklet), streams[iLayer].get(), alloc); - size_t unique_temp_bytes = 0; - int* num_selected = nullptr; - gpu::allocateMemory(reinterpret_cast(&num_selected), sizeof(int), streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceSelect::Unique(nullptr, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); - void* unique_temp_storage = nullptr; - gpu::allocateMemory(&unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); - GPUChkErrS(cub::DeviceSelect::Unique(unique_temp_storage, unique_temp_bytes, tracklets_in, tracklets_out, num_selected, n, streams[iLayer].get())); - GPUChkErrS(cudaMemcpyAsync(tracklets_in, tracklets_out, n * sizeof(Tracklet), cudaMemcpyDeviceToDevice, streams[iLayer].get())); - GPUChkErrS(cudaMemcpyAsync(&nTracklets[iLayer], num_selected, sizeof(int), cudaMemcpyDeviceToHost, streams[iLayer].get())); - streams[iLayer].sync(); - gpu::deallocateMemory(tracklets_out, n * sizeof(Tracklet), streams[iLayer].get(), alloc); - gpu::deallocateMemory(sort_temp_storage, sort_temp_bytes, streams[iLayer].get(), alloc); - gpu::deallocateMemory(unique_temp_storage, unique_temp_bytes, streams[iLayer].get(), alloc); - gpu::deallocateMemory(num_selected, sizeof(int), streams[iLayer].get(), alloc); - } - if (iLayer > 0) { - GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int), streams[iLayer].get())); - gpu::compileTrackletsLookupTableKernel<<>>( - spanTracklets[iLayer], - trackletsLUTsHost[iLayer], - nTracklets[iLayer]); - gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1, streams[iLayer].get(), alloc); - } + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + layer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[layer], + resolutionPV, + minRs[layer + 1], + maxRs[layer + 1], + resolutions[layer], + radii[layer + 1] - radii[layer], + mulScatAng[layer]); + thrust::device_ptr tracklets_ptr(spanTracklets[layer]); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::sort(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[layer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(nosync_policy, tracklets_ptr, tracklets_ptr + nTracklets[layer], gpu::equal_tracklets()); + nTracklets[layer] = unique_end - tracklets_ptr; + if 
(layer) { + GPUChkErrS(cudaMemsetAsync(trackletsLUTsHost[layer], 0, (nClusters[layer] + 1) * sizeof(int), streams[layer].get())); + gpu::compileTrackletsLookupTableKernel<<>>( + spanTracklets[layer], + trackletsLUTsHost[layer], + nTracklets[layer]); + thrust::exclusive_scan(nosync_policy, trackletsLUTsHost[layer], trackletsLUTsHost[layer] + nClusters[layer] + 1, trackletsLUTsHost[layer]); } } @@ -1119,7 +940,8 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1, streams[layer].get(), alloc); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(streams[layer].get()); + thrust::exclusive_scan(nosync_policy, cellsLUTsHost, cellsLUTsHost + nTracklets + 1, cellsLUTsHost); } void computeCellsHandler( @@ -1190,8 +1012,9 @@ void countCellNeighboursHandler(CellSeed** cellsLayersDevice, layerIndex, nCells, maxCellNeighbours); - gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext, stream.get(), alloc); - gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1, stream.get(), alloc); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(stream.get()); + thrust::inclusive_scan(nosync_policy, neighboursLUT, neighboursLUT + nCellsNext, neighboursLUT); + thrust::exclusive_scan(nosync_policy, neighboursIndexTable, neighboursIndexTable + nCells + 1, neighboursIndexTable); } void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, @@ -1232,44 +1055,13 @@ int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, gpu::Stream& stream, o2::its::ExternalAllocator* allocator) { -#ifndef __HIPCC__ - int* d_num_selected = nullptr; - gpu::allocateMemory(reinterpret_cast(&d_num_selected), sizeof(int), stream.get(), allocator); - size_t select_bytes = 0; - GPUChkErrS(cub::DeviceSelect::If(nullptr, select_bytes, cellNeighbourPairs, static_cast*>(nullptr), d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); - void* select_temp = nullptr; - gpu::allocateMemory(&select_temp, select_bytes, stream.get(), allocator); - gpuPair* d_temp_valid = nullptr; - gpu::allocateMemory(reinterpret_cast(&d_temp_valid), nNeigh * sizeof(gpuPair), stream.get(), allocator); - GPUChkErrS(cub::DeviceSelect::If(select_temp, select_bytes, cellNeighbourPairs, d_temp_valid, d_num_selected, nNeigh, gpu::is_valid_pair(), stream.get())); - int newSize = 0; - GPUChkErrS(cudaMemcpyAsync(&newSize, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost, stream.get())); - stream.sync(); // needed to get newSize - size_t sort_bytes = 0; - GPUChkErrS(cub::DeviceMergeSort::SortPairs(nullptr, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); - void* sort_temp = nullptr; - gpu::allocateMemory(&sort_temp, sort_bytes, stream.get(), allocator); - GPUChkErrS(cub::DeviceMergeSort::SortPairs(sort_temp, sort_bytes, d_temp_valid, d_temp_valid, newSize, gpu::sort_by_second(), stream.get())); - size_t transform_bytes = 0; - GPUChkErrS(cub::DeviceTransform::Transform(nullptr, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); - void* transform_temp = nullptr; - gpu::allocateMemory(&transform_temp, transform_bytes, stream.get(), allocator); - GPUChkErrS(cub::DeviceTransform::Transform(transform_temp, transform_bytes, d_temp_valid, cellNeighbours, newSize, gpu::pair_to_first(), stream.get())); - gpu::deallocateMemory(transform_temp, transform_bytes, stream.get(), 
allocator); - gpu::deallocateMemory(d_temp_valid, newSize * sizeof(gpuPair), stream.get(), allocator); - gpu::deallocateMemory(sort_temp, sort_bytes, stream.get(), allocator); - gpu::deallocateMemory(d_num_selected, sizeof(int), stream.get(), allocator); - gpu::deallocateMemory(select_temp, select_bytes, stream.get(), allocator); -#else // FIXME using thrust here since hipcub does not yet have DeviceTransform - auto nosync_policy = THRUST_NAMESPACE::par_nosync.on(stream.get()); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(allocator)).on(stream.get()); thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); auto updatedEnd = thrust::remove_if(nosync_policy, neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); size_t newSize = updatedEnd - neighVectorPairs; thrust::stable_sort(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); thrust::transform(nosync_policy, neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); -#endif - return newSize; } @@ -1296,6 +1088,7 @@ void processNeighboursHandler(const int startLayer, auto allocInt = gpu::TypedAllocator(alloc); auto allocCellSeed = gpu::TypedAllocator(alloc); thrust::device_vector> foundSeedsTable(nCells[startLayer] + 1, 0, allocInt); + auto nosync_policy = THRUST_NAMESPACE::par_nosync(gpu::TypedAllocator(alloc)).on(gpu::Stream::DefaultStream); gpu::processNeighboursKernel<<>>( startLayer, @@ -1315,7 +1108,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1, gpu::Stream::DefaultStream, alloc); + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); thrust::device_vector> updatedCellId(foundSeedsTable.back(), 0, allocInt); thrust::device_vector> updatedCellSeed(foundSeedsTable.back(), allocCellSeed); @@ -1337,8 +1130,7 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); int level = startLevel; thrust::device_vector> lastCellId(allocInt); @@ -1350,7 +1142,7 @@ void processNeighboursHandler(const int startLayer, thrust::device_vector>(allocInt).swap(updatedCellId); auto lastCellSeedSize{lastCellSeed.size()}; foundSeedsTable.resize(lastCellSeedSize + 1); - thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); + thrust::fill(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), 0); gpu::processNeighboursKernel<<>>( iLayer, @@ -1370,13 +1162,13 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size(), gpu::Stream::DefaultStream, alloc); + thrust::exclusive_scan(nosync_policy, foundSeedsTable.begin(), foundSeedsTable.end(), foundSeedsTable.begin()); auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); - thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0); + thrust::fill(nosync_policy, updatedCellId.begin(), updatedCellId.end(), 0); updatedCellSeed.resize(foundSeeds); - thrust::fill(updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); + thrust::fill(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); 
gpu::processNeighboursKernel<<>>( iLayer, @@ -1396,11 +1188,10 @@ void processNeighboursHandler(const int startLayer, maxChi2ClusterAttachment, propagator, matCorrType); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); } + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); thrust::device_vector> outSeeds(updatedCellSeed.size(), allocCellSeed); - auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); + auto end = thrust::copy_if(nosync_policy, updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); auto s{end - outSeeds.begin()}; seedsHost.reserve(seedsHost.size() + s); thrust::copy(outSeeds.begin(), outSeeds.begin() + s, std::back_inserter(seedsHost)); @@ -1434,14 +1225,13 @@ void trackSeedHandler(CellSeed* trackSeeds, propagator, // const o2::base::Propagator* matCorrType); // o2::base::PropagatorF::MatCorrType thrust::device_ptr tr_ptr(tracks); - thrust::sort(tr_ptr, tr_ptr + nSeeds, gpu::compare_track_chi2()); - GPUChkErrS(cudaPeekAtLastError()); - GPUChkErrS(cudaDeviceSynchronize()); + GPUChkErrS(cudaStreamSynchronize(gpu::Stream::DefaultStream)); } template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF, @@ -1473,6 +1263,7 @@ template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, const uint8_t* multMask, + const int layer, const int startROF, const int endROF, const int maxROF,