diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index a1d52bff11f9d..2bd1550e7c72b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -25,13 +25,13 @@ namespace o2::its::gpu { template -class TimeFrameGPU : public TimeFrame +class TimeFrameGPU final : public TimeFrame { using typename TimeFrame::CellSeedN; using typename TimeFrame::IndexTableUtilsN; public: - TimeFrameGPU(); + TimeFrameGPU() = default; ~TimeFrameGPU() = default; /// Most relevant operations @@ -44,13 +44,13 @@ class TimeFrameGPU : public TimeFrame void loadTrackingFrameInfoDevice(const int, const int); void createTrackingFrameInfoDeviceArray(const int); void loadUnsortedClustersDevice(const int, const int); - void createUnsortedClustersDeviceArray(const int); + void createUnsortedClustersDeviceArray(const int, const int = nLayers); void loadClustersDevice(const int, const int); - void createClustersDeviceArray(const int); + void createClustersDeviceArray(const int, const int = nLayers); void loadClustersIndexTables(const int, const int); - void createClustersIndexTablesArray(const int iteration); + void createClustersIndexTablesArray(const int); void createUsedClustersDevice(const int, const int); - void createUsedClustersDeviceArray(const int); + void createUsedClustersDeviceArray(const int, const int = nLayers); void loadUsedClustersDevice(); void loadROFrameClustersDevice(const int, const int); void createROFrameClustersDeviceArray(const int); @@ -85,6 +85,12 @@ class TimeFrameGPU : public TimeFrame void downloadCellsDevice(); void downloadCellsLUTDevice(); + /// Vertexer + void createVtxTrackletsLUTDevice(const int32_t); + void createVtxTrackletsBuffers(const int32_t); + void createVtxLinesLUTDevice(const int32_t); + void createVtxLinesBuffer(const int32_t); + /// synchronization auto& getStream(const size_t stream) { return mGpuStreams[stream]; } auto& getStreams() { return mGpuStreams; } @@ -98,6 +104,8 @@ class TimeFrameGPU : public TimeFrame virtual void wipe() final; /// interface + virtual bool isGPU() const noexcept final { return true; } + virtual const char* getName() const noexcept { return "GPU"; } int getNClustersInRofSpan(const int, const int, const int) const; IndexTableUtilsN* getDeviceIndexTableUtils() { return mIndexTableUtilsDevice; } int* getDeviceROFramesClusters(const int layer) { return mROFramesClustersDevice[layer]; } @@ -122,7 +130,7 @@ class TimeFrameGPU : public TimeFrame const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } std::vector getClusterSizes(); - const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } + uint8_t** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } const int** getDeviceROFrameClusters() const { return mROFramesClustersDeviceArray; } Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } @@ -135,6 +143,19 @@ class TimeFrameGPU : public TimeFrame int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } + // Vertexer + auto& getDeviceNTrackletsPerROF() const noexcept { return mNTrackletsPerROFDevice; } + auto& getDeviceNTrackletsPerCluster() const noexcept { return mNTrackletsPerClusterDevice; } + auto& getDeviceNTrackletsPerClusterSum() const noexcept { return mNTrackletsPerClusterSumDevice; } + int32_t** getDeviceArrayNTrackletsPerROF() const noexcept { return mNTrackletsPerROFDeviceArray; } + int32_t** getDeviceArrayNTrackletsPerCluster() const noexcept { return mNTrackletsPerClusterDeviceArray; } + int32_t** getDeviceArrayNTrackletsPerClusterSum() const noexcept { return mNTrackletsPerClusterSumDeviceArray; } + uint8_t* getDeviceUsedTracklets() const noexcept { return mUsedTrackletsDevice; } + int32_t* getDeviceNLinesPerCluster() const noexcept { return mNLinesPerClusterDevice; } + int32_t* getDeviceNLinesPerClusterSum() const noexcept { return mNLinesPerClusterSumDevice; } + Line* getDeviceLines() const noexcept { return mLinesDevice; } + gsl::span getDeviceTrackletsPerROFs() { return mNTrackletsPerROFDevice; } + void setDevicePropagator(const o2::base::PropagatorImpl* p) final { this->mPropagatorDevice = p; } // Host-specific getters @@ -180,7 +201,7 @@ class TimeFrameGPU : public TimeFrame const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; const int** mClustersIndexTablesDeviceArray; - const unsigned char** mUsedClustersDeviceArray; + uint8_t** mUsedClustersDeviceArray; const int** mROFramesClustersDeviceArray; std::array mTrackletsDevice; std::array mTrackletsLUTDevice; @@ -208,6 +229,18 @@ class TimeFrameGPU : public TimeFrame std::array mTrackingFrameInfoDevice; const TrackingFrameInfo** mTrackingFrameInfoDeviceArray; + /// Vertexer + std::array mNTrackletsPerROFDevice; + std::array mNTrackletsPerClusterDevice; + std::array mNTrackletsPerClusterSumDevice; + uint8_t* mUsedTrackletsDevice; + int32_t* mNLinesPerClusterDevice; + int32_t* mNLinesPerClusterSumDevice; + int32_t** mNTrackletsPerROFDeviceArray; + int32_t** mNTrackletsPerClusterDeviceArray; + int32_t** mNTrackletsPerClusterSumDeviceArray; + Line* mLinesDevice; + // State Streams mGpuStreams; std::bitset mPinnedUnsortedClusters{0}; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 15fe6f05f7850..e5b9253ca4090 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -20,15 +20,22 @@ #include #include +#include "ITStracking/MathUtils.h" +#include "ITStracking/ExternalAllocator.h" + #include "GPUCommonDef.h" #include "GPUCommonHelpers.h" #include "GPUCommonLogger.h" +#include "GPUCommonDefAPI.h" +#ifdef GPUCA_GPUCODE +#include #ifndef __HIPCC__ #define THRUST_NAMESPACE thrust::cuda #else #define THRUST_NAMESPACE thrust::hip #endif +#endif #ifdef ITS_GPU_LOG #define GPULog(...) LOGP(info, __VA_ARGS__) @@ -38,6 +45,10 @@ namespace o2::its { +// FWD declarations +template +class IndexTableUtils; +class Tracklet; template using gpuPair = std::pair; @@ -282,6 +293,184 @@ class GPUTimer } }; #endif + +#ifdef GPUCA_GPUCODE +template +struct TypedAllocator { + using value_type = T; + using pointer = thrust::device_ptr; + using const_pointer = thrust::device_ptr; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + + TypedAllocator() noexcept : mInternalAllocator(nullptr) {} + explicit TypedAllocator(ExternalAllocator* a) noexcept : mInternalAllocator(a) {} + + template + TypedAllocator(const TypedAllocator& o) noexcept : mInternalAllocator(o.mInternalAllocator) + { + } + + pointer allocate(size_type n) + { + void* raw = mInternalAllocator->allocate(n * sizeof(T)); + return thrust::device_pointer_cast(static_cast(raw)); + } + + void deallocate(pointer p, size_type n) noexcept + { + if (!p) { + return; + } + void* raw = thrust::raw_pointer_cast(p); + mInternalAllocator->deallocate(static_cast(raw), n * sizeof(T)); + } + + bool operator==(TypedAllocator const& o) const noexcept + { + return mInternalAllocator == o.mInternalAllocator; + } + bool operator!=(TypedAllocator const& o) const noexcept + { + return !(*this == o); + } + + private: + ExternalAllocator* mInternalAllocator; +}; + +template +GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils* utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +{ + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; + const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 0.f : currentCluster.phi - maxdeltaphi; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; + const float phiRangeMax = (maxdeltaphi > o2::constants::math::PI) ? o2::constants::math::TwoPI : currentCluster.phi + maxdeltaphi; + + if (zRangeMax < -utils->getLayerZ(layerIndex) || + zRangeMin > utils->getLayerZ(layerIndex) || zRangeMin > zRangeMax) { + return {}; + } + + return int4{o2::gpu::CAMath::Max(0, utils->getZBinIndex(layerIndex, zRangeMin)), + utils->getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), + o2::gpu::CAMath::Min(utils->getNzBins() - 1, utils->getZBinIndex(layerIndex, zRangeMax)), + utils->getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; +} + +GPUdii() gpuSpan getPrimaryVertices(const int rof, + const int* roframesPV, + const int nROF, + const uint8_t* mask, + const Vertex* vertices) +{ + const int start_pv_id = roframesPV[rof]; + const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded + return gpuSpan(&vertices[start_pv_id], delta); +}; + +GPUdii() gpuSpan getPrimaryVertices(const int romin, + const int romax, + const int* roframesPV, + const int nROF, + const Vertex* vertices) +{ + const int start_pv_id = roframesPV[romin]; + const int stop_rof = romax >= nROF - 1 ? nROF : romax + 1; + return gpuSpan(&vertices[start_pv_id], roframesPV[stop_rof] - roframesPV[romin]); +}; + +GPUdii() gpuSpan getClustersOnLayer(const int rof, + const int totROFs, + const int layer, + const int** roframesClus, + const Cluster** clusters) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; + return gpuSpan(&(clusters[layer][start_clus_id]), delta); +} + +GPUdii() gpuSpan getTrackletsPerCluster(const int rof, + const int totROFs, + const int mode, + const int** roframesClus, + const Tracklet** tracklets) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[1][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id; + return gpuSpan(&(tracklets[mode][start_clus_id]), delta); +} + +GPUdii() gpuSpan getNTrackletsPerCluster(const int rof, + const int totROFs, + const int mode, + const int** roframesClus, + int** ntracklets) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[1][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id; + return gpuSpan(&(ntracklets[mode][start_clus_id]), delta); +} + +GPUdii() gpuSpan getNTrackletsPerCluster(const int rof, + const int totROFs, + const int mode, + const int** roframesClus, + const int** ntracklets) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[1][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id; + return gpuSpan(&(ntracklets[mode][start_clus_id]), delta); +} + +GPUdii() gpuSpan getNLinesPerCluster(const int rof, + const int totROFs, + const int** roframesClus, + int* nlines) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[1][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id; + return gpuSpan(&(nlines[start_clus_id]), delta); +} + +GPUdii() gpuSpan getNLinesPerCluster(const int rof, + const int totROFs, + const int** roframesClus, + const int* nlines) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[1][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[1][stop_rof] - start_clus_id; + return gpuSpan(&(nlines[start_clus_id]), delta); +} +#endif } // namespace gpu } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h index 5b1d9194e1174..dddc247466c65 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexerTraitsGPU.h @@ -31,32 +31,25 @@ namespace o2::its { -class VertexerTraitsGPU final : public VertexerTraits +template +class VertexerTraitsGPU final : public VertexerTraits { public: void initialise(const TrackingParameters&, const int iteration = 0) final; - void adoptTimeFrame(TimeFrame<7>*) noexcept final; + void adoptTimeFrame(TimeFrame* tf) noexcept final; void computeTracklets(const int iteration = 0) final; void computeTrackletMatching(const int iteration = 0) final; void computeVertices(const int iteration = 0) final; void updateVertexingParameters(const std::vector&, const TimeFrameGPUParameters&) final; - void computeVerticesHist(); bool isGPU() const noexcept final { return true; } const char* getName() const noexcept final { return "GPU"; } protected: - IndexTableUtils* mDeviceIndexTableUtils; - gpu::TimeFrameGPU<7>* mTimeFrameGPU; + gpu::TimeFrameGPU* mTimeFrameGPU; TimeFrameGPUParameters mTfGPUParams; }; -inline void VertexerTraitsGPU::adoptTimeFrame(TimeFrame<7>* tf) noexcept -{ - mTimeFrameGPU = static_cast*>(tf); - mTimeFrame = static_cast*>(tf); -} - } // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h index 059b1cdc29082..67f12bad8486c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/VertexingKernels.h @@ -12,46 +12,104 @@ #ifndef ITSTRACKINGGPU_VERTEXINGKERNELS_H_ #define ITSTRACKINGGPU_VERTEXINGKERNELS_H_ -#include "ITStracking/MathUtils.h" -#include "ITStracking/Configuration.h" -#include "ITStracking/ClusterLines.h" -#include "ITStracking/Tracklet.h" +#include +#include +#include +#include "ITStracking/Tracklet.h" +#include "ITStracking/Cluster.h" +#include "ITStracking/ClusterLines.h" #include "ITStrackingGPU/Utils.h" -#include "ITStrackingGPU/ClusterLinesGPU.h" -#include "ITStrackingGPU/VertexerTraitsGPU.h" -#include "ITStrackingGPU/TracerGPU.h" -namespace o2::its::gpu +namespace o2::its { -#ifdef GPUCA_GPUCODE // GPUg() global kernels must only when compiled by GPU compiler -template -GPUg() void trackleterKernelMultipleRof( - const Cluster* clustersNextLayer, // 0 2 - const Cluster* clustersCurrentLayer, // 1 1 - const int* sizeNextLClusters, - const int* sizeCurrentLClusters, - const int* nextIndexTables, - Tracklet* Tracklets, - int* foundTracklets, - const IndexTableUtils* utils, - const unsigned int startRofId, - const unsigned int rofSize, - const float phiCut, - const size_t maxTrackletsPerCluster); -#endif -template -void trackletFinderHandler(const Cluster* clustersNextLayer, // 0 2 - const Cluster* clustersCurrentLayer, // 1 1 - const int* sizeNextLClusters, - const int* sizeCurrentLClusters, - const int* nextIndexTables, - Tracklet* Tracklets, - int* foundTracklets, - const IndexTableUtils* utils, - const unsigned int startRofId, - const unsigned int rofSize, - const float phiCut, - const size_t maxTrackletsPerCluster = 1e2); -} // namespace o2::its::gpu + +/// Trackleting +template +void countTrackletsInROFsHandler(const IndexTableUtils* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int32_t vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + int32_t** trackletsPerClusterLUTs, + int32_t** trackletsPerClusterSumLUTs, + int32_t** trackletsPerROF, + const std::array& trackletsPerClusterLUTsHost, + const std::array& trackletsPerClusterSumLUTsHost, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + Tracklet** GPUrestrict() foundTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + const int32_t** GPUrestrict() trackletsPerROF, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); + +/// Selection +void countTrackletsMatchingInROFsHandler(const int32_t nRofs, + const int32_t deltaROF, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const Cluster** GPUrestrict() clusters, + uint8_t** GPUrestrict() usedClusters, + const Tracklet** GPUrestrict() foundTracklets, + uint8_t* GPUrestrict() usedTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + int32_t* GPUrestrict() linesPerClusterLUT, + int32_t* GPUrestrict() linesPerClusterSumLUT, + const int32_t iteration, + const float phiCut, + const float tanLambdaCut, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); + +void computeTrackletsMatchingInROFsHandler(const int32_t nRofs, + const int32_t deltaROF, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const Cluster** GPUrestrict() clusters, + const uint8_t** GPUrestrict() usedClusters, + const Tracklet** GPUrestrict() foundTracklets, + uint8_t* GPUrestrict() usedTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + const int32_t* GPUrestrict() linesPerClusterSumLUT, + Line* GPUrestrict() lines, + const int32_t iteration, + const float phiCut, + const float tanLambdaCut, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); + +} // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt index 3dff67dbccd80..1f6a046a81350 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/CMakeLists.txt @@ -22,8 +22,8 @@ if(CUDA_ENABLED) TimeFrameGPU.cu TracerGPU.cu TrackingKernels.cu - # VertexingKernels.cu - # VertexerTraitsGPU.cxx + VertexingKernels.cu + VertexerTraitsGPU.cxx PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::ITStracking O2::SimConfig diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 965bf27fdd12b..4da91522371f8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -28,12 +28,6 @@ namespace o2::its::gpu { -template -TimeFrameGPU::TimeFrameGPU() -{ - this->mIsGPU = true; -} - template void TimeFrameGPU::allocMemAsync(void** ptr, size_t size, Stream& stream, bool extAllocator) { @@ -69,14 +63,14 @@ void TimeFrameGPU::loadIndexTableUtils(const int iteration) } template -void TimeFrameGPU::createUnsortedClustersDeviceArray(const int iteration) +void TimeFrameGPU::createUnsortedClustersDeviceArray(const int iteration, const int maxLayers) { if (!iteration) { GPUTimer timer("creating unsorted clusters array"); allocMem(reinterpret_cast(&mUnsortedClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mUnsortedClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); mPinnedUnsortedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { GPUChkErrS(cudaHostRegister(this->mUnsortedClusters[iLayer].data(), this->mUnsortedClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); mPinnedUnsortedClusters.set(iLayer); } @@ -96,14 +90,14 @@ void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration, cons } template -void TimeFrameGPU::createClustersDeviceArray(const int iteration) +void TimeFrameGPU::createClustersDeviceArray(const int iteration, const int maxLayers) { if (!iteration) { GPUTimer timer("creating sorted clusters array"); allocMem(reinterpret_cast(&mClustersDeviceArray), nLayers * sizeof(Cluster*), this->getExtAllocator()); GPUChkErrS(cudaHostRegister(mClustersDevice.data(), nLayers * sizeof(Cluster*), cudaHostRegisterPortable)); mPinnedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { GPUChkErrS(cudaHostRegister(this->mClusters[iLayer].data(), this->mClusters[iLayer].size() * sizeof(Cluster), cudaHostRegisterPortable)); mPinnedClusters.set(iLayer); } @@ -150,15 +144,15 @@ void TimeFrameGPU::loadClustersIndexTables(const int iteration, const i } template -void TimeFrameGPU::createUsedClustersDeviceArray(const int iteration) +void TimeFrameGPU::createUsedClustersDeviceArray(const int iteration, const int maxLayers) { if (!iteration) { GPUTimer timer("creating used clusters flags"); - allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), this->getExtAllocator()); - GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaHostRegisterPortable)); + allocMem(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(uint8_t*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mUsedClustersDevice.data(), nLayers * sizeof(uint8_t*), cudaHostRegisterPortable)); mPinnedUsedClusters.set(nLayers); - for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { - GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaHostRegisterPortable)); + for (auto iLayer{0}; iLayer < o2::gpu::CAMath::Min(maxLayers, nLayers); ++iLayer) { + GPUChkErrS(cudaHostRegister(this->mUsedClusters[iLayer].data(), this->mUsedClusters[iLayer].size() * sizeof(uint8_t), cudaHostRegisterPortable)); mPinnedUsedClusters.set(iLayer); } } @@ -460,6 +454,87 @@ void TimeFrameGPU::createTrackITSExtDevice(bounded_vector& s GPUChkErrS(cudaHostRegister(mTrackITSExt.data(), seeds.size() * sizeof(o2::its::TrackITSExt), cudaHostRegisterPortable)); } +template +void TimeFrameGPU::createVtxTrackletsLUTDevice(const int32_t iteration) +{ + GPUTimer timer("creating vertexer tracklet LUTs"); + const int32_t ncls = this->mClusters[1].size(); + for (int32_t iMode{0}; iMode < 2; ++iMode) { + if (!iteration) { + GPULog("gpu-transfer: creating vertexer tracklets per cluster for {} elements for mode {}, for {:.2f} MB.", ncls, iMode, ncls * sizeof(int32_t) / constants::MB); + allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterDevice[iMode]), ncls * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + + GPULog("gpu-transfer: creating vertexer tracklets per cluster sum for {} elements for mode {}, for {:.2f} MB.", ncls + 1, iMode, (ncls + 1) * sizeof(int32_t) / constants::MB); + allocMemAsync(reinterpret_cast(&mNTrackletsPerClusterSumDevice[iMode]), (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + + GPULog("gpu-transfer: creating vertexer tracklets per ROF for {} elements for mode {}, for {:.2f} MB.", this->mNrof + 1, iMode, (this->mNrof + 1) * sizeof(int32_t) / constants::MB); + allocMemAsync(reinterpret_cast(&mNTrackletsPerROFDevice[iMode]), (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode], this->getExtAllocator()); + } + GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterDevice[iMode], 0, ncls * sizeof(int32_t), mGpuStreams[iMode].get())); + GPUChkErrS(cudaMemsetAsync(mNTrackletsPerClusterSumDevice[iMode], 0, (ncls + 1) * sizeof(int32_t), mGpuStreams[iMode].get())); + GPUChkErrS(cudaMemsetAsync(mNTrackletsPerROFDevice[iMode], 0, (this->mNrof + 1) * sizeof(int32_t), mGpuStreams[iMode].get())); + } + mGpuStreams[0].sync(); + mGpuStreams[1].sync(); + if (!iteration) { + allocMem(reinterpret_cast(&mNTrackletsPerClusterDeviceArray), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterDeviceArray, mNTrackletsPerClusterDevice.data(), mNTrackletsPerClusterDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); + + allocMem(reinterpret_cast(&mNTrackletsPerClusterSumDeviceArray), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mNTrackletsPerClusterSumDeviceArray, mNTrackletsPerClusterSumDevice.data(), mNTrackletsPerClusterSumDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); + + allocMem(reinterpret_cast(&mNTrackletsPerROFDeviceArray), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), this->getExtAllocator()); + GPUChkErrS(cudaMemcpy(mNTrackletsPerROFDeviceArray, mNTrackletsPerROFDevice.data(), mNTrackletsPerROFDevice.size() * sizeof(int32_t*), cudaMemcpyHostToDevice)); + } +} + +template +void TimeFrameGPU::createVtxTrackletsBuffers(const int32_t iteration) +{ + GPUTimer timer("creating vertexer tracklet buffers"); + for (int32_t iMode{0}; iMode < 2; ++iMode) { + this->mTotalTracklets[iMode] = 0; + GPUChkErrS(cudaMemcpyAsync(&(this->mTotalTracklets[iMode]), mNTrackletsPerClusterSumDevice[iMode] + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost, mGpuStreams[iMode].get())); + GPULog("gpu-transfer: creating vertexer tracklets buffer for {} elements on layer {}, for {:.2f} MB.", this->mTotalTracklets[iMode], iMode, this->mTotalTracklets[iMode] * sizeof(Tracklet) / constants::MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iMode]), this->mTotalTracklets[iMode] * sizeof(Tracklet), mGpuStreams[iMode], this->getExtAllocator()); + } + mGpuStreams[0].sync(); + mGpuStreams[1].sync(); + allocMem(reinterpret_cast(&mTrackletsDeviceArray), 2 * sizeof(Tracklet*), this->getExtAllocator()); + GPUChkErrS(cudaHostRegister(mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaHostRegisterPortable)); + GPUChkErrS(cudaMemcpy(mTrackletsDeviceArray, mTrackletsDevice.data(), 2 * sizeof(Tracklet*), cudaMemcpyHostToDevice)); +} + +template +void TimeFrameGPU::createVtxLinesLUTDevice(const int32_t iteration) +{ + GPUTimer timer("creating vertexer lines LUT and used tracklets buffer"); + const int32_t ncls = this->mClusters[1].size(); + + GPULog("gpu-transfer: creating vertexer lines per cluster for {} elements , for {:.2f} MB.", ncls, ncls * sizeof(int32_t) / constants::MB); + allocMem(reinterpret_cast(&mNLinesPerClusterDevice), ncls * sizeof(int32_t), this->getExtAllocator()); + + GPULog("gpu-transfer: creating vertexer lines per cluster sum for {} elements , for {:.2f} MB.", ncls + 1, (ncls + 1) * sizeof(int32_t) / constants::MB); + allocMem(reinterpret_cast(&mNLinesPerClusterSumDevice), (ncls + 1) * sizeof(int32_t), this->getExtAllocator()); + + const int32_t ntrkls = this->mTotalTracklets[0]; + GPULog("gpu-transfer: creating vertexer used tracklets for {} elements , for {:.2f} MB.", ntrkls, ntrkls * sizeof(uint8_t) / constants::MB); + allocMem(reinterpret_cast(&mUsedTrackletsDevice), ntrkls * sizeof(uint8_t), this->getExtAllocator()); +} + +template +void TimeFrameGPU::createVtxLinesBuffer(const int32_t iteration) +{ + GPUTimer timer("creating vertexer lines buffer and resetting used tracklets"); + int32_t nlines = 0; + GPUChkErrS(cudaMemcpy(&nlines, mNLinesPerClusterDevice + this->mClusters[1].size(), sizeof(int32_t), cudaMemcpyDeviceToHost)); + this->mTotalLines = nlines; + GPULog("gpu-transfer: creating vertexer lines for {} elements , for {:.2f} MB.", nlines, nlines * sizeof(Line) / constants::MB); + allocMem(reinterpret_cast(&mLinesDevice), nlines * sizeof(Line), this->getExtAllocator()); + // reset used tracklets + GPUChkErrS(cudaMemset(mUsedTrackletsDevice, 0, this->mTotalTracklets[0] * sizeof(uint8_t))); +} + template void TimeFrameGPU::downloadCellsDevice() { diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 6a824de851fed..047d42d815e99 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -93,7 +93,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getClusterSizes(), mTimeFrameGPU->getDeviceROFrameClusters(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), + (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), mTimeFrameGPU->getDeviceTrackletsLUTs(), @@ -128,7 +128,7 @@ void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int i mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getClusterSizes(), mTimeFrameGPU->getDeviceROFrameClusters(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), + (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceArrayClustersIndexTables(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceTracklets(), @@ -300,7 +300,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) mTimeFrameGPU->getDeviceArrayCells(), mTimeFrameGPU->getDeviceCells()[startLayer], mTimeFrameGPU->getArrayNCells(), - mTimeFrameGPU->getDeviceArrayUsedClusters(), + (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(), mTimeFrameGPU->getDeviceNeighboursAll(), mTimeFrameGPU->getDeviceNeighboursLUTs(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 0e5fd21e5e90e..2c43ccd3bb81c 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -46,76 +46,6 @@ namespace o2::its namespace gpu { -template -struct TypedAllocator { - using value_type = T; - using pointer = thrust::device_ptr; - using const_pointer = thrust::device_ptr; - using size_type = std::size_t; - using difference_type = std::ptrdiff_t; - - TypedAllocator() noexcept : mInternalAllocator(nullptr) {} - explicit TypedAllocator(ExternalAllocator* a) noexcept : mInternalAllocator(a) {} - - template - TypedAllocator(const TypedAllocator& o) noexcept : mInternalAllocator(o.mInternalAllocator) - { - } - - pointer allocate(size_type n) - { - void* raw = mInternalAllocator->allocate(n * sizeof(T)); - return thrust::device_pointer_cast(static_cast(raw)); - } - - void deallocate(pointer p, size_type n) noexcept - { - if (!p) { - return; - } - void* raw = thrust::raw_pointer_cast(p); - mInternalAllocator->deallocate(static_cast(raw), n * sizeof(T)); - } - - bool operator==(TypedAllocator const& o) const noexcept - { - return mInternalAllocator == o.mInternalAllocator; - } - bool operator!=(TypedAllocator const& o) const noexcept - { - return !(*this == o); - } - - private: - ExternalAllocator* mInternalAllocator; -}; - -GPUdii() int4 getEmptyBinsRect() -{ - return int4{0, 0, 0, 0}; -} - -template -GPUdii() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) -{ - const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; - const float phiRangeMin = (maxdeltaphi > o2::constants::math::PI) ? 0.f : currentCluster.phi - maxdeltaphi; - const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; - const float phiRangeMax = (maxdeltaphi > o2::constants::math::PI) ? o2::constants::math::TwoPI : currentCluster.phi + maxdeltaphi; - - if (zRangeMax < -utils.getLayerZ(layerIndex) || - zRangeMin > utils.getLayerZ(layerIndex) || zRangeMin > zRangeMax) { - return getEmptyBinsRect(); - } - - return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex, zRangeMin)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::CAMath::Min(utils.getNzBins() - 1, utils.getZBinIndex(layerIndex, zRangeMax)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; -} - GPUdii() bool fitTrack(TrackITSExt& track, int start, int end, @@ -272,44 +202,6 @@ struct compare_track_chi2 { } }; -GPUdii() gpuSpan getPrimaryVertices(const int rof, - const int* roframesPV, - const int nROF, - const uint8_t* mask, - const Vertex* vertices) -{ - const int start_pv_id = roframesPV[rof]; - const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; - const size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded - return gpuSpan(&vertices[start_pv_id], delta); -}; - -GPUdii() gpuSpan getPrimaryVertices(const int romin, - const int romax, - const int* roframesPV, - const int nROF, - const Vertex* vertices) -{ - const int start_pv_id = roframesPV[romin]; - const int stop_rof = romax >= nROF - 1 ? nROF : romax + 1; - return gpuSpan(&vertices[start_pv_id], roframesPV[stop_rof] - roframesPV[romin]); -}; - -GPUdii() gpuSpan getClustersOnLayer(const int rof, - const int totROFs, - const int layer, - const int** roframesClus, - const Cluster** clusters) -{ - if (rof < 0 || rof >= totROFs) { - return gpuSpan(); - } - const int start_clus_id{roframesClus[layer][rof]}; - const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; - const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; - return gpuSpan(&(clusters[layer][start_clus_id]), delta); -} - template GPUg() void __launch_bounds__(256, 1) fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -602,7 +494,7 @@ GPUg() void __launch_bounds__(256, 1) computeLayerTrackletsMultiROFKernel( const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; const float sqInverseDeltaZ0{1.f / (math_utils::Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + constants::Tolerance)}; /// protecting from overflows adding the detector resolution const float sigmaZ{o2::gpu::CAMath::Sqrt(math_utils::Sq(resolution) * math_utils::Sq(tanLambda) * ((math_utils::Sq(inverseR0) + sqInverseDeltaZ0) * math_utils::Sq(meanDeltaR) + 1.f) + math_utils::Sq(meanDeltaR * MSAngle))}; - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex + 1, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex + 1, utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { continue; } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx index 90d654a26a43d..658d3cf0dfb91 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexerTraitsGPU.cxx @@ -11,217 +11,169 @@ // /// \author matteo.concas@cern.ch -#include -#include -#include -#include -#include -#include - -#ifdef VTX_DEBUG -#include "TTree.h" -#include "TFile.h" -#endif +#include +#include "ITStracking/TrackingConfigParam.h" #include "ITStrackingGPU/VertexingKernels.h" #include "ITStrackingGPU/VertexerTraitsGPU.h" namespace o2::its { -void VertexerTraitsGPU::initialise(const TrackingParameters& trackingParams, const int iteration) +template +void VertexerTraitsGPU::initialise(const TrackingParameters& trackingParams, const int iteration) +{ + // FIXME + // Two things to fix here: + // This loads all necessary data for this step at once, can be overlayed with computation + // Also if running with the tracker some data is loaded twice! + mTimeFrameGPU->initialise(0, trackingParams, 3, &this->mIndexTableUtils, &mTfGPUParams); + + // FIXME some of these only need to be created once! + mTimeFrameGPU->loadIndexTableUtils(iteration); + mTimeFrameGPU->createUsedClustersDeviceArray(iteration, 3); + mTimeFrameGPU->createClustersDeviceArray(iteration, 3); + mTimeFrameGPU->createUnsortedClustersDeviceArray(iteration, 3); + mTimeFrameGPU->createClustersIndexTablesArray(iteration); + mTimeFrameGPU->createROFrameClustersDeviceArray(iteration); + for (int iLayer{0}; iLayer < 3; ++iLayer) { + mTimeFrameGPU->loadClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadUnsortedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadClustersIndexTables(iteration, iLayer); + mTimeFrameGPU->createUsedClustersDevice(iteration, iLayer); + mTimeFrameGPU->loadROFrameClustersDevice(iteration, iLayer); + } +} + +template +void VertexerTraitsGPU::adoptTimeFrame(TimeFrame* tf) noexcept { - mTimeFrameGPU->initialise(0, trackingParams, 3, &mIndexTableUtils, &mTfGPUParams); + mTimeFrameGPU = static_cast*>(tf); + this->mTimeFrame = static_cast*>(tf); } -void VertexerTraitsGPU::updateVertexingParameters(const std::vector& vrtPar, const TimeFrameGPUParameters& tfPar) +template +void VertexerTraitsGPU::updateVertexingParameters(const std::vector& vrtPar, const TimeFrameGPUParameters& tfPar) { - mVrtParams = vrtPar; + this->mVrtParams = vrtPar; mTfGPUParams = tfPar; - mIndexTableUtils.setTrackingParameters(vrtPar[0]); - for (auto& par : mVrtParams) { - par.phiSpan = static_cast(std::ceil(mIndexTableUtils.getNphiBins() * par.phiCut / o2::constants::math::TwoPI)); - par.zSpan = static_cast(std::ceil(par.zCut * mIndexTableUtils.getInverseZCoordinate(0))); + this->mIndexTableUtils.setTrackingParameters(vrtPar[0]); + for (auto& par : this->mVrtParams) { + par.phiSpan = static_cast(std::ceil(this->mIndexTableUtils.getNphiBins() * par.phiCut / o2::constants::math::TwoPI)); + par.zSpan = static_cast(std::ceil(par.zCut * this->mIndexTableUtils.getInverseZCoordinate(0))); } } -void VertexerTraitsGPU::computeTracklets(const int iteration) +template +void VertexerTraitsGPU::computeTracklets(const int iteration) { - if (!mTimeFrameGPU->getClusters().size()) { + if (mTimeFrameGPU->getClusters().empty()) { return; } - // std::vector threads(mTimeFrameGPU->getNChunks()); - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int rofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; - // mTimeFrameGPU->getVerticesInChunks()[chunkId].clear(); - // mTimeFrameGPU->getNVerticesInChunks()[chunkId].clear(); - // mTimeFrameGPU->getLabelsInChunks()[chunkId].clear(); - // auto doVertexReconstruction = [&, chunkId, rofPerChunk]() -> void { - // auto offset = chunkId * rofPerChunk; - // auto maxROF = offset + rofPerChunk; - // while (offset < maxROF) { - // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); - // RANGE("chunk_gpu_vertexing", 1); - // // gpu::GpuTimer timer{offset, mTimeFrameGPU->getStream(chunkId).get()}; - // // timer.Start("vtTrackletFinder"); - // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clustersNextLayer, // 0 2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeNextLClusters, - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(0), // const int* nextIndexTables, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* Tracklets, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // int* foundTracklets, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // mVrtParams.phiCut, // const float phiCut, - // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 - - // gpu::trackleterKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(2), // const Cluster* clustersNextLayer, // 0 2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clustersCurrentLayer, // 1 1 - // mTimeFrameGPU->getDeviceROframesClusters(2), // const int* sizeNextLClusters, - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeCurrentLClusters, - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(2), // const int* nextIndexTables, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* Tracklets, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // int* foundTracklets, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // mVrtParams.phiCut, // const float phiCut, - // mVrtParams.maxTrackletsPerCluster); // const size_t maxTrackletsPerCluster = 1e2 - - // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets - // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan - // offset, // const unsigned int startRofId, // Starting ROF ID - // rofs, // const unsigned int rofSize, // Number of ROFs to consider - // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster - // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda - // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi - - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), - // mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Reset used tracklets - // checkGPUError(cudaMemsetAsync(mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), - // false, - // sizeof(unsigned char) * mVrtParams.maxTrackletsPerCluster * mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1), - // mTimeFrameGPU->getStream(chunkId).get()), - // __FILE__, __LINE__); - - // gpu::trackletSelectionKernelMultipleRof<<getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(0), // const Cluster* clusters0, // Clusters on layer 0 - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(1), // const Cluster* clusters1, // Clusters on layer 1 - // mTimeFrameGPU->getDeviceROframesClusters(0), // const int* sizeClustersL0, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(1), // const int* sizeClustersL1, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(0), // Tracklet* tracklets01, // Tracklets on layer 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(1), // Tracklet* tracklets12, // Tracklets on layer 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(0), // const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 - // mTimeFrameGPU->getChunk(chunkId).getDeviceNTrackletCluster(1), // const int* nFoundTracklet12, // Number of tracklets found on layers 1-2 - // mTimeFrameGPU->getChunk(chunkId).getDeviceUsedTracklets(), // unsigned char* usedTracklets, // Used tracklets - // mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), // Line* lines, // Lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines(), // int* nFoundLines, // Number of found lines - // mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), // int* nExclusiveFoundLines, // Number of found lines exclusive scan - // offset, // const unsigned int startRofId, // Starting ROF ID - // rofs, // const unsigned int rofSize, // Number of ROFs to consider - // mVrtParams.maxTrackletsPerCluster, // const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster - // mVrtParams.tanLambdaCut, // const float tanLambdaCut = 0.025f, // Cut on tan lambda - // mVrtParams.phiCut); // const float phiCut = 0.002f) // Cut on phi - - // int nClusters = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, 1); - // int lastFoundLines; - // std::vector exclusiveFoundLinesHost(nClusters + 1); - - // // Obtain whole exclusive sum including nCluster+1 element (nCluster+1)th element is the total number of found lines. - // checkGPUError(cudaMemcpyAsync(exclusiveFoundLinesHost.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceNExclusiveFoundLines(), (nClusters) * sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // checkGPUError(cudaMemcpyAsync(&lastFoundLines, mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundLines() + nClusters - 1, sizeof(int), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // exclusiveFoundLinesHost[nClusters] = exclusiveFoundLinesHost[nClusters - 1] + lastFoundLines; - - // std::vector lines(exclusiveFoundLinesHost[nClusters]); - - // checkGPUError(cudaMemcpyAsync(lines.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceLines(), sizeof(Line) * lines.size(), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // checkGPUError(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); - - // // Compute vertices - // std::vector clusterLines; - // std::vector usedLines; - // for (int rofId{0}; rofId < rofs; ++rofId) { - // auto rof = offset + rofId; - // auto clustersL1offsetRof = mTimeFrameGPU->getROframeClusters(1)[rof] - mTimeFrameGPU->getROframeClusters(1)[offset]; // starting cluster offset for this ROF - // auto nClustersL1Rof = mTimeFrameGPU->getROframeClusters(1)[rof + 1] - mTimeFrameGPU->getROframeClusters(1)[rof]; // number of clusters for this ROF - // auto linesOffsetRof = exclusiveFoundLinesHost[clustersL1offsetRof]; // starting line offset for this ROF - // auto nLinesRof = exclusiveFoundLinesHost[clustersL1offsetRof + nClustersL1Rof] - linesOffsetRof; - // gsl::span linesInRof(lines.data() + linesOffsetRof, static_cast::size_type>(nLinesRof)); - - // usedLines.resize(linesInRof.size(), false); - // usedLines.assign(linesInRof.size(), false); - // clusterLines.clear(); - // clusterLines.reserve(nClustersL1Rof); - // computeVerticesInRof(rof, - // linesInRof, - // usedLines, - // clusterLines, - // mTimeFrameGPU->getBeamXY(), - // mTimeFrameGPU->getVerticesInChunks()[chunkId], - // mTimeFrameGPU->getNVerticesInChunks()[chunkId], - // mTimeFrameGPU, - // mTimeFrameGPU->hasMCinformation() ? &mTimeFrameGPU->getLabelsInChunks()[chunkId] : nullptr); - // } - // offset += rofs; - // } - // }; - // // Do work - // threads[chunkId] = std::thread(doVertexReconstruction); - // } - // for (auto& thread : threads) { - // thread.join(); - // } - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int start{0}; - // for (int rofId{0}; rofId < mTimeFrameGPU->getNVerticesInChunks()[chunkId].size(); ++rofId) { - // gsl::span rofVerts{mTimeFrameGPU->getVerticesInChunks()[chunkId].data() + start, static_cast::size_type>(mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId])}; - // mTimeFrameGPU->addPrimaryVertices(rofVerts); - // if (mTimeFrameGPU->hasMCinformation()) { - // mTimeFrameGPU->getVerticesLabels().emplace_back(); - // // TODO: add MC labels - // } - // start += mTimeFrameGPU->getNVerticesInChunks()[chunkId][rofId]; - // } - // } - // mTimeFrameGPU->wipe(3); - // } + const auto& conf = ITSGpuTrackingParamConfig::Instance(); + + mTimeFrameGPU->createVtxTrackletsLUTDevice(iteration); + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + mTimeFrameGPU->getNrof(), + this->mVrtParams[iteration].deltaRof, + mTimeFrameGPU->getDeviceROFramesPV(), + this->mVrtParams[iteration].vertPerRofThreshold, + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes()[1], + mTimeFrameGPU->getDeviceROFrameClusters(), + (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(), + mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(), + mTimeFrameGPU->getDeviceArrayNTrackletsPerROF(), + mTimeFrameGPU->getDeviceNTrackletsPerCluster(), + mTimeFrameGPU->getDeviceNTrackletsPerClusterSum(), + iteration, + this->mVrtParams[iteration].phiCut, + this->mVrtParams[iteration].maxTrackletsPerCluster, + conf.nBlocksVtxComputeTracklets[iteration], + conf.nThreadsVtxComputeTracklets[iteration], + mTimeFrameGPU->getStreams()); + mTimeFrameGPU->createVtxTrackletsBuffers(iteration); + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + mTimeFrameGPU->getNrof(), + this->mVrtParams[iteration].deltaRof, + mTimeFrameGPU->getDeviceROFramesPV(), + this->mVrtParams[iteration].vertPerRofThreshold, + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes()[1], + mTimeFrameGPU->getDeviceROFrameClusters(), + (const uint8_t**)mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerROF(), + iteration, + this->mVrtParams[iteration].phiCut, + this->mVrtParams[iteration].maxTrackletsPerCluster, + conf.nBlocksVtxComputeTracklets[iteration], + conf.nThreadsVtxComputeTracklets[iteration], + mTimeFrameGPU->getStreams()); } -void VertexerTraitsGPU::computeTrackletMatching(const int iteration) +template +void VertexerTraitsGPU::computeTrackletMatching(const int iteration) { -} + if (!mTimeFrameGPU->getTotalTrackletsTF(0) || !mTimeFrameGPU->getTotalTrackletsTF(1)) { + return; + } -void VertexerTraitsGPU::computeVertices(const int iteration) -{ + const auto& conf = ITSGpuTrackingParamConfig::Instance(); + mTimeFrameGPU->createVtxLinesLUTDevice(iteration); + countTrackletsMatchingInROFsHandler(mTimeFrameGPU->getNrof(), + this->mVrtParams[iteration].deltaRof, + mTimeFrameGPU->getClusterSizes()[1], + mTimeFrameGPU->getDeviceROFrameClusters(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceUsedTracklets(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(), + mTimeFrameGPU->getDeviceNLinesPerCluster(), + mTimeFrameGPU->getDeviceNLinesPerClusterSum(), + iteration, + this->mVrtParams[iteration].phiCut, + this->mVrtParams[iteration].tanLambdaCut, + conf.nBlocksVtxComputeMatching[iteration], + conf.nThreadsVtxComputeMatching[iteration], + mTimeFrameGPU->getStreams()); + mTimeFrameGPU->createVtxLinesBuffer(iteration); + computeTrackletsMatchingInROFsHandler(mTimeFrameGPU->getNrof(), + this->mVrtParams[iteration].deltaRof, + mTimeFrameGPU->getClusterSizes()[1], + mTimeFrameGPU->getDeviceROFrameClusters(), + mTimeFrameGPU->getDeviceArrayClusters(), + nullptr, + (const Tracklet**)mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceUsedTracklets(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerCluster(), + (const int32_t**)mTimeFrameGPU->getDeviceArrayNTrackletsPerClusterSum(), + (const int32_t*)mTimeFrameGPU->getDeviceNLinesPerClusterSum(), + mTimeFrameGPU->getDeviceLines(), + iteration, + this->mVrtParams[iteration].phiCut, + this->mVrtParams[iteration].tanLambdaCut, + conf.nBlocksVtxComputeMatching[iteration], + conf.nThreadsVtxComputeMatching[iteration], + mTimeFrameGPU->getStreams()); } -void VertexerTraitsGPU::computeVerticesHist() +template +void VertexerTraitsGPU::computeVertices(const int iteration) { + LOGP(fatal, "This step is not implemented yet!"); + mTimeFrameGPU->loadUsedClustersDevice(); } -VertexerTraits* createVertexerTraitsGPU() -{ - return new VertexerTraitsGPU; -} +template class VertexerTraitsGPU<7>; + } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu index 126e799efce5d..a2787bb13598d 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu @@ -14,364 +14,454 @@ #include #include "ITStrackingGPU/VertexingKernels.h" +#include "ITStracking/Tracklet.h" +#include "ITStracking/IndexTableUtils.h" +#include "ITStracking/ClusterLines.h" + +#include "GPUCommonMath.h" #include "GPUCommonHelpers.h" +#include "GPUCommonDef.h" -namespace o2 -{ -namespace its +namespace o2::its { -using math_utils::getNormalizedPhi; namespace gpu { -template -void trackletFinderHandler(const Cluster* clustersNextLayer, // 0 2 - const Cluster* clustersCurrentLayer, // 1 1 - const int* sizeNextLClusters, - const int* sizeCurrentLClusters, - const int* nextIndexTables, - Tracklet* Tracklets, - int* foundTracklets, - const IndexTableUtils* utils, - const unsigned int startRofId, - const unsigned int rofSize, - const float phiCut, - const unsigned int maxTrackletsPerCluster, - const int nBlocks, - const int nThreads) -{ - gpu::trackleterKernelMultipleRof<<>>( - clustersNextLayer, // const Cluster* clustersNextLayer, // 0 2 - clustersCurrentLayer, // const Cluster* clustersCurrentLayer, // 1 1 - sizeNextLClusters, // const int* sizeNextLClusters, - sizeCurrentLClusters, // const int* sizeCurrentLClusters, - nextIndexTables, // const int* nextIndexTables, - Tracklets, // Tracklet* Tracklets, - foundTracklets, // int* foundTracklets, - utils, // const IndexTableUtils* utils, - startRofId, // const unsigned int startRofId, - rofSize, // const unsigned int rofSize, - phiCut, // const float phiCut, - maxTrackletsPerCluster); // const unsigned int maxTrackletsPerCluster = 1e2 -} -/* - -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const float z1, float maxdeltaz, float maxdeltaphi) -{ - const float zRangeMin = z1 - maxdeltaz; - const float phiRangeMin = currentCluster.phi - maxdeltaphi; - const float zRangeMax = z1 + maxdeltaz; - const float phiRangeMax = currentCluster.phi + maxdeltaphi; - - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { - - return getEmptyBinsRect(); - } - return int4{o2::gpu::GPUCommonMath::Max(0, getZBinIndex(layerIndex + 1, zRangeMin)), - getPhiBinIndex(phiRangeMin), - o2::gpu::GPUCommonMath::Min(ZBins - 1, getZBinIndex(layerIndex + 1, zRangeMax)), - getPhiBinIndex(phiRangeMax)}; -} - -template -GPUd() void printOnThread(const unsigned int tId, const char* str, Args... args) +template +GPUg() void computeLayerTrackletMutliROFKernel(const Cluster** GPUrestrict() clusters, + const int32_t** GPUrestrict() rofClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clusterIndexTables, + const float phiCut, + maybe_const** GPUrestrict() tracklets, + maybe_const** GPUrestrict() trackletOffsets, + const IndexTableUtils* GPUrestrict() utils, + const int32_t nRofs, + const int32_t deltaRof, + const int32_t* GPUrestrict() rofPV, + const int32_t iteration, + const int32_t verPerRofThreshold, + const int32_t maxTrackletsPerCluster) { - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf(str, args...); - } -} - -template -GPUd() void printOnBlock(const unsigned int bId, const char* str, Args... args) -{ - if (blockIdx.x == bId && threadIdx.x == 0) { - printf(str, args...); - } -} - -GPUg() void printBufferOnThread(const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - for (int i{0}; i < size; ++i) { - if (!(i % len)) { - printf("\n start: ===>%d/%d\t", i, (int)size); - } - printf("%d\t", v[i]); + constexpr int32_t iMode = (Mode == TrackletMode::Layer0Layer1) ? 0 : 1; + const int32_t phiBins(utils->getNphiBins()); + const int32_t zBins(utils->getNzBins()); + const int32_t tableSize{phiBins * zBins + 1}; + extern __shared__ uint16_t storedTrackletsShared[]; // each deltaROF needs its own counters + uint16_t* storedTrackletsLocal = storedTrackletsShared + threadIdx.x * (2 * deltaRof + 1); + for (uint32_t pivotRofId{blockIdx.x}; pivotRofId < (uint32_t)nRofs; pivotRofId += gridDim.x) { + if (iteration && rofPV[pivotRofId] > verPerRofThreshold) { + continue; } - printf("\n"); - } -} - -GPUg() void printBufferOnThreadF(const float* v, unsigned int size, const unsigned int tId = 0) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vector :"); - for (int i{0}; i < size; ++i) { - printf("%.9f\t", v[i]); + const uint16_t startROF = o2::gpu::CAMath::Max(0, (int)pivotRofId - deltaRof); + const uint16_t endROF = o2::gpu::CAMath::Min(nRofs, (int)pivotRofId + deltaRof + 1); + const auto clustersCurrentLayer = getClustersOnLayer((int32_t)pivotRofId, nRofs, 1, rofClusters, clusters); + if (clustersCurrentLayer.empty()) { + continue; } - printf("\n"); - } -} - -GPUg() void resetTrackletsKernel(Tracklet* tracklets, const int nTracklets) -{ - for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nTracklets; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { - new (tracklets + iCurrentLayerClusterIndex) Tracklet{}; - } -} - -GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const unsigned int nClustersMiddleLayer, const int maxTrackletsPerCluster) -{ - for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { - const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; - for (int iTracklet{0}; iTracklet < nTracklet[iCurrentLayerClusterIndex]; ++iTracklet) { - auto& t = tracklets[stride + iTracklet]; - t.dump(); - } - } -} - -GPUg() void dumpMaximaKernel(const cub::KeyValuePair* tmpVertexBins, const int threadId) -{ - if (blockIdx.x * blockDim.x + threadIdx.x == threadId) { - printf("XmaxBin: %d at index: %d | YmaxBin: %d at index: %d | ZmaxBin: %d at index: %d\n", - tmpVertexBins[0].value, tmpVertexBins[0].key, - tmpVertexBins[1].value, tmpVertexBins[1].key, - tmpVertexBins[2].value, tmpVertexBins[2].key); - } -} - -template -GPUg() void trackleterKernelSingleRof( - const Cluster* clustersNextLayer, // 0 2 - const Cluster* clustersCurrentLayer, // 1 1 - const int sizeNextLClusters, - const int sizeCurrentLClusters, - const int* indexTableNext, - const float phiCut, - Tracklet* Tracklets, - int* foundTracklets, - const IndexTableUtils* utils, - const short rofId, - const unsigned int maxTrackletsPerCluster = 1e2) -{ - const int phiBins{utils->getNphiBins()}; - const int zBins{utils->getNzBins()}; - // loop on layer1 clusters - for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < sizeCurrentLClusters; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { - if (iCurrentLayerClusterIndex < sizeCurrentLClusters) { - unsigned int storedTracklets{0}; - const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; - const Cluster& currentCluster = clustersCurrentLayer[iCurrentLayerClusterIndex]; - const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)}; + auto trackletsPerCluster = getNTrackletsPerCluster(pivotRofId, nRofs, iMode, rofClusters, trackletOffsets); + for (uint32_t iCurrentLayerClusterIndex{threadIdx.x}; iCurrentLayerClusterIndex < (uint32_t)clustersCurrentLayer.size(); iCurrentLayerClusterIndex += blockDim.x) { + for (int16_t i{0}; i < (int16_t)((2 * deltaRof) + 1); ++i) { + storedTrackletsLocal[i] = 0; + } + const Cluster& GPUrestrict() currentCluster { clustersCurrentLayer[iCurrentLayerClusterIndex] }; + const int4 selectedBinsRect{getBinsRect(currentCluster, (int)Mode, utils, 0.f, 0.f, 50.f, phiCut / 2)}; if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) { int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; if (phiBinsNum < 0) { phiBinsNum += phiBins; } - // loop on phi bins next layer - for (unsigned int iPhiBin{(unsigned int)selectedBinsRect.y}, iPhiCount{0}; iPhiCount < (unsigned int)phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) { - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int firstRowClusterIndex{indexTableNext[firstBinIndex]}; - const int maxRowClusterIndex{indexTableNext[firstBinIndex + zBins]}; - // loop on clusters next layer - for (int iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < sizeNextLClusters; ++iNextLayerClusterIndex) { - const Cluster& nextCluster = clustersNextLayer[iNextLayerClusterIndex]; - if (o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi) < phiCut) { - if (storedTracklets < maxTrackletsPerCluster) { - if constexpr (Mode == TrackletMode::Layer0Layer1) { - new (Tracklets + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, rofId, rofId}; - } else { - new (Tracklets + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, rofId, rofId}; + for (int32_t iPhiBin{selectedBinsRect.y}, iPhiCount{0}; iPhiCount < phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) { + for (uint16_t targetRofId{startROF}; targetRofId < endROF; ++targetRofId) { + uint16_t& storedTracklets = storedTrackletsLocal[pivotRofId - targetRofId + deltaRof]; + const int32_t firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int32_t maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int32_t firstRowClusterIndex{clusterIndexTables[(int)Mode][(targetRofId)*tableSize + firstBinIndex]}; + const int32_t maxRowClusterIndex{clusterIndexTables[(int)Mode][(targetRofId)*tableSize + maxBinIndex]}; + auto clustersNextLayer = getClustersOnLayer((int32_t)targetRofId, nRofs, (int32_t)Mode, rofClusters, clusters); + if (clustersNextLayer.empty()) { + continue; + } + for (int32_t iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < (int32_t)clustersNextLayer.size(); ++iNextLayerClusterIndex) { + if (iteration && usedClusters[(int32_t)Mode][iNextLayerClusterIndex]) { + continue; + } + const Cluster& GPUrestrict() nextCluster { clustersNextLayer[iNextLayerClusterIndex] }; + if (o2::gpu::GPUCommonMath::Abs(math_utils::smallestAngleDifference(currentCluster.phi, nextCluster.phi)) < phiCut) { + if (storedTracklets < maxTrackletsPerCluster) { + if constexpr (!dryRun) { + if constexpr (Mode == TrackletMode::Layer0Layer1) { + tracklets[0][trackletsPerCluster[iCurrentLayerClusterIndex] + storedTracklets] = Tracklet{iNextLayerClusterIndex, (int)iCurrentLayerClusterIndex, nextCluster, currentCluster, (short)targetRofId, (short)pivotRofId}; + } else { + tracklets[1][trackletsPerCluster[iCurrentLayerClusterIndex] + storedTracklets] = Tracklet{(int)iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, (short)pivotRofId, (short)targetRofId}; + } + } + ++storedTracklets; } - ++storedTracklets; } } } } } - foundTracklets[iCurrentLayerClusterIndex] = storedTracklets; - if (storedTracklets >= maxTrackletsPerCluster) { - printf("gpu tracklet finder: some lines will be left behind for cluster %d. valid: %u max: %zu\n", iCurrentLayerClusterIndex, storedTracklets, maxTrackletsPerCluster); + if constexpr (dryRun) { + for (int32_t i{0}; i < (int32_t)((2 * deltaRof) + 1); ++i) { + trackletsPerCluster[iCurrentLayerClusterIndex] += storedTrackletsLocal[i]; + } } } } } -template -GPUg() void trackleterKernelMultipleRof( - const Cluster* clustersNextLayer, // 0 2 - const Cluster* clustersCurrentLayer, // 1 1 - const int* sizeNextLClusters, - const int* sizeCurrentLClusters, - const int* nextIndexTables, - Tracklet* Tracklets, - int* foundTracklets, - const IndexTableUtils* utils, - const short startRofId, - const short rofSize, - const float phiCut, - const unsigned int maxTrackletsPerCluster = 1e2) +template +GPUg() void computeTrackletSelectionMutliROFKernel(const Cluster** GPUrestrict() clusters, + maybe_const** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() rofClusters, + const float phiCut, + const float tanLambdaCut, + const Tracklet** GPUrestrict() tracklets, + uint8_t* GPUrestrict() usedTracklets, + const int32_t** GPUrestrict() trackletOffsets, + const int32_t** GPUrestrict() trackletLUTs, + maybe_const* lineOffsets, + maybe_const* GPUrestrict() lines, + const int32_t nRofs, + const int32_t deltaRof, + const int32_t maxTracklets) { - const int phiBins{utils->getNphiBins()}; - const int zBins{utils->getNzBins()}; - for (auto iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - short rof = static_cast(iRof) + startRofId; - auto* clustersNextLayerRof = clustersNextLayer + (sizeNextLClusters[rof] - sizeNextLClusters[startRofId]); - auto* clustersCurrentLayerRof = clustersCurrentLayer + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]); - auto nClustersNextLayerRof = sizeNextLClusters[rof + 1] - sizeNextLClusters[rof]; - auto nClustersCurrentLayerRof = sizeCurrentLClusters[rof + 1] - sizeCurrentLClusters[rof]; - auto* indexTableNextRof = nextIndexTables + iRof * (phiBins * zBins + 1); - auto* TrackletsRof = Tracklets + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]) * maxTrackletsPerCluster; - auto* foundTrackletsRof = foundTracklets + (sizeCurrentLClusters[rof] - sizeCurrentLClusters[startRofId]); + for (uint32_t pivotRofId{blockIdx.x}; pivotRofId < nRofs; pivotRofId += gridDim.x) { + const int16_t startROF = o2::gpu::CAMath::Max(0, (int32_t)pivotRofId - deltaRof); + const int16_t endROF = o2::gpu::CAMath::Min(nRofs, (int32_t)pivotRofId + deltaRof + 1); - // single rof loop on layer1 clusters - for (int iCurrentLayerClusterIndex = threadIdx.x; iCurrentLayerClusterIndex < nClustersCurrentLayerRof; iCurrentLayerClusterIndex += blockDim.x) { - unsigned int storedTracklets{0}; - const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; - const Cluster& currentCluster = clustersCurrentLayerRof[iCurrentLayerClusterIndex]; - const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)}; - if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) { - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += phiBins; - } - // loop on phi bins next layer - for (unsigned int iPhiBin{(unsigned int)selectedBinsRect.y}, iPhiCount{0}; iPhiCount < (unsigned int)phiBinsNum; iPhiBin = ++iPhiBin == phiBins ? 0 : iPhiBin, iPhiCount++) { - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int firstRowClusterIndex{indexTableNextRof[firstBinIndex]}; - const int maxRowClusterIndex{indexTableNextRof[firstBinIndex + zBins]}; - // loop on clusters next layer - for (int iNextLayerClusterIndex{firstRowClusterIndex}; iNextLayerClusterIndex < maxRowClusterIndex && iNextLayerClusterIndex < nClustersNextLayerRof; ++iNextLayerClusterIndex) { - const Cluster& nextCluster = clustersNextLayerRof[iNextLayerClusterIndex]; - if (o2::gpu::GPUCommonMath::Abs(smallestAngleDifference(currentCluster.phi, nextCluster.phi)) < phiCut) { - if (storedTracklets < maxTrackletsPerCluster) { - if constexpr (Mode == TrackletMode::Layer0Layer1) { - new (TrackletsRof + stride + storedTracklets) Tracklet{iNextLayerClusterIndex, iCurrentLayerClusterIndex, nextCluster, currentCluster, rof, rof}; - } else { - new (TrackletsRof + stride + storedTracklets) Tracklet{iCurrentLayerClusterIndex, iNextLayerClusterIndex, currentCluster, nextCluster, rof, rof}; - } - ++storedTracklets; - } + const uint32_t clusterOffset = rofClusters[1][pivotRofId]; + const uint32_t nClustersCurrentLayer = rofClusters[1][pivotRofId + 1] - clusterOffset; + if (nClustersCurrentLayer <= 0) { + continue; + } + + auto linesPerCluster = getNLinesPerCluster(pivotRofId, nRofs, rofClusters, lineOffsets); + auto nTrackletsPerCluster01 = getNTrackletsPerCluster(pivotRofId, nRofs, 0, rofClusters, trackletOffsets); + auto nTrackletsPerCluster12 = getNTrackletsPerCluster(pivotRofId, nRofs, 1, rofClusters, trackletOffsets); + + for (uint32_t iCurrentLayerClusterIndex{threadIdx.x}; iCurrentLayerClusterIndex < nClustersCurrentLayer; iCurrentLayerClusterIndex += blockDim.x) { + int32_t validTracklets{0}; + const int32_t nTracklets01 = nTrackletsPerCluster01[iCurrentLayerClusterIndex]; + const int32_t nTracklets12 = nTrackletsPerCluster12[iCurrentLayerClusterIndex]; + for (int32_t iTracklet12{0}; iTracklet12 < nTracklets12; ++iTracklet12) { + for (int32_t iTracklet01{0}; iTracklet01 < nTracklets01; ++iTracklet01) { + + if (usedTracklets[trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01]) { + continue; + } + + const auto& GPUrestrict() tracklet01 { tracklets[0][trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01] }; + const auto& GPUrestrict() tracklet12 { tracklets[1][trackletLUTs[1][clusterOffset + iCurrentLayerClusterIndex] + iTracklet12] }; + const int16_t rof0 = tracklet01.rof[0]; + const int16_t rof2 = tracklet12.rof[1]; + if (deltaRof > 0 && ((rof0 < startROF) || (rof0 >= endROF) || (rof2 < startROF) || (rof2 >= endROF) || (o2::gpu::CAMath::Abs(rof0 - rof2) > deltaRof))) { + continue; + } + + const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklet01.tanLambda - tracklet12.tanLambda)}; + const float deltaPhi{o2::gpu::GPUCommonMath::Abs(math_utils::smallestAngleDifference(tracklet01.phi, tracklet12.phi))}; + // + if (deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets < maxTracklets) { + // TODO use atomics to avoid race conditions for torn writes but is it needed here? + usedTracklets[trackletLUTs[0][clusterOffset + iCurrentLayerClusterIndex] + iTracklet01] = 1; + if constexpr (dryRun) { + usedClusters[0][rofClusters[0][rof0] + tracklet01.firstClusterIndex] = 1; + usedClusters[2][rofClusters[2][rof2] + tracklet12.secondClusterIndex] = 1; + } else { + const Cluster* clusters0 = clusters[0] + rofClusters[0][tracklet01.rof[0]]; + const Cluster* clusters1 = clusters[1] + rofClusters[1][tracklet01.rof[1]]; + lines[lineOffsets[iCurrentLayerClusterIndex] + validTracklets] = Line(tracklet01, clusters0, clusters1); } + ++validTracklets; } } } - foundTrackletsRof[iCurrentLayerClusterIndex] = storedTracklets; - // if (storedTracklets >= maxTrackletsPerCluster && storedTracklets - maxTrackletsPerCluster < 5) { - // printf("gpu tracklet finder: some lines will be left behind for cluster %d in rof: %d. valid: %u max: %lu (suppressing after 5 msgs)\n", iCurrentLayerClusterIndex, rof, storedTracklets, maxTrackletsPerCluster); - // } + + if constexpr (dryRun) { + linesPerCluster[iCurrentLayerClusterIndex] = validTracklets; + } } } } -template -GPUg() void trackletSelectionKernelSingleRof( - const Cluster* clusters0, - const Cluster* clusters1, - const unsigned int nClustersMiddleLayer, - Tracklet* tracklets01, - Tracklet* tracklets12, - const int* nFoundTracklet01, - const int* nFoundTracklet12, - unsigned char* usedTracklets, - Line* lines, - int* nFoundLines, - int* nExclusiveFoundLines, - const int maxTrackletsPerCluster = 1e2, - const float tanLambdaCut = 0.025f, - const float phiCut = 0.002f) +template +GPUg() void compileTrackletsPerROFKernel(const int32_t nRofs, + int** GPUrestrict() nTrackletsPerROF, + const int32_t** GPUrestrict() rofClusters, + const int32_t** GPUrestrict() nTrackletsPerCluster) { - for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) { - const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster}; - int validTracklets{0}; - for (int iTracklet12{0}; iTracklet12 < nFoundTracklet12[iCurrentLayerClusterIndex]; ++iTracklet12) { - for (int iTracklet01{0}; iTracklet01 < nFoundTracklet01[iCurrentLayerClusterIndex] && validTracklets < maxTrackletsPerCluster; ++iTracklet01) { - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklets01[stride + iTracklet01].tanLambda - tracklets12[stride + iTracklet12].tanLambda)}; - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(smallestAngleDifference(tracklets01[stride + iTracklet01].phi, tracklets12[stride + iTracklet12].phi))}; - if (!usedTracklets[stride + iTracklet01] && deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets != maxTrackletsPerCluster) { - usedTracklets[stride + iTracklet01] = true; - if constexpr (!initRun) { - new (lines + nExclusiveFoundLines[iCurrentLayerClusterIndex] + validTracklets) Line{tracklets01[stride + iTracklet01], clusters0, clusters1}; - } - ++validTracklets; - } - } + // TODO is this the best reduction kernel? + constexpr int32_t iMode = (Mode == TrackletMode::Layer0Layer1) ? 0 : 1; + extern __shared__ int32_t ssum[]; + for (uint32_t rof = blockIdx.x; rof < (uint32_t)nRofs; rof += gridDim.x) { + const auto& GPUrestrict() currentNTracklets = getNTrackletsPerCluster(rof, nRofs, iMode, rofClusters, nTrackletsPerCluster); + int32_t localSum = 0; + for (uint32_t ci = threadIdx.x; ci < (uint32_t)currentNTracklets.size(); ci += blockDim.x) { + localSum += currentNTracklets[ci]; } - if constexpr (initRun) { - nFoundLines[iCurrentLayerClusterIndex] = validTracklets; - if (validTracklets >= maxTrackletsPerCluster) { - printf("gpu tracklet selection: some lines will be left behind for cluster %d. valid: %d max: %d\n", iCurrentLayerClusterIndex, validTracklets, maxTrackletsPerCluster); + ssum[threadIdx.x] = localSum; + __syncthreads(); + for (uint32_t stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + ssum[threadIdx.x] += ssum[threadIdx.x + stride]; } + __syncthreads(); + } + if (threadIdx.x == 0) { + nTrackletsPerROF[iMode][rof] = ssum[0]; } } } -template -GPUg() void trackletSelectionKernelMultipleRof( - const Cluster* clusters0, // Clusters on layer 0 - const Cluster* clusters1, // Clusters on layer 1 - const int* sizeClustersL0, // Number of clusters on layer 0 per ROF - const int* sizeClustersL1, // Number of clusters on layer 1 per ROF - Tracklet* tracklets01, // Tracklets on layer 0-1 - Tracklet* tracklets12, // Tracklets on layer 1-2 - const int* nFoundTracklets01, // Number of tracklets found on layers 0-1 - const int* nFoundTracklets12, // Number of tracklets found on layers 1-2 - unsigned char* usedTracklets, // Used tracklets - Line* lines, // Lines - int* nFoundLines, // Number of found lines - int* nExclusiveFoundLines, // Number of found lines exclusive scan - const unsigned int startRofId, // Starting ROF ID - const unsigned int rofSize, // Number of ROFs to consider - const int maxTrackletsPerCluster = 1e2, // Maximum number of tracklets per cluster - const float tanLambdaCut = 0.025f, // Cut on tan lambda - const float phiCut = 0.002f) // Cut on phi +template +GPUhi() void cubExclusiveScan(const T* GPUrestrict() in, T* GPUrestrict() out, int32_t num_items, cudaStream_t stream) { - for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - auto rof = iRof + startRofId; - auto* clustersL0Rof = clusters0 + (sizeClustersL0[rof] - sizeClustersL0[startRofId]); - auto clustersL1offsetRof = sizeClustersL1[rof] - sizeClustersL1[startRofId]; - auto* clustersL1Rof = clusters1 + clustersL1offsetRof; - auto nClustersL1Rof = sizeClustersL1[rof + 1] - sizeClustersL1[rof]; - auto* tracklets01Rof = tracklets01 + clustersL1offsetRof * maxTrackletsPerCluster; - auto* tracklets12Rof = tracklets12 + clustersL1offsetRof * maxTrackletsPerCluster; - auto* foundTracklets01Rof = nFoundTracklets01 + clustersL1offsetRof; - auto* foundTracklets12Rof = nFoundTracklets12 + clustersL1offsetRof; - auto* usedTrackletsRof = usedTracklets + clustersL1offsetRof * maxTrackletsPerCluster; - auto* foundLinesRof = nFoundLines + clustersL1offsetRof; - int* nExclusiveFoundLinesRof = nullptr; - if constexpr (!initRun) { - nExclusiveFoundLinesRof = nExclusiveFoundLines + clustersL1offsetRof; - } - for (int iClusterIndexLayer1 = threadIdx.x; iClusterIndexLayer1 < nClustersL1Rof; iClusterIndexLayer1 += blockDim.x) { - const int stride{iClusterIndexLayer1 * maxTrackletsPerCluster}; - int validTracklets{0}; - for (int iTracklet12{0}; iTracklet12 < foundTracklets12Rof[iClusterIndexLayer1]; ++iTracklet12) { - for (int iTracklet01{0}; iTracklet01 < foundTracklets01Rof[iClusterIndexLayer1] && validTracklets < maxTrackletsPerCluster; ++iTracklet01) { - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(tracklets01Rof[stride + iTracklet01].tanLambda - tracklets12Rof[stride + iTracklet12].tanLambda)}; - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(tracklets01Rof[stride + iTracklet01].phi - tracklets12Rof[stride + iTracklet12].phi)}; - if (!usedTrackletsRof[stride + iTracklet01] && deltaTanLambda < tanLambdaCut && deltaPhi < phiCut && validTracklets != maxTrackletsPerCluster) { - usedTrackletsRof[stride + iTracklet01] = true; - if constexpr (!initRun) { - new (lines + nExclusiveFoundLinesRof[iClusterIndexLayer1] + validTracklets) Line{tracklets01Rof[stride + iTracklet01], clustersL0Rof, clustersL1Rof}; - } - ++validTracklets; - } - } - } - if constexpr (initRun) { - foundLinesRof[iClusterIndexLayer1] = validTracklets; - // if (validTracklets >= maxTrackletsPerCluster) { - // printf("gpu tracklet selection: some lines will be left behind for cluster %d. valid: %d max: %d\n", iClusterIndexLayer1, validTracklets, maxTrackletsPerCluster); - // } - } - } - } // rof loop + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out + 1, num_items, stream)); + GPUChkErrS(cudaMallocAsync(&d_temp_storage, temp_storage_bytes, stream)); + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in, out + 1, num_items, stream)); + GPUChkErrS(cudaFreeAsync(d_temp_storage, stream)); +} + +} // namespace gpu + +template +void countTrackletsInROFsHandler(const IndexTableUtils* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int32_t vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + int32_t** GPUrestrict() trackletsPerClusterLUTs, + int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + int32_t** GPUrestrict() trackletsPerROF, + const std::array& trackletsPerClusterLUTsHost, + const std::array& trackletsPerClusterSumLUTsHost, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams) +{ + const uint32_t sharedBytes = nThreads * (2 * deltaROF + 1) * sizeof(uint16_t); + gpu::computeLayerTrackletMutliROFKernel<<>>(clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + phiCut, + nullptr, + trackletsPerClusterLUTs, + utils, + nRofs, + deltaROF, + rofPV, + iteration, + vertPerRofThreshold, + maxTrackletsPerCluster); + gpu::compileTrackletsPerROFKernel<<>>(nRofs, trackletsPerROF, ROFClusters, (const int32_t**)trackletsPerClusterLUTs); + gpu::cubExclusiveScan(trackletsPerClusterLUTsHost[0], trackletsPerClusterSumLUTsHost[0], nClusters, streams[0].get()); + + gpu::computeLayerTrackletMutliROFKernel<<>>(clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + phiCut, + nullptr, + trackletsPerClusterLUTs, + utils, + nRofs, + deltaROF, + rofPV, + iteration, + vertPerRofThreshold, + maxTrackletsPerCluster); + gpu::compileTrackletsPerROFKernel<<>>(nRofs, trackletsPerROF, ROFClusters, (const int**)trackletsPerClusterLUTs); + gpu::cubExclusiveScan(trackletsPerClusterLUTsHost[1], trackletsPerClusterSumLUTsHost[1], nClusters, streams[1].get()); +} + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + Tracklet** GPUrestrict() foundTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + const int32_t** GPUrestrict() trackletsPerROF, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams) +{ + const uint32_t sharedBytes = nThreads * (2 * deltaROF + 1) * sizeof(uint16_t); + gpu::computeLayerTrackletMutliROFKernel<<>>(clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + phiCut, + foundTracklets, + trackletsPerClusterSumLUTs, + utils, + nRofs, + deltaROF, + rofPV, + iteration, + vertPerRofThreshold, + maxTrackletsPerCluster); + gpu::computeLayerTrackletMutliROFKernel<<>>(clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + phiCut, + foundTracklets, + trackletsPerClusterSumLUTs, + utils, + nRofs, + deltaROF, + rofPV, + iteration, + vertPerRofThreshold, + maxTrackletsPerCluster); +} + +void countTrackletsMatchingInROFsHandler(const int32_t nRofs, + const int32_t deltaROF, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const Cluster** GPUrestrict() clusters, + uint8_t** GPUrestrict() usedClusters, + const Tracklet** GPUrestrict() foundTracklets, + uint8_t* GPUrestrict() usedTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + int32_t* GPUrestrict() linesPerClusterLUT, + int32_t* GPUrestrict() linesPerClusterSumLUT, + const int32_t iteration, + const float phiCut, + const float tanLambdaCut, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams) +{ + streams[1].sync(); // need to make sure that all tracklets are done, since this placed in 0 tracklet01 will be done but tracklet12 needs to be guaranteed + gpu::computeTrackletSelectionMutliROFKernel<<>>(nullptr, + usedClusters, + ROFClusters, + phiCut, + tanLambdaCut, + foundTracklets, + usedTracklets, + trackletsPerClusterLUTs, + trackletsPerClusterSumLUTs, + linesPerClusterLUT, + nullptr, + nRofs, + deltaROF, + 100); + gpu::cubExclusiveScan(linesPerClusterLUT, linesPerClusterSumLUT, nClusters, streams[0].get()); } +void computeTrackletsMatchingInROFsHandler(const int32_t nRofs, + const int32_t deltaROF, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const Cluster** GPUrestrict() clusters, + const uint8_t** GPUrestrict() usedClusters, + const Tracklet** GPUrestrict() foundTracklets, + uint8_t* GPUrestrict() usedTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + const int32_t* GPUrestrict() linesPerClusterSumLUT, + Line* GPUrestrict() lines, + const int32_t iteration, + const float phiCut, + const float tanLambdaCut, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams) +{ + gpu::computeTrackletSelectionMutliROFKernel<<>>(clusters, + nullptr, + ROFClusters, + phiCut, + tanLambdaCut, + foundTracklets, + usedTracklets, + trackletsPerClusterLUTs, + trackletsPerClusterSumLUTs, + linesPerClusterSumLUT, + lines, + nRofs, + deltaROF, + 100); +} + +/// Explicit instantiation of ITS2 handlers +template void countTrackletsInROFsHandler<7>(const IndexTableUtils<7>* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int32_t vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + int32_t** trackletsPerClusterLUTs, + int32_t** trackletsPerClusterSumLUTs, + int32_t** trackletsPerROF, + const std::array& trackletsPerClusterLUTsHost, + const std::array& trackletsPerClusterSumLUTsHost, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); + +template void computeTrackletsInROFsHandler<7>(const IndexTableUtils<7>* GPUrestrict() utils, + const uint8_t* GPUrestrict() multMask, + const int32_t nRofs, + const int32_t deltaROF, + const int32_t* GPUrestrict() rofPV, + const int vertPerRofThreshold, + const Cluster** GPUrestrict() clusters, + const uint32_t nClusters, + const int32_t** GPUrestrict() ROFClusters, + const uint8_t** GPUrestrict() usedClusters, + const int32_t** GPUrestrict() clustersIndexTables, + Tracklet** GPUrestrict() foundTracklets, + const int32_t** GPUrestrict() trackletsPerClusterLUTs, + const int32_t** GPUrestrict() trackletsPerClusterSumLUTs, + const int32_t** GPUrestrict() trackletsPerROF, + const int32_t iteration, + const float phiCut, + const int32_t maxTrackletsPerCluster, + const int32_t nBlocks, + const int32_t nThreads, + gpu::Streams& streams); +/* GPUg() void lineClustererMultipleRof( const int* sizeClustersL1, // Number of clusters on layer 1 per ROF Line* lines, // Lines @@ -567,6 +657,4 @@ GPUg() void computeVertexKernel( } } */ -} // namespace gpu -} // namespace its -} // namespace o2 +} // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt index dd83669311a54..e8e475f2232c8 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt +++ b/Detectors/ITSMFT/ITS/tracking/GPU/hip/CMakeLists.txt @@ -19,8 +19,8 @@ if(HIP_ENABLED) ../cuda/TrackerTraitsGPU.cxx ../cuda/TracerGPU.cu ../cuda/TrackingKernels.cu - # ../cuda/VertexingKernels.cu - # ../cuda/VertexerTraitsGPU.cxx + ../cuda/VertexingKernels.cu + ../cuda/VertexerTraitsGPU.cxx PUBLIC_INCLUDE_DIRECTORIES ../ PUBLIC_LINK_LIBRARIES O2::ITStracking O2::GPUTracking diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h index 72857794c711e..c3be0de2dade7 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Definitions.h @@ -15,6 +15,8 @@ #ifndef TRACKINGITS_DEFINITIONS_H_ #define TRACKINGITS_DEFINITIONS_H_ +#include + #include "ReconstructionDataFormats/Vertex.h" #ifdef CA_DEBUG @@ -28,8 +30,16 @@ namespace o2::its { +enum class TrackletMode { + Layer0Layer1 = 0, + Layer1Layer2 = 2 +}; + using Vertex = o2::dataformats::Vertex>; -} +template +using maybe_const = typename std::conditional::type; + +} // namespace o2::its #endif diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index a148049e50129..c34701ce222e2 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -204,10 +204,8 @@ struct TimeFrame { void computeTracletsPerClusterScans(); int& getNTrackletsROF(int rofId, int combId) { return mNTrackletsPerROF[combId][rofId]; } auto& getLines(int rofId) { return mLines[rofId]; } - int getNLinesTotal() const - { - return std::accumulate(mLines.begin(), mLines.end(), 0, [](int sum, const auto& l) { return sum + l.size(); }); - } + int getNLinesTotal() const noexcept { return mTotalLines; } + void setNLinesTotal(uint32_t a) noexcept { mTotalLines = a; } auto& getTrackletClusters(int rofId) { return mTrackletClusters[rofId]; } gsl::span getFoundTracklets(int rofId, int combId) const; gsl::span getFoundTracklets(int rofId, int combId); @@ -237,10 +235,9 @@ struct TimeFrame { void setExternalAllocator(ExternalAllocator* allocator) { - if (mIsGPU) { + if (isGPU()) { LOGP(debug, "Setting timeFrame allocator to external"); mAllocator = allocator; - mExtAllocator = true; // to be removed } else { LOGP(fatal, "External allocator is currently only supported for GPU"); } @@ -276,8 +273,6 @@ struct TimeFrame { IndexTableUtilsN mIndexTableUtils; - bool mIsGPU = false; - std::array, nLayers> mClusters; std::array, nLayers> mTrackingFrameInfo; std::array, nLayers> mClusterExternalIndices; @@ -296,9 +291,8 @@ struct TimeFrame { bounded_vector mPrimaryVertices; // State if memory will be externally managed. - bool mExtAllocator = false; ExternalAllocator* mAllocator = nullptr; - bool getExtAllocator() const { return mExtAllocator; } + bool getExtAllocator() const noexcept { return mAllocator != nullptr; } std::array, nLayers> mUnsortedClusters; std::vector> mTracklets; @@ -313,6 +307,10 @@ struct TimeFrame { virtual void wipe(); + // interface + virtual bool isGPU() const noexcept { return false; } + virtual const char* getName() const noexcept { return "CPU"; } + private: void prepareClusters(const TrackingParameters& trkParam, const int maxLayers = nLayers); float mBz = 5.; @@ -348,6 +346,7 @@ struct TimeFrame { std::vector> mVerticesMCRecInfo; bounded_vector mVerticesContributorLabels; std::array mTotalTracklets = {0, 0}; + uint32_t mTotalLines = 0; unsigned int mNoVertexROF = 0; bounded_vector mTotVertPerIteration; // \Vertexer diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h index 4c445bdbbfa16..d368eb1d1f56a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingConfigParam.h @@ -136,6 +136,12 @@ struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper class VertexerTraits { diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx index e8212f4ad53a1..eb2e4ae690e5b 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx @@ -85,7 +85,7 @@ void Tracker::clustersToTracks(const LogFunc& logger, const LogFunc& er } double timeTracklets{0.}, timeCells{0.}, timeNeighbours{0.}, timeRoads{0.}; int nTracklets{0}, nCells{0}, nNeighbours{0}, nTracks{-static_cast(mTimeFrame->getNumberOfTracks())}; - int nROFsIterations = (mTrkParams[iteration].nROFsPerIterations > 0 && !mTimeFrame->mIsGPU) ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1; + int nROFsIterations = (mTrkParams[iteration].nROFsPerIterations > 0 && !mTimeFrame->isGPU()) ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1; iVertex = std::min(maxNvertices, 0); logger(std::format("==== ITS {} Tracking iteration {} summary ====", mTraits->getName(), iteration)); diff --git a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx index 153d7b6faa358..53be03af7eb14 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx @@ -17,6 +17,7 @@ #include #include +#include #include "ITStracking/VertexerTraits.h" #include "ITStracking/BoundedAllocator.h" @@ -290,6 +291,7 @@ template void VertexerTraits::computeTrackletMatching(const int iteration) { mTaskArena->execute([&] { + tbb::combinable totalLines{0}; tbb::parallel_for( tbb::blocked_range(0, (short)mTimeFrame->getNrof()), [&](const tbb::blocked_range& Rofs) { @@ -333,8 +335,10 @@ void VertexerTraits::computeTrackletMatching(const int iteration) mVrtParams[iteration].phiCut); } } + totalLines.local() += mTimeFrame->getLines(pivotRofId).size(); } }); + mTimeFrame->setNLinesTotal(totalLines.combine(std::plus())); }); #ifdef VTX_DEBUG diff --git a/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h b/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h index c4202e9980d24..813e0aef2d1aa 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h +++ b/GPU/GPUTracking/Base/GPUReconstructionIncludesITS.h @@ -21,7 +21,7 @@ #include "ITStracking/TimeFrame.h" #if defined(__CUDACC__) || defined(__HIPCC__) #include "ITStrackingGPU/TrackerTraitsGPU.h" -// #include "ITStrackingGPU/VertexerTraitsGPU.h" +#include "ITStrackingGPU/VertexerTraitsGPU.h" #include "ITStrackingGPU/TimeFrameGPU.h" #endif #else @@ -39,10 +39,10 @@ template class TimeFrame { }; -// template -// class VertexerTraitsGPU : public VertexerTraits -// { -// }; +template +class VertexerTraitsGPU : public VertexerTraits +{ +}; template class TrackerTraitsGPU : public TrackerTraits { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 71582b4fed55e..9e7cfa5495040 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -97,7 +97,9 @@ void GPUReconstructionCUDA::GetITSTraits(std::unique_ptrreset(new o2::its::TrackerTraitsGPU); } if (vertexerTraits) { - vertexerTraits->reset(new o2::its::VertexerTraits<7>); // TODO gpu-code to be implemented + vertexerTraits->reset(new o2::its::VertexerTraits<7>); + // TODO gpu-code to be implemented then remove line above and uncomment line below + // vertexerTraits->reset(new o2::its::VertexerTraitsGPU<7>); } if (timeFrame) { timeFrame->reset(new o2::its::gpu::TimeFrameGPU); diff --git a/GPU/GPUTracking/Global/GPUChainITS.cxx b/GPU/GPUTracking/Global/GPUChainITS.cxx index e53f5db3a2549..26dff3710cd4a 100644 --- a/GPU/GPUTracking/Global/GPUChainITS.cxx +++ b/GPU/GPUTracking/Global/GPUChainITS.cxx @@ -70,7 +70,7 @@ o2::its::TimeFrame<7>* GPUChainITS::GetITSTimeframe() mRec->GetITSTraits(nullptr, nullptr, &mITSTimeFrame); } #if !defined(GPUCA_STANDALONE) - if (mITSTimeFrame->mIsGPU) { + if (mITSTimeFrame->isGPU()) { auto doFWExtAlloc = [this](size_t size) -> void* { return rec()->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU); }; mFrameworkAllocator.reset(new o2::its::GPUFrameworkExternalAllocator);