diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index 29d2404e98681..88666cdfdb7fb 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -15,18 +15,13 @@ #include "ITStracking/TimeFrame.h" #include "ITStracking/Configuration.h" - -#include "ITStrackingGPU/ClusterLinesGPU.h" #include "ITStrackingGPU/Utils.h" #include -namespace o2 -{ -namespace its -{ -namespace gpu +namespace o2::its::gpu { + class Stream; class DefaultGPUAllocator : public ExternalAllocator @@ -228,7 +223,6 @@ inline int TimeFrameGPU::getNumberOfCells() const return std::accumulate(mNCells.begin(), mNCells.end(), 0); } -} // namespace gpu -} // namespace its -} // namespace o2 +} // namespace o2::its::gpu + #endif diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h index f9583d97ca030..c765307473749 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackerTraitsGPU.h @@ -13,8 +13,6 @@ #ifndef ITSTRACKINGGPU_TRACKERTRAITSGPU_H_ #define ITSTRACKINGGPU_TRACKERTRAITSGPU_H_ -#include "ITStracking/Configuration.h" -#include "ITStracking/Definitions.h" #include "ITStracking/TrackerTraits.h" #include "ITStrackingGPU/TimeFrameGPU.h" @@ -24,28 +22,27 @@ namespace its { template -class TrackerTraitsGPU : public TrackerTraits +class TrackerTraitsGPU final : public TrackerTraits { public: TrackerTraitsGPU() = default; ~TrackerTraitsGPU() override = default; - // void computeLayerCells() final; - void adoptTimeFrame(TimeFrame* tf) override; - void initialiseTimeFrame(const int iteration) override; + void adoptTimeFrame(TimeFrame* tf) final; + void initialiseTimeFrame(const int iteration) final; + void computeLayerTracklets(const int iteration, int, int) final; - void computeLayerCells(const int iteration) override; - void setBz(float) override; - void findCellsNeighbours(const int iteration) override; - void findRoads(const int iteration) override; + void computeLayerCells(const int iteration) final; + void findCellsNeighbours(const int iteration) final; + void findRoads(const int iteration) final; + + bool supportsExtendTracks() const noexcept final { return false; } + bool supportsFindShortPrimaries() const noexcept final { return false; } - // Methods to get CPU execution from traits - void initialiseTimeFrameHybrid(const int iteration) override { initialiseTimeFrame(iteration); }; - void computeTrackletsHybrid(const int iteration, int, int) override; - void computeCellsHybrid(const int iteration) override; - void findCellsNeighboursHybrid(const int iteration) override; + void setBz(float) final; - void extendTracks(const int iteration) override; + const char* getName() const noexcept final { return "GPU"; } + bool isGPU() const noexcept final { return true; } // TimeFrameGPU information forwarding int getTFNumberOfClusters() const override; diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 720867ddaba29..21b14fd9292d2 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -176,8 +176,7 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nBlocks, const int nThreads); -int filterCellNeighboursHandler(std::vector&, - gpuPair*, +int filterCellNeighboursHandler(gpuPair*, int*, unsigned int); diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 3c65faddcff71..f3b62ec8a6108 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -11,10 +11,7 @@ /// #include -#include -#include #include -#include #include "DataFormatsITS/TrackITS.h" @@ -41,54 +38,7 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) } template -void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int, int) -{ -} - -template -void TrackerTraitsGPU::computeLayerCells(const int iteration) -{ -} - -template -void TrackerTraitsGPU::findCellsNeighbours(const int iteration) -{ -} - -template -void TrackerTraitsGPU::extendTracks(const int iteration) -{ -} - -template -void TrackerTraitsGPU::setBz(float bz) -{ - mBz = bz; - mTimeFrameGPU->setBz(bz); -} - -template -int TrackerTraitsGPU::getTFNumberOfClusters() const -{ - return mTimeFrameGPU->getNumberOfClusters(); -} - -template -int TrackerTraitsGPU::getTFNumberOfTracklets() const -{ - return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0); -} - -template -int TrackerTraitsGPU::getTFNumberOfCells() const -{ - return mTimeFrameGPU->getNumberOfCells(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Hybrid tracking -template -void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) +void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int iROFslice, int iVertex) { auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); mTimeFrameGPU->createTrackletsLUTDevice(iteration); @@ -161,7 +111,7 @@ void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int } template -void TrackerTraitsGPU::computeCellsHybrid(const int iteration) +void TrackerTraitsGPU::computeLayerCells(const int iteration) { mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); @@ -208,7 +158,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) } template -void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) +void TrackerTraitsGPU::findCellsNeighbours(const int iteration) { mTimeFrameGPU->createNeighboursIndexTablesDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); @@ -250,8 +200,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) conf.nBlocks, conf.nThreads); - filterCellNeighboursHandler(mTimeFrameGPU->getCellsNeighbours()[iLayer], - mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), + filterCellNeighboursHandler(mTimeFrameGPU->getDeviceNeighbourPairs(iLayer), mTimeFrameGPU->getDeviceNeighbours(iLayer), nNeigh); } @@ -270,9 +219,6 @@ void TrackerTraitsGPU::findRoads(const int iteration) if ((mTrkParams[iteration].StartLayerMask & (1 << (startLayer + 2))) == 0) { continue; } - std::vector lastCellId, updatedCellId; - std::vector lastCellSeed, updatedCellSeed; - processNeighboursHandler(startLayer, startLevel, mTimeFrameGPU->getDeviceArrayCells(), @@ -366,5 +312,30 @@ void TrackerTraitsGPU::findRoads(const int iteration) } }; +template +int TrackerTraitsGPU::getTFNumberOfClusters() const +{ + return mTimeFrameGPU->getNumberOfClusters(); +} + +template +int TrackerTraitsGPU::getTFNumberOfTracklets() const +{ + return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0); +} + +template +int TrackerTraitsGPU::getTFNumberOfCells() const +{ + return mTimeFrameGPU->getNumberOfCells(); +} + +template +void TrackerTraitsGPU::setBz(float bz) +{ + mBz = bz; + mTimeFrameGPU->setBz(bz); +} + template class TrackerTraitsGPU<7>; } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 805e66675e1b9..bb39e9e70341b 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -12,11 +12,9 @@ #include #include -#include -#include #include -#include +#include #include #include #include @@ -28,7 +26,6 @@ #include #include "ITStracking/Constants.h" -#include "ITStracking/Configuration.h" #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "DataFormatsITS/TrackITS.h" @@ -59,7 +56,7 @@ namespace o2::its using namespace constants::its2; using Vertex = o2::dataformats::Vertex>; -GPUd() float Sq(float v) +GPUdii() float Sq(float v) { return v * v; } @@ -76,15 +73,15 @@ GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerInde const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi; - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + if (zRangeMax < -utils.getLayerZ(layerIndex) || + zRangeMin > utils.getLayerZ(layerIndex) || zRangeMin > zRangeMax) { return getEmptyBinsRect(); } - return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex, zRangeMin)), utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + o2::gpu::CAMath::Min(utils.getNzBins() - 1, utils.getZBinIndex(layerIndex, zRangeMax)), utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; } @@ -184,6 +181,11 @@ struct equal_tracklets { GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; } }; +template +struct sort_by_second { + GPUhd() bool operator()(const gpuPair& a, const gpuPair& b) const { return a.second < b.second; } +}; + template struct pair_to_first { GPUhd() int operator()(const gpuPair& a) const @@ -522,7 +524,7 @@ GPUg() void computeLayerTrackletsMultiROFKernel( const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))}; - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex + 1, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { continue; } @@ -800,6 +802,44 @@ GPUg() void printCellSeeds(CellSeed* seed, int nCells, const unsigned int tId = } } } + +template +GPUhi() void cubExclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, + in_out, num_items, stream)); + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, in_out, + in_out, num_items, stream)); + GPUChkErrS(cudaFree(d_temp_storage)); +} + +template +GPUhi() void cubExclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +{ + cubExclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); +} + +template +GPUhi() void cubInclusiveScanInPlace(T* in_out, int num_items, cudaStream_t stream = nullptr) +{ + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, + in_out, num_items, stream)); + GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, in_out, + in_out, num_items, stream)); + GPUChkErrS(cudaFree(d_temp_storage)); +} + +template +GPUhi() void cubInclusiveScanInPlace(Vector& in_out, int num_items, cudaStream_t stream = nullptr) +{ + cubInclusiveScanInPlace(thrust::raw_pointer_cast(in_out.data()), num_items, stream); +} } // namespace gpu template @@ -833,7 +873,8 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>( + gpu::computeLayerTrackletsMultiROFKernel<<>>( utils, multMask, iLayer, @@ -860,22 +901,7 @@ void countTrackletsInROFsHandler(const IndexTableUtils* utils, resolutions[iLayer], radii[iLayer + 1] - radii[iLayer], mulScatAng[iLayer]); - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaFree(d_temp_storage)); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); } } @@ -913,55 +939,42 @@ void computeTrackletsInROFsHandler(const IndexTableUtils* utils, const int nThreads) { for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { - gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, - multMask, - iLayer, - startROF, - endROF, - maxROF, - deltaROF, - vertices, - rofPV, - nVertices, - vertexId, - clusters, - ROFClusters, - usedClusters, - clustersIndexTables, - tracklets, - trackletsLUTs, - iteration, - NSigmaCut, - phiCuts[iLayer], - resolutionPV, - minRs[iLayer + 1], - maxRs[iLayer + 1], - resolutions[iLayer], - radii[iLayer + 1] - radii[iLayer], - mulScatAng[iLayer]); + gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); nTracklets[iLayer] = unique_end - tracklets_ptr; if (iLayer > 0) { GPUChkErrS(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); - gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - trackletsLUTsHost[iLayer], // d_in - trackletsLUTsHost[iLayer], // d_out - nClusters[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaFree(d_temp_storage)); + gpu::compileTrackletsLookupTableKernel<<>>( + spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + gpu::cubExclusiveScanInPlace(trackletsLUTsHost[iLayer], nClusters[iLayer] + 1); } } } @@ -984,7 +997,8 @@ void countCellsHandler( const int nBlocks, const int nThreads) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** @@ -998,22 +1012,7 @@ void countCellsHandler( maxChi2ClusterAttachment, // const float cellDeltaTanLambdaSigma, // const float nSigmaCut); // const float - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - cellsLUTsHost, // d_in - cellsLUTsHost, // d_out - nTracklets + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - cellsLUTsHost, // d_in - cellsLUTsHost, // d_out - nTracklets + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaFree(d_temp_storage)); + gpu::cubExclusiveScanInPlace(cellsLUTsHost, nTracklets + 1); } void computeCellsHandler( @@ -1034,7 +1033,8 @@ void computeCellsHandler( const int nBlocks, const int nThreads) { - gpu::computeLayerCellsKernel<<>>( + gpu::computeLayerCellsKernel<<>>( sortedClusters, // const Cluster** unsortedClusters, // const Cluster** tfInfo, // const TrackingFrameInfo** @@ -1064,7 +1064,8 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nBlocks, const int nThreads) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1076,39 +1077,10 @@ unsigned int countCellNeighboursHandler(CellSeed** cellsLayersDevice, nCells, maxCellNeighbours); - void *d_temp_storage = nullptr, *d_temp_storage_2 = nullptr; - size_t temp_storage_bytes = 0, temp_storage_bytes_2 = 0; - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - neighboursLUT, // d_in - neighboursLUT, // d_out - nCellsNext)); // num_items - - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::InclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - neighboursLUT, // d_in - neighboursLUT, // d_out - nCellsNext)); // num_items - - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - temp_storage_bytes_2, // temp_storage_bytes - neighboursIndexTable, // d_in - neighboursIndexTable, // d_out - nCells + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - - GPUChkErrS(cudaMalloc(&d_temp_storage_2, temp_storage_bytes_2)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage_2, // d_temp_storage - temp_storage_bytes_2, // temp_storage_bytes - neighboursIndexTable, // d_in - neighboursIndexTable, // d_out - nCells + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer + gpu::cubInclusiveScanInPlace(neighboursLUT, nCellsNext); + gpu::cubExclusiveScanInPlace(neighboursIndexTable, nCells + 1); unsigned int nNeighbours; GPUChkErrS(cudaMemcpy(&nNeighbours, &neighboursLUT[nCellsNext - 1], sizeof(unsigned int), cudaMemcpyDeviceToHost)); - GPUChkErrS(cudaFree(d_temp_storage)); - GPUChkErrS(cudaFree(d_temp_storage_2)); return nNeighbours; } @@ -1143,32 +1115,18 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, GPUChkErrS(cudaDeviceSynchronize()); } -int filterCellNeighboursHandler(std::vector& neighHost, // TODO: eventually remove this! - gpuPair* cellNeighbourPairs, +int filterCellNeighboursHandler(gpuPair* cellNeighbourPairs, int* cellNeighbours, unsigned int nNeigh) { thrust::device_ptr> neighVectorPairs(cellNeighbourPairs); thrust::device_ptr validNeighs(cellNeighbours); - thrust::device_vector keys(nNeigh); // TODO: externally allocate. - thrust::device_vector vals(nNeigh); // TODO: externally allocate. - thrust::copy(thrust::make_transform_iterator(neighVectorPairs, gpu::pair_to_second()), - thrust::make_transform_iterator(neighVectorPairs + nNeigh, gpu::pair_to_second()), - keys.begin()); - thrust::sequence(vals.begin(), vals.end()); - thrust::sort_by_key(keys.begin(), keys.end(), vals.begin()); - thrust::device_vector> sortedNeigh(nNeigh); - thrust::copy(thrust::make_permutation_iterator(neighVectorPairs, vals.begin()), - thrust::make_permutation_iterator(neighVectorPairs, vals.end()), - sortedNeigh.begin()); - GPUChkErrS(cudaDeviceSynchronize()); - auto trimmedBegin = thrust::find_if(sortedNeigh.begin(), sortedNeigh.end(), gpu::is_valid_pair()); // trim leading -1s - auto trimmedSize = sortedNeigh.end() - trimmedBegin; - neighHost.resize(trimmedSize); - thrust::transform(trimmedBegin, sortedNeigh.end(), validNeighs, gpu::pair_to_first()); - GPUChkErrS(cudaMemcpy(neighHost.data(), cellNeighbours, trimmedSize * sizeof(int), cudaMemcpyDeviceToHost)); + auto updatedEnd = thrust::remove_if(neighVectorPairs, neighVectorPairs + nNeigh, gpu::is_invalid_pair()); + size_t newSize = updatedEnd - neighVectorPairs; + thrust::stable_sort(neighVectorPairs, neighVectorPairs + newSize, gpu::sort_by_second()); + thrust::transform(neighVectorPairs, neighVectorPairs + newSize, validNeighs, gpu::pair_to_first()); - return trimmedSize; + return newSize; } template @@ -1190,137 +1148,117 @@ void processNeighboursHandler(const int startLayer, const int nBlocks, const int nThreads) { - thrust::device_vector foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. TODO: fix this. - // thrust::device_vector lastCellIds(lastCellIdHost); - // thrust::device_vector lastCellSeed(lastCellSeedHost); - thrust::device_vector lastCellId, updatedCellId; - thrust::device_vector lastCellSeed, updatedCellSeed; - gpu::processNeighboursKernel<<>>(startLayer, - startLevel, - allCellSeeds, - currentCellSeeds, - nullptr, - nCells[startLayer], - nullptr, - nullptr, - thrust::raw_pointer_cast(&foundSeedsTable[0]), - usedClusters, - neighbours[startLayer - 1], - neighboursDeviceLUTs[startLayer - 1], - foundTrackingFrameInfo, - bz, - maxChi2ClusterAttachment, - propagator, - matCorrType); - void* d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[startLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[startLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - - updatedCellId.resize(foundSeedsTable.back()); - updatedCellSeed.resize(foundSeedsTable.back()); - - gpu::processNeighboursKernel<<>>(startLayer, - startLevel, - allCellSeeds, - currentCellSeeds, - nullptr, - nCells[startLayer], - thrust::raw_pointer_cast(&updatedCellSeed[0]), - thrust::raw_pointer_cast(&updatedCellId[0]), - thrust::raw_pointer_cast(&foundSeedsTable[0]), - usedClusters, - neighbours[startLayer - 1], - neighboursDeviceLUTs[startLayer - 1], - foundTrackingFrameInfo, - bz, - maxChi2ClusterAttachment, - propagator, - matCorrType); - auto t1 = updatedCellSeed.size(); - GPUChkErrS(cudaFree(d_temp_storage)); + thrust::device_vector foundSeedsTable(nCells[startLayer] + 1); // Shortcut: device_vector skips central memory management, we are relying on the contingency. + // TODO: fix this. + + gpu::processNeighboursKernel<<>>( + startLayer, + startLevel, + allCellSeeds, + currentCellSeeds, + nullptr, + nCells[startLayer], + nullptr, + nullptr, + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[startLayer - 1], + neighboursDeviceLUTs[startLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); + gpu::cubExclusiveScanInPlace(foundSeedsTable, nCells[startLayer] + 1); + + thrust::device_vector updatedCellId(foundSeedsTable.back()); + thrust::device_vector updatedCellSeed(foundSeedsTable.back()); + gpu::processNeighboursKernel<<>>( + startLayer, + startLevel, + allCellSeeds, + currentCellSeeds, + nullptr, + nCells[startLayer], + thrust::raw_pointer_cast(&updatedCellSeed[0]), + thrust::raw_pointer_cast(&updatedCellId[0]), + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[startLayer - 1], + neighboursDeviceLUTs[startLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); + int level = startLevel; + thrust::device_vector lastCellId; + thrust::device_vector lastCellSeed; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { - temp_storage_bytes = 0; lastCellSeed.swap(updatedCellSeed); lastCellId.swap(updatedCellId); thrust::device_vector().swap(updatedCellSeed); thrust::device_vector().swap(updatedCellId); auto lastCellSeedSize{lastCellSeed.size()}; - foundSeedsTable.resize(nCells[iLayer] + 1); + foundSeedsTable.resize(lastCellSeedSize + 1); thrust::fill(foundSeedsTable.begin(), foundSeedsTable.end(), 0); - --level; - gpu::processNeighboursKernel<<>>(iLayer, - level, - allCellSeeds, - thrust::raw_pointer_cast(&lastCellSeed[0]), - thrust::raw_pointer_cast(&lastCellId[0]), - lastCellSeedSize, - nullptr, - nullptr, - thrust::raw_pointer_cast(&foundSeedsTable[0]), - usedClusters, - neighbours[iLayer - 1], - neighboursDeviceLUTs[iLayer - 1], - foundTrackingFrameInfo, - bz, - maxChi2ClusterAttachment, - propagator, - matCorrType); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(nullptr, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer - GPUChkErrS(cudaMalloc(&d_temp_storage, temp_storage_bytes)); - GPUChkErrS(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage - temp_storage_bytes, // temp_storage_bytes - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_in - thrust::raw_pointer_cast(&foundSeedsTable[0]), // d_out - nCells[iLayer] + 1, // num_items - 0)); // NOLINT: this is the offset of the sum, not a pointer + + gpu::processNeighboursKernel<<>>( + iLayer, + --level, + allCellSeeds, + thrust::raw_pointer_cast(&lastCellSeed[0]), + thrust::raw_pointer_cast(&lastCellId[0]), + lastCellSeedSize, + nullptr, + nullptr, + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[iLayer - 1], + neighboursDeviceLUTs[iLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); + gpu::cubExclusiveScanInPlace(foundSeedsTable, foundSeedsTable.size()); + auto foundSeeds{foundSeedsTable.back()}; updatedCellId.resize(foundSeeds); thrust::fill(updatedCellId.begin(), updatedCellId.end(), 0); updatedCellSeed.resize(foundSeeds); thrust::fill(updatedCellSeed.begin(), updatedCellSeed.end(), CellSeed()); - gpu::processNeighboursKernel<<>>(iLayer, - level, - allCellSeeds, - thrust::raw_pointer_cast(&lastCellSeed[0]), - thrust::raw_pointer_cast(&lastCellId[0]), - lastCellSeedSize, - thrust::raw_pointer_cast(&updatedCellSeed[0]), - thrust::raw_pointer_cast(&updatedCellId[0]), - thrust::raw_pointer_cast(&foundSeedsTable[0]), - usedClusters, - neighbours[iLayer - 1], - neighboursDeviceLUTs[iLayer - 1], - foundTrackingFrameInfo, - bz, - maxChi2ClusterAttachment, - propagator, - matCorrType); - GPUChkErrS(cudaFree(d_temp_storage)); + gpu::processNeighboursKernel<<>>( + iLayer, + level, + allCellSeeds, + thrust::raw_pointer_cast(&lastCellSeed[0]), + thrust::raw_pointer_cast(&lastCellId[0]), + lastCellSeedSize, + thrust::raw_pointer_cast(&updatedCellSeed[0]), + thrust::raw_pointer_cast(&updatedCellId[0]), + thrust::raw_pointer_cast(&foundSeedsTable[0]), + usedClusters, + neighbours[iLayer - 1], + neighboursDeviceLUTs[iLayer - 1], + foundTrackingFrameInfo, + bz, + maxChi2ClusterAttachment, + propagator, + matCorrType); } + thrust::device_vector outSeeds(updatedCellSeed.size()); auto end = thrust::copy_if(updatedCellSeed.begin(), updatedCellSeed.end(), outSeeds.begin(), gpu::seed_selector(1.e3, maxChi2NDF * ((startLevel + 2) * 2 - 5))); auto s{end - outSeeds.begin()}; std::vector outSeedsHost(s); - thrust::copy(updatedCellSeed.begin(), updatedCellSeed.begin() + s, outSeedsHost.begin()); + thrust::copy(outSeeds.begin(), outSeeds.begin() + s, outSeedsHost.begin()); seedsHost.insert(seedsHost.end(), outSeedsHost.begin(), outSeedsHost.end()); } @@ -1339,7 +1277,8 @@ void trackSeedHandler(CellSeed* trackSeeds, const int nThreads) { thrust::device_vector minPts(minPtsHost); - gpu::fitTrackSeedsKernel<<>>( + gpu::fitTrackSeedsKernel<<>>( trackSeeds, // CellSeed* foundTrackingFrameInfo, // TrackingFrameInfo** tracks, // TrackITSExt* diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h index 58483e4aa9f6f..8f0a471b40c59 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h @@ -50,21 +50,14 @@ class TrackerTraits; class Tracker { + using LogFunc = std::function; public: Tracker(TrackerTraits* traits); - Tracker(const Tracker&) = delete; - Tracker& operator=(const Tracker&) = delete; - ~Tracker(); - void adoptTimeFrame(TimeFrame& tf); - void clustersToTracks( - std::function = [](std::string s) { std::cout << s << std::endl; }, std::function = [](std::string s) { std::cerr << s << std::endl; }); - void clustersToTracksHybrid( - std::function = [](std::string s) { std::cout << s << std::endl; }, std::function = [](std::string s) { std::cerr << s << std::endl; }); - std::vector& getTracks(); + void clustersToTracks(LogFunc = [](std::string s) { std::cout << s << std::endl; }, LogFunc = [](std::string s) { std::cerr << s << std::endl; }); void setParameters(const std::vector&); std::vector& getParameters() { return mTrkParams; } @@ -74,7 +67,7 @@ class Tracker bool isMatLUT() const; void setNThreads(int n); int getNThreads() const; - std::uint32_t mTimeFrameCounter = 0; + void printSummary() const; private: void initialiseTimeFrame(int& iteration); @@ -82,16 +75,7 @@ class Tracker void computeCells(int& iteration); void findCellsNeighbours(int& iteration); void findRoads(int& iteration); - - void initialiseTimeFrameHybrid(int& iteration); - void computeTrackletsHybrid(int& iteration, int& iROFslice, int& iVertex); - void computeCellsHybrid(int& iteration); - void findCellsNeighboursHybrid(int& iteration); - void findRoadsHybrid(int& iteration); - void findTracksHybrid(int& iteration); - void findShortPrimaries(); - void findTracks(); void extendTracks(int& iteration); // MC interaction @@ -100,7 +84,7 @@ class Tracker void rectifyClusterIndices(); template - float evaluateTask(void (Tracker::*)(T...), const char*, std::function logger, T&&... args); + float evaluateTask(void (Tracker::*)(T...), const char*, LogFunc logger, T&&... args); TrackerTraits* mTraits = nullptr; /// Observer pointer, not owned by this class TimeFrame* mTimeFrame = nullptr; /// Observer pointer, not owned by this class @@ -108,7 +92,9 @@ class Tracker std::vector mTrkParams; o2::gpu::GPUChainITS* mRecoChain = nullptr; - unsigned int mNumberOfRuns{0}; + unsigned int mNumberOfDroppedTFs{0}; + unsigned int mTimeFrameCounter{0}; + double mTotalTime{0}; }; inline void Tracker::setParameters(const std::vector& trkPars) @@ -117,8 +103,7 @@ inline void Tracker::setParameters(const std::vector& trkPar } template -float Tracker::evaluateTask(void (Tracker::*task)(T...), const char* taskName, std::function logger, - T&&... args) +float Tracker::evaluateTask(void (Tracker::*task)(T...), const char* taskName, LogFunc logger, T&&... args) { float diff{0.f}; diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h index 46499db92d4d5..6b514c6e8d000 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h @@ -16,23 +16,12 @@ #ifndef TRACKINGITSU_INCLUDE_TRACKERTRAITS_H_ #define TRACKINGITSU_INCLUDE_TRACKERTRAITS_H_ -#include -#include #include -#include -#include -#include -#include -#include -#include #include "DetectorsBase/Propagator.h" -#include "DetectorsBase/MatLayerCylSet.h" #include "ITStracking/Configuration.h" -#include "ITStracking/Definitions.h" #include "ITStracking/MathUtils.h" #include "ITStracking/TimeFrame.h" -#include "ITStracking/Road.h" // #define OPTIMISATION_OUTPUT @@ -52,30 +41,29 @@ class TrackerTraits virtual ~TrackerTraits() = default; virtual void adoptTimeFrame(TimeFrame* tf); virtual void initialiseTimeFrame(const int iteration); + virtual void computeLayerTracklets(const int iteration, int iROFslice, int iVertex); virtual void computeLayerCells(const int iteration); virtual void findCellsNeighbours(const int iteration); virtual void findRoads(const int iteration); - virtual void initialiseTimeFrameHybrid(const int iteration) { LOGP(error, "initialiseTimeFrameHybrid: this method should never be called with CPU traits"); } - virtual void computeTrackletsHybrid(const int iteration, int, int) { LOGP(error, "computeTrackletsHybrid: this method should never be called with CPU traits"); } - virtual void computeCellsHybrid(const int iteration) { LOGP(error, "computeCellsHybrid: this method should never be called with CPU traits"); } - virtual void findCellsNeighboursHybrid(const int iteration) { LOGP(error, "findCellsNeighboursHybrid: this method should never be called with CPU traits"); } - virtual void findRoadsHybrid(const int iteration) { LOGP(error, "findRoadsHybrid: this method should never be called with CPU traits"); } - virtual void findTracksHybrid(const int iteration) { LOGP(error, "findTracksHybrid: this method should never be called with CPU traits"); } - virtual void findTracks() { LOGP(error, "findTracks: this method is deprecated."); } + + virtual bool supportsExtendTracks() const noexcept { return true; } virtual void extendTracks(const int iteration); + virtual bool supportsFindShortPrimaries() const noexcept { return true; } virtual void findShortPrimaries(); - virtual void setBz(float bz); + virtual bool trackFollowing(TrackITSExt* track, int rof, bool outward, const int iteration); virtual void processNeighbours(int iLayer, int iLevel, const std::vector& currentCellSeed, const std::vector& currentCellId, std::vector& updatedCellSeed, std::vector& updatedCellId); void UpdateTrackingParameters(const std::vector& trkPars); TimeFrame* getTimeFrame() { return mTimeFrame; } - void setIsGPU(const unsigned char isgpu) { mIsGPU = isgpu; }; + virtual void setBz(float bz); float getBz() const; void setCorrType(const o2::base::PropagatorImpl::MatCorrType type) { mCorrType = type; } bool isMatLUT() const; + virtual const char* getName() const noexcept { return "CPU"; } + virtual bool isGPU() const noexcept { return false; } // Others GPUhd() static consteval int4 getEmptyBinsRect() { return int4{0, 0, 0, 0}; } @@ -109,13 +97,11 @@ class TrackerTraits o2::gpu::GPUChainITS* mChain = nullptr; TimeFrame* mTimeFrame; std::vector mTrkParams; - bool mIsGPU = false; }; inline void TrackerTraits::initialiseTimeFrame(const int iteration) { mTimeFrame->initialise(iteration, mTrkParams[iteration], mTrkParams[iteration].NLayers); - setIsGPU(false); } inline float TrackerTraits::getBz() const diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h index b584bf6b8008b..6eacb94ebb1ea 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h @@ -37,9 +37,7 @@ class ITSTrackingInterface const bool overrBeamEst) : mIsMC{isMC}, mUseTriggers{trgType}, - mOverrideBeamEstimation{overrBeamEst} - { - } + mOverrideBeamEstimation{overrBeamEst} {} void setClusterDictionary(const o2::itsmft::TopologyDictionary* d) { mDict = d; } void setMeanVertex(const o2::dataformats::MeanVertexObject* v) @@ -56,6 +54,7 @@ class ITSTrackingInterface void initialise(); template void run(framework::ProcessingContext& pc); + void printSummary() const; virtual void updateTimeDependentParams(framework::ProcessingContext& pc); virtual void finaliseCCDB(framework::ConcreteDataMatcher& matcher, void* obj); diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx index 50dc1f5dfd039..c23ba0576c625 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx @@ -25,8 +25,7 @@ #include "ReconstructionDataFormats/Track.h" #include -#include -#include +#include #include #include #include @@ -37,17 +36,16 @@ namespace its { using o2::its::constants::GB; -Tracker::Tracker(o2::its::TrackerTraits* traits) +Tracker::Tracker(o2::its::TrackerTraits* traits) : mTraits(traits) { /// Initialise standard configuration with 1 iteration mTrkParams.resize(1); - mTraits = traits; } -Tracker::~Tracker() = default; - -void Tracker::clustersToTracks(std::function logger, std::function error) +void Tracker::clustersToTracks(LogFunc logger, LogFunc error) { + LogFunc evalLog = [](const std::string&) {}; + double total{0}; mTraits->UpdateTrackingParameters(mTrkParams); int maxNvertices{-1}; @@ -62,22 +60,20 @@ void Tracker::clustersToTracks(std::function logger, std::f if (iteration == 3 && mTrkParams[0].DoUPCIteration) { mTimeFrame->swapMasks(); } - logger(fmt::format("ITS Tracking iteration {} summary:", iteration)); double timeTracklets{0.}, timeCells{0.}, timeNeighbours{0.}, timeRoads{0.}; int nTracklets{0}, nCells{0}, nNeighbours{0}, nTracks{-static_cast(mTimeFrame->getNumberOfTracks())}; - - total += evaluateTask(&Tracker::initialiseTimeFrame, "Timeframe initialisation", logger, iteration); int nROFsIterations = mTrkParams[iteration].nROFsPerIterations > 0 ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1; int iVertex{std::min(maxNvertices, 0)}; + logger(std::format("==== ITS {} Tracking iteration {} summary ====", mTraits->getName(), iteration)); + total += evaluateTask(&Tracker::initialiseTimeFrame, "Timeframe initialisation", logger, iteration); do { for (int iROFs{0}; iROFs < nROFsIterations; ++iROFs) { - timeTracklets += evaluateTask( - &Tracker::computeTracklets, "Tracklet finding", [](std::string) {}, iteration, iROFs, iVertex); + timeTracklets += evaluateTask(&Tracker::computeTracklets, "Tracklet finding", evalLog, iteration, iROFs, iVertex); nTracklets += mTraits->getTFNumberOfTracklets(); if (!mTimeFrame->checkMemory(mTrkParams[iteration].MaxMemory)) { mTimeFrame->printSliceInfo(iROFs, mTrkParams[iteration].nROFsPerIterations); - error(fmt::format("Too much memory used during trackleting in iteration {} in ROF span {}-{}: {:.2f} GB. Current limit is {:.2f} GB, check the detector status and/or the selections.", + error(std::format("Too much memory used during trackleting in iteration {} in ROF span {}-{}: {:.2f} GB. Current limit is {:.2f} GB, check the detector status and/or the selections.", iteration, iROFs, iROFs + mTrkParams[iteration].nROFsPerIterations, mTimeFrame->getArtefactsMemory() / GB, mTrkParams[iteration].MaxMemory / GB)); if (mTrkParams[iteration].DropTFUponFailure) { dropTF = true; @@ -86,17 +82,16 @@ void Tracker::clustersToTracks(std::function logger, std::f } float trackletsPerCluster = mTraits->getTFNumberOfClusters() > 0 ? float(mTraits->getTFNumberOfTracklets()) / mTraits->getTFNumberOfClusters() : 0.f; if (trackletsPerCluster > mTrkParams[iteration].TrackletsPerClusterLimit) { - error(fmt::format("Too many tracklets per cluster ({}) in iteration {} in ROF span {}-{}:, check the detector status and/or the selections. Current limit is {}", + error(std::format("Too many tracklets per cluster ({}) in iteration {} in ROF span {}-{}:, check the detector status and/or the selections. Current limit is {}", trackletsPerCluster, iteration, iROFs, iROFs + mTrkParams[iteration].nROFsPerIterations, mTrkParams[iteration].TrackletsPerClusterLimit)); break; } - timeCells += evaluateTask( - &Tracker::computeCells, "Cell finding", [](std::string) {}, iteration); + timeCells += evaluateTask(&Tracker::computeCells, "Cell finding", evalLog, iteration); nCells += mTraits->getTFNumberOfCells(); if (!mTimeFrame->checkMemory(mTrkParams[iteration].MaxMemory)) { mTimeFrame->printSliceInfo(iROFs, mTrkParams[iteration].nROFsPerIterations); - error(fmt::format("Too much memory used during cell finding in iteration {} in ROF span {}-{}: {:.2f} GB. Current limit is {:.2f} GB, check the detector status and/or the selections.", + error(std::format("Too much memory used during cell finding in iteration {} in ROF span {}-{}: {:.2f} GB. Current limit is {:.2f} GB, check the detector status and/or the selections.", iteration, iROFs, iROFs + mTrkParams[iteration].nROFsPerIterations, mTimeFrame->getArtefactsMemory() / GB, mTrkParams[iteration].MaxMemory / GB)); if (mTrkParams[iteration].DropTFUponFailure) { dropTF = true; @@ -105,131 +100,53 @@ void Tracker::clustersToTracks(std::function logger, std::f } float cellsPerCluster = mTraits->getTFNumberOfClusters() > 0 ? float(mTraits->getTFNumberOfCells()) / mTraits->getTFNumberOfClusters() : 0.f; if (cellsPerCluster > mTrkParams[iteration].CellsPerClusterLimit) { - error(fmt::format("Too many cells per cluster ({}) in iteration {} in ROF span {}-{}, check the detector status and/or the selections. Current limit is {}", + error(std::format("Too many cells per cluster ({}) in iteration {} in ROF span {}-{}, check the detector status and/or the selections. Current limit is {}", cellsPerCluster, iteration, iROFs, iROFs + mTrkParams[iteration].nROFsPerIterations, mTrkParams[iteration].CellsPerClusterLimit)); break; } - timeNeighbours += evaluateTask( - &Tracker::findCellsNeighbours, "Neighbour finding", [](std::string) {}, iteration); + timeNeighbours += evaluateTask(&Tracker::findCellsNeighbours, "Neighbour finding", evalLog, iteration); nNeighbours += mTimeFrame->getNumberOfNeighbours(); - timeRoads += evaluateTask( - &Tracker::findRoads, "Road finding", [](std::string) {}, iteration); + timeRoads += evaluateTask(&Tracker::findRoads, "Road finding", evalLog, iteration); } iVertex++; } while (iVertex < maxNvertices && !dropTF); - logger(fmt::format(" - Tracklet finding: {} tracklets found in {:.2f} ms", nTracklets, timeTracklets)); - logger(fmt::format(" - Cell finding: {} cells found in {:.2f} ms", nCells, timeCells)); - logger(fmt::format(" - Neighbours finding: {} neighbours found in {:.2f} ms", nNeighbours, timeNeighbours)); - logger(fmt::format(" - Track finding: {} tracks found in {:.2f} ms", nTracks + mTimeFrame->getNumberOfTracks(), timeRoads)); + logger(std::format(" - Tracklet finding: {} tracklets found in {:.2f} ms", nTracklets, timeTracklets)); + logger(std::format(" - Cell finding: {} cells found in {:.2f} ms", nCells, timeCells)); + logger(std::format(" - Neighbours finding: {} neighbours found in {:.2f} ms", nNeighbours, timeNeighbours)); + logger(std::format(" - Track finding: {} tracks found in {:.2f} ms", nTracks + mTimeFrame->getNumberOfTracks(), timeRoads)); total += timeTracklets + timeCells + timeNeighbours + timeRoads; - if (mTrkParams[iteration].UseTrackFollower) { + if (mTraits->supportsExtendTracks() && mTrkParams[iteration].UseTrackFollower && !dropTF) { int nExtendedTracks{-mTimeFrame->mNExtendedTracks}, nExtendedClusters{-mTimeFrame->mNExtendedUsedClusters}; auto timeExtending = evaluateTask(&Tracker::extendTracks, "Extending tracks", [](const std::string&) {}, iteration); total += timeExtending; - logger(fmt::format(" - Extending Tracks: {} extended tracks using {} clusters found in {:.2f} ms", nExtendedTracks + mTimeFrame->mNExtendedTracks, nExtendedClusters + mTimeFrame->mNExtendedUsedClusters, timeExtending)); + logger(std::format(" - Extending Tracks: {} extended tracks using {} clusters found in {:.2f} ms", nExtendedTracks + mTimeFrame->mNExtendedTracks, nExtendedClusters + mTimeFrame->mNExtendedUsedClusters, timeExtending)); } if (dropTF) { - error(fmt::format("...Dropping Timeframe...")); + error("...Dropping Timeframe..."); mTimeFrame->dropTracks(); - break; // breaking out the iterations loop + ++mNumberOfDroppedTFs; + return; } } - total += evaluateTask(&Tracker::findShortPrimaries, "Short primaries finding", logger); - - std::stringstream sstream; - if constexpr (constants::DoTimeBenchmarks) { - sstream << std::setw(2) << " - " - << "Timeframe " << mTimeFrameCounter++ << " processing completed in: " << total << "ms using " << mTraits->getNThreads() << " threads."; + if (mTraits->supportsFindShortPrimaries() && mTrkParams[0].FindShortTracks) { + auto nTracksB = mTimeFrame->getNumberOfTracks(); + total += evaluateTask(&Tracker::findShortPrimaries, "Short primaries finding", logger); + auto nTracksA = mTimeFrame->getNumberOfTracks(); + logger(std::format(" `-> found {} additional tracks", nTracksA - nTracksB)); } - logger(sstream.str()); - if (mTimeFrame->hasMCinformation()) { - computeTracksMClabels(); - } - rectifyClusterIndices(); - mNumberOfRuns++; -} - -void Tracker::clustersToTracksHybrid(std::function logger, std::function error) -{ - double total{0.}; - mTraits->UpdateTrackingParameters(mTrkParams); - int maxNvertices{-1}; - if (mTrkParams[0].PerPrimaryVertexProcessing) { - for (int iROF{0}; iROF < mTimeFrame->getNrof(); ++iROF) { - maxNvertices = std::max(maxNvertices, (int)mTimeFrame->getPrimaryVertices(iROF).size()); - } - } - - for (int iteration = 0; iteration < (int)mTrkParams.size(); ++iteration) { - int nROFsIterations = mTrkParams[iteration].nROFsPerIterations > 0 ? mTimeFrame->getNrof() / mTrkParams[iteration].nROFsPerIterations + bool(mTimeFrame->getNrof() % mTrkParams[iteration].nROFsPerIterations) : 1; - logger(fmt::format("=========== ITS Hybrid Tracking iteration {} summary ===========", iteration, nROFsIterations, maxNvertices)); - double timeTracklets{0.}, timeCells{0.}, timeNeighbours{0.}, timeRoads{0.}; - int nTracklets{0}, nCells{0}, nNeighbours{0}, nTracks{-static_cast(mTimeFrame->getNumberOfTracks())}; - - total += evaluateTask(&Tracker::initialiseTimeFrameHybrid, "Hybrid Timeframe initialisation", logger, iteration); - int iVertex{std::min(maxNvertices, 0)}; - - do { - for (int iROFs{0}; iROFs < nROFsIterations; ++iROFs) { - timeTracklets += evaluateTask( - &Tracker::computeTrackletsHybrid, "Tracklet finding", [](std::string) {}, iteration, iROFs, iVertex); - nTracklets += mTraits->getTFNumberOfTracklets(); - if (!mTimeFrame->checkMemory(mTrkParams[iteration].MaxMemory)) { - error(fmt::format("Too much memory used during trackleting in iteration {}, check the detector status and/or the selections.", iteration)); - break; - } - float trackletsPerCluster = mTraits->getTFNumberOfClusters() > 0 ? float(mTraits->getTFNumberOfTracklets()) / mTraits->getTFNumberOfClusters() : 0.f; - if (trackletsPerCluster > mTrkParams[iteration].TrackletsPerClusterLimit) { - error(fmt::format("Too many tracklets per cluster ({}) in iteration {}, check the detector status and/or the selections. Current limit is {}", trackletsPerCluster, iteration, mTrkParams[iteration].TrackletsPerClusterLimit)); - break; - } - - timeCells += evaluateTask( - &Tracker::computeCellsHybrid, "Cell finding", [](std::string) {}, iteration); - nCells += mTraits->getTFNumberOfCells(); - if (!mTimeFrame->checkMemory(mTrkParams[iteration].MaxMemory)) { - error(fmt::format("Too much memory used during cell finding in iteration {}, check the detector status and/or the selections.", iteration)); - break; - } - float cellsPerCluster = mTraits->getTFNumberOfClusters() > 0 ? float(mTraits->getTFNumberOfCells()) / mTraits->getTFNumberOfClusters() : 0.f; - if (cellsPerCluster > mTrkParams[iteration].CellsPerClusterLimit) { - error(fmt::format("Too many cells per cluster ({}) in iteration {}, check the detector status and/or the selections. Current limit is {}", cellsPerCluster, iteration, mTrkParams[iteration].CellsPerClusterLimit)); - break; - } - - timeNeighbours += evaluateTask( - &Tracker::findCellsNeighboursHybrid, "Neighbour finding", [](std::string) {}, iteration); - nNeighbours += mTimeFrame->getNumberOfNeighbours(); - timeRoads += evaluateTask( - &Tracker::findRoads, "Road finding", [](std::string) {}, iteration); - } - iVertex++; - } while (iVertex < maxNvertices); - logger(fmt::format(" - Hybrid tracklet finding: {} tracklets found in {:.2f} ms", nTracklets, timeTracklets)); - logger(fmt::format(" - Hybrid cell finding: {} cells found in {:.2f} ms", nCells, timeCells)); - logger(fmt::format(" - Hybrid neighbours finding: {} neighbours found in {:.2f} ms", nNeighbours, timeNeighbours)); - logger(fmt::format(" - Hybrid track finding: {} tracks found in {:.2f} ms", nTracks + mTimeFrame->getNumberOfTracks(), timeRoads)); - total += timeTracklets + timeCells + timeNeighbours + timeRoads; - // total += evaluateTask(&Tracker::extendTracks, "Hybrid extending tracks", logger, iteration); - } - - // total += evaluateTask(&Tracker::findShortPrimaries, "Hybrid short primaries finding", logger); - - std::stringstream sstream; if constexpr (constants::DoTimeBenchmarks) { - sstream << std::setw(2) << " - " - << "Timeframe " << mTimeFrameCounter++ << " processing completed in: " << total << "ms using " << mTraits->getNThreads() << " threads."; + logger(std::format("=== TimeFrame {} processing completed in: {:.2f} ms using {} thread(s) ===", mTimeFrameCounter, total, mTraits->getNThreads())); } - logger(sstream.str()); if (mTimeFrame->hasMCinformation()) { computeTracksMClabels(); } rectifyClusterIndices(); - mNumberOfRuns++; + ++mTimeFrameCounter; + mTotalTime += total; } void Tracker::initialiseTimeFrame(int& iteration) @@ -257,41 +174,6 @@ void Tracker::findRoads(int& iteration) mTraits->findRoads(iteration); } -void Tracker::initialiseTimeFrameHybrid(int& iteration) -{ - mTraits->initialiseTimeFrameHybrid(iteration); -} - -void Tracker::computeTrackletsHybrid(int& iteration, int& iROFslice, int& iVertex) -{ - mTraits->computeTrackletsHybrid(iteration, iROFslice, iVertex); // placeholder for the proper ROF/vertex slicing -} - -void Tracker::computeCellsHybrid(int& iteration) -{ - mTraits->computeCellsHybrid(iteration); -} - -void Tracker::findCellsNeighboursHybrid(int& iteration) -{ - mTraits->findCellsNeighboursHybrid(iteration); -} - -void Tracker::findRoadsHybrid(int& iteration) -{ - mTraits->findRoadsHybrid(iteration); -} - -void Tracker::findTracksHybrid(int& iteration) -{ - mTraits->findTracksHybrid(iteration); -} - -void Tracker::findTracks() -{ - mTraits->findTracks(); -} - void Tracker::extendTracks(int& iteration) { mTraits->extendTracks(iteration); @@ -575,5 +457,11 @@ int Tracker::getNThreads() const { return mTraits->getNThreads(); } + +void Tracker::printSummary() const +{ + LOGP(info, "Tracker summary: Processed {} TFs (dropped {}) in TOT={:.2f} s, AVG/TF={:.2f} s", mTimeFrameCounter, mNumberOfDroppedTFs, mTotalTime * 1.e-3, mTotalTime * 1.e-3 / ((mTimeFrameCounter > 0) ? (double)mTimeFrameCounter : -1.0)); +} + } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 8dcb7bfd315c1..987e8e3128fb4 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -19,7 +19,9 @@ #include #include -#include +#ifdef OPTIMISATION_OUTPUT +#include +#endif #include "CommonConstants/MathConstants.h" #include "DetectorsBase/Propagator.h" @@ -38,7 +40,7 @@ using o2::base::PropagatorF; namespace { -float Sq(float q) +inline float Sq(float q) { return q * q; } @@ -57,7 +59,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in #ifdef OPTIMISATION_OUTPUT static int iter{0}; - std::ofstream off(fmt::format("tracklets{}.txt", iter++)); + std::ofstream off(std::format("tracklets{}.txt", iter++)); #endif for (int iLayer = 0; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { @@ -173,7 +175,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in break; } } - off << fmt::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, label.isValid(), (tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate) / sigmaZ, tanLambda, resolution, sigmaZ) << std::endl; + off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, label.isValid(), (tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate) / sigmaZ, tanLambda, resolution, sigmaZ) << std::endl; #endif if (deltaZ / sigmaZ < mTrkParams[iteration].NSigmaCut && @@ -270,7 +272,7 @@ void TrackerTraits::computeLayerCells(const int iteration) { #ifdef OPTIMISATION_OUTPUT static int iter{0}; - std::ofstream off(fmt::format("cells{}.txt", iter++)); + std::ofstream off(std::format("cells{}.txt", iter++)); #endif for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { @@ -318,7 +320,7 @@ void TrackerTraits::computeLayerCells(const int iteration) #ifdef OPTIMISATION_OUTPUT bool good{tf->getTrackletsLabel(iLayer)[iTracklet] == tf->getTrackletsLabel(iLayer + 1)[iNextTracklet]}; float signedDelta{currentTracklet.tanLambda - nextTracklet.tanLambda}; - off << fmt::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl; + off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl; #endif if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) { @@ -402,7 +404,7 @@ void TrackerTraits::computeLayerCells(const int iteration) void TrackerTraits::findCellsNeighbours(const int iteration) { #ifdef OPTIMISATION_OUTPUT - std::ofstream off(fmt::format("cellneighs{}.txt", iteration)); + std::ofstream off(std::format("cellneighs{}.txt", iteration)); #endif for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { const int nextLayerCellsNum{static_cast(mTimeFrame->getCells()[iLayer + 1].size())}; @@ -439,7 +441,7 @@ void TrackerTraits::findCellsNeighbours(const int iteration) #ifdef OPTIMISATION_OUTPUT bool good{mTimeFrame->getCellsLabel(iLayer)[iCell] == mTimeFrame->getCellsLabel(iLayer + 1)[iNextCell]}; - off << fmt::format("{}\t{:d}\t{}", iLayer, good, chi2) << std::endl; + off << std::format("{}\t{:d}\t{}", iLayer, good, chi2) << std::endl; #endif if (chi2 > mTrkParams[0].MaxChi2ClusterAttachment) { @@ -469,6 +471,7 @@ void TrackerTraits::findCellsNeighbours(const int iteration) void TrackerTraits::processNeighbours(int iLayer, int iLevel, const std::vector& currentCellSeed, const std::vector& currentCellId, std::vector& updatedCellSeeds, std::vector& updatedCellsIds) { + bool print = iLayer == 3 && iLevel == 2; if (iLevel < 2 || iLayer < 1) { std::cout << "Error: layer " << iLayer << " or level " << iLevel << " cannot be processed by processNeighbours" << std::endl; exit(1); @@ -723,10 +726,7 @@ void TrackerTraits::extendTracks(const int iteration) void TrackerTraits::findShortPrimaries() { - if (!mTrkParams[0].FindShortTracks) { - return; - } - auto propagator = o2::base::Propagator::Instance(); + const auto propagator = o2::base::Propagator::Instance(); mTimeFrame->fillPrimaryVerticesXandAlpha(); for (auto& cell : mTimeFrame->getCells()[0]) { diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx index 613402ce56e97..f0dad2722a301 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx @@ -310,18 +310,10 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc) mTimeFrame->setMultiplicityCutMask(processingMask); mTimeFrame->setROFMask(processUPCMask); // Run CA tracker - if constexpr (isGPU) { - if (mMode == o2::its::TrackingMode::Async && o2::its::TrackerParamConfig::Instance().fataliseUponFailure) { - mTracker->clustersToTracksHybrid(logger, fatalLogger); - } else { - mTracker->clustersToTracksHybrid(logger, errorLogger); - } + if (mMode == o2::its::TrackingMode::Async && o2::its::TrackerParamConfig::Instance().fataliseUponFailure) { + mTracker->clustersToTracks(logger, fatalLogger); } else { - if (mMode == o2::its::TrackingMode::Async && o2::its::TrackerParamConfig::Instance().fataliseUponFailure) { - mTracker->clustersToTracks(logger, fatalLogger); - } else { - mTracker->clustersToTracks(logger, errorLogger); - } + mTracker->clustersToTracks(logger, errorLogger); } size_t totTracks{mTimeFrame->getNumberOfTracks()}, totClusIDs{mTimeFrame->getNumberOfUsedClusters()}; allTracks.reserve(totTracks); @@ -438,6 +430,11 @@ void ITSTrackingInterface::finaliseCCDB(ConcreteDataMatcher& matcher, void* obj) } } +void ITSTrackingInterface::printSummary() const +{ + mTracker->printSummary(); +} + void ITSTrackingInterface::setTraitsFromProvider(VertexerTraits* vertexerTraits, TrackerTraits* trackerTraits, TimeFrame* frame) diff --git a/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx index 9e4c98ad6e9a1..abbb88aea42fa 100644 --- a/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx +++ b/Detectors/ITSMFT/ITS/workflow/src/TrackerSpec.cxx @@ -48,6 +48,7 @@ void TrackerDPL::init(InitContext& ic) void TrackerDPL::stop() { + mITSTrackingInterface.printSummary(); LOGF(info, "CPU Reconstruction total timing: Cpu: %.3e Real: %.3e s in %d slots", mTimer.CpuTime(), mTimer.RealTime(), mTimer.Counter() - 1); } @@ -69,6 +70,7 @@ void TrackerDPL::finaliseCCDB(ConcreteDataMatcher& matcher, void* obj) void TrackerDPL::endOfStream(EndOfStreamContext& ec) { + mITSTrackingInterface.printSummary(); LOGF(info, "ITS CA-Tracker total timing: Cpu: %.3e Real: %.3e s in %d slots", mTimer.CpuTime(), mTimer.RealTime(), mTimer.Counter() - 1); } diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 95db55041184f..fc4d838abcea3 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -68,7 +68,7 @@ endfunction() STRING(REGEX REPLACE "\-std=[^ ]*" "" O2_GPU_CMAKE_CXX_FLAGS_NOSTD "${CMAKE_CXX_FLAGS}") # ---------------------------------- Fast Math / Deterministic Mode ---------------------------------- -# set(GPUCA_DETERMINISTIC_MODE WHOLEO2) # Override +set(GPUCA_DETERMINISTIC_MODE WHOLEO2) # Override set(GPUCA_DETERMINISTIC_MODE_MAP_OFF 0) set(GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH 1) # No -ffast-math and similar compile flags for GPU folder set(GPUCA_DETERMINISTIC_MODE_MAP_OPTO2 2) # In addition, -O2 optimization on host for GPU folder