From b942bbae32989e04bb39a415897f087530aed022 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Mon, 2 Jun 2025 18:42:46 +0200 Subject: [PATCH 1/6] ITS: allow sharing of arena in Tracker & Vertexer Signed-off-by: Felix Schlepper --- .../tracking/include/ITStracking/Tracker.h | 5 +-- .../include/ITStracking/TrackerTraits.h | 7 ++-- .../include/ITStracking/TrackingInterface.h | 3 ++ .../tracking/include/ITStracking/Vertexer.h | 5 +++ .../include/ITStracking/VertexerTraits.h | 9 +++--- Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx | 1 - .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 32 ++++++++++--------- .../ITS/tracking/src/TrackingInterface.cxx | 15 +++++++++ .../ITS/tracking/src/VertexerTraits.cxx | 26 ++++++++------- 9 files changed, 64 insertions(+), 39 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h index b393d743809fd..5ba9b5039f808 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Tracker.h @@ -27,6 +27,8 @@ #include #include +#include + #include "ITStracking/Configuration.h" #include "CommonConstants/MathConstants.h" #include "ITStracking/Definitions.h" @@ -73,8 +75,7 @@ class Tracker void setBz(float bz) { mTraits->setBz(bz); } void setCorrType(const o2::base::PropagatorImpl::MatCorrType type) { mTraits->setCorrType(type); } bool isMatLUT() const { return mTraits->isMatLUT(); } - void setNThreads(int n) { mTraits->setNThreads(n); } - int getNThreads() const { return mTraits->getNThreads(); } + void setNThreads(int n, std::shared_ptr& arena) { mTraits->setNThreads(n, arena); } void printSummary() const; private: diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h index 36956a5206277..7ba67a01fce13 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackerTraits.h @@ -80,8 +80,8 @@ class TrackerTraits void SetRecoChain(o2::gpu::GPUChainITS* chain) { mChain = chain; } void setSmoothing(bool v) { mApplySmoothing = v; } bool getSmoothing() const { return mApplySmoothing; } - void setNThreads(int n); - int getNThreads() const { return mNThreads; } + void setNThreads(int n, std::shared_ptr& arena); + int getNThreads() { return mTaskArena->max_concurrency(); } o2::gpu::GPUChainITS* getChain() const { return mChain; } @@ -94,10 +94,9 @@ class TrackerTraits track::TrackParCov buildTrackSeed(const Cluster& cluster1, const Cluster& cluster2, const TrackingFrameInfo& tf3); bool fitTrack(TrackITSExt& track, int start, int end, int step, float chi2clcut = o2::constants::math::VeryBig, float chi2ndfcut = o2::constants::math::VeryBig, float maxQoverPt = o2::constants::math::VeryBig, int nCl = 0); - int mNThreads = 1; bool mApplySmoothing = false; std::shared_ptr mMemoryPool; - tbb::task_arena mTaskArena; + std::shared_ptr mTaskArena; protected: o2::base::PropagatorImpl::MatCorrType mCorrType = o2::base::PropagatorImpl::MatCorrType::USEMatCorrNONE; diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h index cff6d215e5e3b..5e2316556ca14 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TrackingInterface.h @@ -28,6 +28,8 @@ #include "GPUO2Interface.h" #include "GPUChainITS.h" +#include + namespace o2::its { class ITSTrackingInterface @@ -97,6 +99,7 @@ class ITSTrackingInterface std::unique_ptr mVertexer = nullptr; const o2::dataformats::MeanVertexObject* mMeanVertex; std::shared_ptr mMemoryPool; + std::shared_ptr mTaskArena; }; } // namespace o2::its diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h index 63dd41b4a0a8f..98bcb95ef65df 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/Vertexer.h @@ -21,6 +21,9 @@ #include #include #include +#include + +#include #include "ITStracking/ROframe.h" #include "ITStracking/Constants.h" @@ -90,6 +93,8 @@ class Vertexer const unsigned selectedN, const unsigned int vertexN, const float initT, const float trackletT, const float selecT, const float vertexT); + void setNThreads(int n, std::shared_ptr& arena) { mTraits->setNThreads(n, arena); } + private: std::uint32_t mTimeFrameCounter = 0; diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h index e1e1d44e8ead9..6554e53fa2ee8 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/VertexerTraits.h @@ -17,6 +17,7 @@ #define O2_ITS_TRACKING_VERTEXER_TRAITS_H_ #include +#include #include #include @@ -93,8 +94,8 @@ class VertexerTraits auto getVertexingParameters() const { return mVrtParams; } void setVertexingParameters(std::vector& vertParams) { mVrtParams = vertParams; } void dumpVertexerTraits(); - void setNThreads(int n); - int getNThreads() const { return mNThreads; } + void setNThreads(int n, std::shared_ptr& arena); + int getNThreads() { return mTaskArena->max_concurrency(); } virtual bool isGPU() const noexcept { return false; } virtual const char* getName() const noexcept { return "CPU"; } virtual bool usesMemoryPool() const noexcept { return true; } @@ -116,8 +117,6 @@ class VertexerTraits } protected: - int mNThreads = 1; - std::vector mVrtParams; IndexTableUtils mIndexTableUtils; @@ -125,7 +124,7 @@ class VertexerTraits TimeFrame7* mTimeFrame = nullptr; // observer ptr private: std::shared_ptr mMemoryPool; - tbb::task_arena mTaskArena; + std::shared_ptr mTaskArena; }; inline void VertexerTraits::initialise(const TrackingParameters& trackingParams, const int iteration) diff --git a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx index c92d1e8505356..09d9cee06d9f9 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Tracker.cxx @@ -342,7 +342,6 @@ void Tracker::getGlobalConfiguration() } else { mTraits->setCorrType(o2::base::PropagatorImpl::MatCorrType::USEMatCorrLUT); } - setNThreads(tc.nThreads); int nROFsPerIterations = tc.nROFsPerIterations > 0 ? tc.nROFsPerIterations : -1; if (tc.nOrbitsPerIterations > 0) { /// code to be used when the number of ROFs per orbit is known, this gets priority over the number of ROFs per iteration diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 36636069137f3..186c5a99e7acb 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -71,7 +71,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROF int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, mTrkParams[iteration].TrackletsPerRoad()), [&](const tbb::blocked_range& Layers) { @@ -200,7 +200,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROF return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, mTrkParams[iteration].CellsPerRoad()), [&](const tbb::blocked_range& Layers) { @@ -229,7 +229,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROF /// Layer 0 is done outside the loop // in-place deduplication auto& trklt0 = mTimeFrame->getTracklets()[0]; - mTaskArena.execute([&] { tbb::parallel_sort(trklt0.begin(), trklt0.end(), sortTracklets); }); + mTaskArena->execute([&] { tbb::parallel_sort(trklt0.begin(), trklt0.end(), sortTracklets); }); trklt0.erase(std::unique(trklt0.begin(), trklt0.end(), equalTracklets), trklt0.end()); trklt0.shrink_to_fit(); @@ -275,7 +275,7 @@ void TrackerTraits::computeLayerCells(const int iteration) } } - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, mTrkParams[iteration].CellsPerRoad()), [&](const tbb::blocked_range& Layers) { @@ -496,7 +496,7 @@ void TrackerTraits::findCellsNeighbours(const int iteration) continue; } - mTaskArena.execute([&] { + mTaskArena->execute([&] { int layerCellsNum{static_cast(mTimeFrame->getCells()[iLayer].size())}; bounded_vector perCellCount(layerCellsNum + 1, 0, mMemoryPool.get()); @@ -621,7 +621,7 @@ void TrackerTraits::processNeighbours(int iLayer, int iLevel, const bou int failed[5]{0, 0, 0, 0, 0}, attempts{0}, failedByMismatch{0}; #endif - mTaskArena.execute([&] { + mTaskArena->execute([&] { bounded_vector perCellCount(currentCellSeed.size() + 1, 0, mMemoryPool.get()); tbb::parallel_for( tbb::blocked_range(0, (int)currentCellSeed.size()), @@ -812,7 +812,7 @@ void TrackerTraits::findRoads(const int iteration) } bounded_vector tracks(mMemoryPool.get()); - mTaskArena.execute([&] { + mTaskArena->execute([&] { bounded_vector perSeedCount(trackSeeds.size() + 1, 0, mMemoryPool.get()); tbb::parallel_for( tbb::blocked_range(0, (int)trackSeeds.size()), @@ -1258,17 +1258,19 @@ bool TrackerTraits::isMatLUT() const } template -void TrackerTraits::setNThreads(int n) +void TrackerTraits::setNThreads(int n, std::shared_ptr& arena) { - if (mNThreads == n && mTaskArena.is_active()) { - return; - } - mNThreads = n > 0 ? n : 1; #if defined(OPTIMISATION_OUTPUT) || defined(CA_DEBUG) - mNThreads = 1; // only works while serial + mTaskArena = std::make_shared(1); +#else + if (arena == nullptr) { + mTaskArena = std::make_shared(std::abs(n)); + LOGP(info, "Setting tracker with {} threads.", n); + } else { + mTaskArena = arena; + LOGP(info, "Attaching tracker to calling thread's arena"); + } #endif - mTaskArena.initialize(mNThreads); - LOGP(info, "Setting tracker with {} threads.", mNThreads); } template class TrackerTraits<7>; diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx index c70108b4f8a30..4d6548e6a3028 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx @@ -16,6 +16,7 @@ #include "ITSReconstruction/FastMultEst.h" #include "ITStracking/TrackingInterface.h" +#include #include #include "DataFormatsITSMFT/ROFRecord.h" @@ -148,6 +149,20 @@ void ITSTrackingInterface::initialise() } mTracker->setParameters(trackParams); mVertexer->setParameters(vertParams); + if (trackConf.nThreads == vertConf.nThreads) { + bool clamped{false}; + int nThreads = trackConf.nThreads; + if (nThreads > 0) { + const int hw = std::thread::hardware_concurrency(); + const int maxThreads = (hw == 0 ? 1 : hw); + nThreads = std::clamp(nThreads, 1, maxThreads); + clamped = trackConf.nThreads > maxThreads; + } + LOGP(info, "Tracker and Vertexer will share the task arena with {} thread(s){}", nThreads, (clamped) ? " (clamped)" : ""); + mTaskArena = std::make_shared(std::abs(nThreads)); + } + mVertexer->setNThreads(vertConf.nThreads, mTaskArena); + mTracker->setNThreads(trackConf.nThreads, mTaskArena); } void ITSTrackingInterface::run(framework::ProcessingContext& pc) diff --git a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx index 37b650c05bd61..51cd98aa1366d 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/VertexerTraits.cxx @@ -11,6 +11,7 @@ /// #include +#include #include #include @@ -168,13 +169,12 @@ void VertexerTraits::updateVertexingParameters(const std::vector(std::ceil(mIndexTableUtils.getNphiBins() * par.phiCut / o2::constants::math::TwoPI)); par.zSpan = static_cast(std::ceil(par.zCut * mIndexTableUtils.getInverseZCoordinate(0))); } - setNThreads(vrtPar[0].nThreads); } // Main functions void VertexerTraits::computeTracklets(const int iteration) { - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, (short)mTimeFrame->getNrof()), [&](const tbb::blocked_range& Rofs) { @@ -220,7 +220,7 @@ void VertexerTraits::computeTracklets(const int iteration) mTimeFrame->getTracklets()[0].resize(mTimeFrame->getTotalTrackletsTF(0)); mTimeFrame->getTracklets()[1].resize(mTimeFrame->getTotalTrackletsTF(1)); - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, (short)mTimeFrame->getNrof()), [&](const tbb::blocked_range& Rofs) { @@ -329,7 +329,7 @@ void VertexerTraits::computeTracklets(const int iteration) void VertexerTraits::computeTrackletMatching(const int iteration) { - mTaskArena.execute([&] { + mTaskArena->execute([&] { tbb::parallel_for( tbb::blocked_range(0, (short)mTimeFrame->getNrof()), [&](const tbb::blocked_range& Rofs) { @@ -687,15 +687,17 @@ void VertexerTraits::computeVerticesInRof(int rofId, verticesInRof.push_back(foundVertices); } -void VertexerTraits::setNThreads(int n) +void VertexerTraits::setNThreads(int n, std::shared_ptr& arena) { - if (mNThreads == n && mTaskArena.is_active()) { - return; - } - mNThreads = n > 0 ? n : 1; #if defined(VTX_DEBUG) - mNThreads = 1; + mTaskArena = std::make_shared(1); +#else + if (arena == nullptr) { + mTaskArena = std::make_shared(std::abs(n)); + LOGP(info, "Setting seeding vertexer with {} threads.", n); + } else { + mTaskArena = arena; + LOGP(info, "Attaching vertexer to calling thread's arena"); + } #endif - mTaskArena.initialize(mNThreads); - LOGP(info, "Setting seeding vertexer with {} threads.", mNThreads); } From fe3894b25ccef6ce100356939d7dfa01940a1372 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Tue, 3 Jun 2025 11:11:38 +0200 Subject: [PATCH 2/6] ITS: recover single threaded performance in processNeighbours Signed-off-by: Felix Schlepper --- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 233 +++++++++--------- 1 file changed, 114 insertions(+), 119 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 186c5a99e7acb..a530966d2d36e 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -14,9 +14,11 @@ /// #include +#include #include #include #include +#include #ifdef OPTIMISATION_OUTPUT #include @@ -43,6 +45,12 @@ namespace o2::its static constexpr int debugLevel{0}; +struct PassMode { + using OnePass = std::integral_constant; + using TwoPassCount = std::integral_constant; + using TwoPassInsert = std::integral_constant; +}; + template void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, int iVertex) { @@ -622,140 +630,127 @@ void TrackerTraits::processNeighbours(int iLayer, int iLevel, const bou #endif mTaskArena->execute([&] { - bounded_vector perCellCount(currentCellSeed.size() + 1, 0, mMemoryPool.get()); - tbb::parallel_for( - tbb::blocked_range(0, (int)currentCellSeed.size()), - [&](const tbb::blocked_range& Cells) { - for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { - const CellSeed& currentCell{currentCellSeed[iCell]}; - int foundSeeds{0}; - if (currentCell.getLevel() != iLevel) { - continue; - } - if (currentCellId.empty() && (mTimeFrame->isClusterUsed(iLayer, currentCell.getFirstClusterIndex()) || - mTimeFrame->isClusterUsed(iLayer + 1, currentCell.getSecondClusterIndex()) || - mTimeFrame->isClusterUsed(iLayer + 2, currentCell.getThirdClusterIndex()))) { - continue; /// this we do only on the first iteration, hence the check on currentCellId - } - const int cellId = currentCellId.empty() ? iCell : currentCellId[iCell]; - const int startNeighbourId{cellId ? mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId - 1] : 0}; - const int endNeighbourId{mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId]}; - - for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { - CA_DEBUGGER(attempts++); - const int neighbourCellId = mTimeFrame->getCellsNeighbours()[iLayer - 1][iNeighbourCell]; - const CellSeed& neighbourCell = mTimeFrame->getCells()[iLayer - 1][neighbourCellId]; - if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) { - CA_DEBUGGER(failedByMismatch++); - continue; - } - if (mTimeFrame->isClusterUsed(iLayer - 1, neighbourCell.getFirstClusterIndex())) { - continue; - } - if (currentCell.getLevel() - 1 != neighbourCell.getLevel()) { - CA_DEBUGGER(failed[0]++); - continue; - } - /// Let's start the fitting procedure - CellSeed seed{currentCell}; - auto& trHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer - 1)[neighbourCell.getFirstClusterIndex()]; + auto forCellNeighbours = [&](auto Tag, int iCell, int offset = 0) -> int { + const CellSeed& currentCell{currentCellSeed[iCell]}; - if (!seed.rotate(trHit.alphaTrackingFrame)) { - CA_DEBUGGER(failed[1]++); - continue; - } - - if (!propagator->propagateToX(seed, trHit.xTrackingFrame, getBz(), o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, mCorrType)) { - CA_DEBUGGER(failed[2]++); - continue; - } + if constexpr (decltype(Tag)::value != PassMode::TwoPassInsert::value) { + if (currentCell.getLevel() != iLevel) { + return 0; + } + if (currentCellId.empty() && (mTimeFrame->isClusterUsed(iLayer, currentCell.getFirstClusterIndex()) || + mTimeFrame->isClusterUsed(iLayer + 1, currentCell.getSecondClusterIndex()) || + mTimeFrame->isClusterUsed(iLayer + 2, currentCell.getThirdClusterIndex()))) { + return 0; /// this we do only on the first iteration, hence the check on currentCellId + } + } - if (mCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { - if (!seed.correctForMaterial(mTrkParams[0].LayerxX0[iLayer - 1], mTrkParams[0].LayerxX0[iLayer - 1] * constants::Radl * constants::Rho, true)) { - continue; - } - } + const int cellId = currentCellId.empty() ? iCell : currentCellId[iCell]; + const int startNeighbourId{cellId ? mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId - 1] : 0}; + const int endNeighbourId{mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId]}; + int foundSeeds{0}; + for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { + CA_DEBUGGER(attempts++); + const int neighbourCellId = mTimeFrame->getCellsNeighbours()[iLayer - 1][iNeighbourCell]; + const CellSeed& neighbourCell = mTimeFrame->getCells()[iLayer - 1][neighbourCellId]; + if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex()) { + CA_DEBUGGER(failedByMismatch++); + continue; + } + if (mTimeFrame->isClusterUsed(iLayer - 1, neighbourCell.getFirstClusterIndex())) { + continue; + } + if (currentCell.getLevel() - 1 != neighbourCell.getLevel()) { + CA_DEBUGGER(failed[0]++); + continue; + } + /// Let's start the fitting procedure + CellSeed seed{currentCell}; + const auto& trHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer - 1)[neighbourCell.getFirstClusterIndex()]; - auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; - if ((predChi2 > mTrkParams[0].MaxChi2ClusterAttachment) || predChi2 < 0.f) { - CA_DEBUGGER(failed[3]++); - continue; - } - seed.setChi2(seed.getChi2() + predChi2); - if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { - CA_DEBUGGER(failed[4]++); - continue; - } - ++foundSeeds; - } - perCellCount[iCell] = foundSeeds; + if (!seed.rotate(trHit.alphaTrackingFrame)) { + CA_DEBUGGER(failed[1]++); + continue; } - }); - std::exclusive_scan(perCellCount.begin(), perCellCount.end(), perCellCount.begin(), 0); - auto totalNeighbours{perCellCount.back()}; - if (totalNeighbours == 0) { - return; - } - updatedCellSeeds.resize(totalNeighbours); - updatedCellsIds.resize(totalNeighbours); + if (!propagator->propagateToX(seed, trHit.xTrackingFrame, getBz(), o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, mCorrType)) { + CA_DEBUGGER(failed[2]++); + continue; + } - tbb::parallel_for( - tbb::blocked_range(0, (int)currentCellSeed.size()), - [&](const tbb::blocked_range& Cells) { - for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { - if (perCellCount[iCell] == perCellCount[iCell + 1]) { + if (mCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { + if (!seed.correctForMaterial(mTrkParams[0].LayerxX0[iLayer - 1], mTrkParams[0].LayerxX0[iLayer - 1] * constants::Radl * constants::Rho, true)) { continue; } - // no need for further checks on cell level - - const CellSeed& currentCell{currentCellSeed[iCell]}; - const int cellId = currentCellId.empty() ? iCell : currentCellId[iCell]; - const int startNeighbourId{cellId ? mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId - 1] : 0}; - const int endNeighbourId{mTimeFrame->getCellsNeighboursLUT()[iLayer - 1][cellId]}; - - int offset = perCellCount[iCell]; - for (int iNeighbourCell{startNeighbourId}; iNeighbourCell < endNeighbourId; ++iNeighbourCell) { - const int neighbourCellId = mTimeFrame->getCellsNeighbours()[iLayer - 1][iNeighbourCell]; - const CellSeed& neighbourCell = mTimeFrame->getCells()[iLayer - 1][neighbourCellId]; - if (neighbourCell.getSecondTrackletIndex() != currentCell.getFirstTrackletIndex() || - mTimeFrame->isClusterUsed(iLayer - 1, neighbourCell.getFirstClusterIndex()) || - currentCell.getLevel() - 1 != neighbourCell.getLevel()) { - continue; - } + } - auto seed = currentCell; + auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; + if ((predChi2 > mTrkParams[0].MaxChi2ClusterAttachment) || predChi2 < 0.f) { + CA_DEBUGGER(failed[3]++); + continue; + } + seed.setChi2(seed.getChi2() + predChi2); + if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { + CA_DEBUGGER(failed[4]++); + continue; + } - const auto& trHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer - 1)[neighbourCell.getFirstClusterIndex()]; - if (!seed.rotate(trHit.alphaTrackingFrame) || !propagator->propagateToX(seed, trHit.xTrackingFrame, getBz(), o2::base::PropagatorImpl::MAX_SIN_PHI, o2::base::PropagatorImpl::MAX_STEP, mCorrType)) { - continue; - } + if constexpr (decltype(Tag)::value != PassMode::TwoPassCount::value) { + seed.getClusters()[iLayer - 1] = neighbourCell.getFirstClusterIndex(); + seed.setLevel(neighbourCell.getLevel()); + seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); + seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); + } - if (mCorrType == o2::base::PropagatorF::MatCorrType::USEMatCorrNONE) { - if (!seed.correctForMaterial(mTrkParams[0].LayerxX0[iLayer - 1], mTrkParams[0].LayerxX0[iLayer - 1] * constants::Radl * constants::Rho, true)) { - continue; - } - } + if constexpr (decltype(Tag)::value == PassMode::OnePass::value) { + updatedCellSeeds.push_back(seed); + updatedCellsIds.push_back(neighbourCellId); + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassCount::value) { + ++foundSeeds; + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassInsert::value) { + updatedCellSeeds[offset] = seed; + updatedCellsIds[offset++] = neighbourCellId; + } else { + static_assert(false, "Unknown mode!"); + } + } + return foundSeeds; + }; - auto predChi2{seed.getPredictedChi2Quiet(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)}; - if ((predChi2 > mTrkParams[0].MaxChi2ClusterAttachment) || predChi2 < 0.f) { - continue; - } - seed.setChi2(seed.getChi2() + predChi2); - if (!seed.o2::track::TrackParCov::update(trHit.positionTrackingFrame, trHit.covarianceTrackingFrame)) { - continue; - } + const int nCells = static_cast(currentCellSeed.size()); + if (mTaskArena->max_concurrency() <= 1) { + for (int iCell{0}; iCell < nCells; ++iCell) { + forCellNeighbours(PassMode::OnePass{}, iCell); + } + } else { + bounded_vector perCellCount(nCells + 1, 0, mMemoryPool.get()); + tbb::parallel_for( + tbb::blocked_range(0, nCells), + [&](const tbb::blocked_range& Cells) { + for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { + perCellCount[iCell] = forCellNeighbours(PassMode::TwoPassCount{}, iCell); + } + }); - seed.getClusters()[iLayer - 1] = neighbourCell.getFirstClusterIndex(); - seed.setLevel(neighbourCell.getLevel()); - seed.setFirstTrackletIndex(neighbourCell.getFirstTrackletIndex()); - seed.setSecondTrackletIndex(neighbourCell.getSecondTrackletIndex()); + std::exclusive_scan(perCellCount.begin(), perCellCount.end(), perCellCount.begin(), 0); + auto totalNeighbours{perCellCount.back()}; + if (totalNeighbours == 0) { + return; + } + updatedCellSeeds.resize(totalNeighbours); + updatedCellsIds.resize(totalNeighbours); - updatedCellSeeds[offset] = seed; - updatedCellsIds[offset++] = neighbourCellId; + tbb::parallel_for( + tbb::blocked_range(0, nCells), + [&](const tbb::blocked_range& Cells) { + for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { + int offset = perCellCount[iCell]; + if (offset == perCellCount[iCell + 1]) { + continue; + } + forCellNeighbours(PassMode::TwoPassInsert{}, iCell, offset); } - } - }); + }); + } }); #ifdef CA_DEBUG From 83271074fb3ab608e4247702b3e558c97f9c9482 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Tue, 3 Jun 2025 12:03:33 +0200 Subject: [PATCH 3/6] ITS: recover single threaded performance in computeLayerCells Signed-off-by: Felix Schlepper --- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 265 +++++++----------- 1 file changed, 107 insertions(+), 158 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index a530966d2d36e..7e6f43a3989dc 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -14,7 +14,6 @@ /// #include -#include #include #include #include @@ -284,183 +283,133 @@ void TrackerTraits::computeLayerCells(const int iteration) } mTaskArena->execute([&] { - tbb::parallel_for( - tbb::blocked_range(0, mTrkParams[iteration].CellsPerRoad()), - [&](const tbb::blocked_range& Layers) { - for (int iLayer = Layers.begin(); iLayer < Layers.end(); ++iLayer) { - - if (mTimeFrame->getTracklets()[iLayer + 1].empty() || - mTimeFrame->getTracklets()[iLayer].empty()) { - continue; - } - -#ifdef OPTIMISATION_OUTPUT - float resolution{o2::gpu::CAMath::Sqrt(0.5f * (mTrkParams[iteration].SystErrorZ2[iLayer] + mTrkParams[iteration].SystErrorZ2[iLayer + 1] + mTrkParams[iteration].SystErrorZ2[iLayer + 2] + mTrkParams[iteration].SystErrorY2[iLayer] + mTrkParams[iteration].SystErrorY2[iLayer + 1] + mTrkParams[iteration].SystErrorY2[iLayer + 2])) / mTrkParams[iteration].LayerResolution[iLayer]}; - resolution = resolution > 1.e-12 ? resolution : 1.f; -#endif - - // count number of cells found - const int currentLayerTrackletsNum{static_cast(mTimeFrame->getTracklets()[iLayer].size())}; - bounded_vector perTrackletCount(currentLayerTrackletsNum + 1, 0, mMemoryPool.get()); - tbb::parallel_for( - tbb::blocked_range(0, currentLayerTrackletsNum), - [&](const tbb::blocked_range& Tracklets) { - for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) { - const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]}; - const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{ - mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{ - mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]}; - - if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { - continue; - } - - int foundCells{0}; - for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) { - if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) { - break; - } - const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]}; - const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; + auto forTrackletCells = [&](auto Tag, int iLayer, bounded_vector& layerCells, int iTracklet, int offset = 0) -> int { + const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]}; + const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; + const int nextLayerFirstTrackletIndex{ + mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]}; + const int nextLayerLastTrackletIndex{ + mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]}; + + int foundCells{0}; + for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) { + if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) { + break; + } + const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]}; + const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; #ifdef OPTIMISATION_OUTPUT - bool good{mTimeFrame->getTrackletsLabel(iLayer)[iTracklet] == mTimeFrame->getTrackletsLabel(iLayer + 1)[iNextTracklet]}; - float signedDelta{currentTracklet.tanLambda - nextTracklet.tanLambda}; - off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl; + float resolution{o2::gpu::CAMath::Sqrt(0.5f * (mTrkParams[iteration].SystErrorZ2[iLayer] + mTrkParams[iteration].SystErrorZ2[iLayer + 1] + mTrkParams[iteration].SystErrorZ2[iLayer + 2] + mTrkParams[iteration].SystErrorY2[iLayer] + mTrkParams[iteration].SystErrorY2[iLayer + 1] + mTrkParams[iteration].SystErrorY2[iLayer + 2])) / mTrkParams[iteration].LayerResolution[iLayer]}; + resolution = resolution > 1.e-12 ? resolution : 1.f; + bool good{mTimeFrame->getTrackletsLabel(iLayer)[iTracklet] == mTimeFrame->getTrackletsLabel(iLayer + 1)[iNextTracklet]}; + float signedDelta{currentTracklet.tanLambda - nextTracklet.tanLambda}; + off << std::format("{}\t{:d}\t{}\t{}\t{}\t{}", iLayer, good, signedDelta, signedDelta / (mTrkParams[iteration].CellDeltaTanLambdaSigma), tanLambda, resolution) << std::endl; #endif - if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) { + if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) { - /// Track seed preparation. Clusters are numbered progressively from the innermost going outward. - const int clusId[3]{ - mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId, - mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId, - mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId}; - const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]]; - const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]]; - const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]]; - auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)}; + /// Track seed preparation. Clusters are numbered progressively from the innermost going outward. + const int clusId[3]{ + mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId, + mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId, + mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId}; + const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]]; + const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]]; + const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]]; + auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)}; - float chi2{0.f}; - bool good{false}; - for (int iC{2}; iC--;) { - const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]]; + float chi2{0.f}; + bool good{false}; + for (int iC{2}; iC--;) { + const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]]; - if (!track.rotate(trackingHit.alphaTrackingFrame)) { - break; - } + if (!track.rotate(trackingHit.alphaTrackingFrame)) { + break; + } - if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) { - break; - } + if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) { + break; + } - if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) { - break; - } + if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) { + break; + } - const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)}; - if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) { - break; - } + const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)}; + if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) { + break; + } - if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) { - break; - } + if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) { + break; + } - good = !iC; - chi2 += predChi2; - } - if (good) { - ++foundCells; - } - } - } - perTrackletCount[iTracklet] = foundCells; - } - }); + good = !iC; + chi2 += predChi2; + } + if (good) { + if constexpr (decltype(Tag)::value == PassMode::OnePass::value) { + layerCells.emplace_back(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2); + ++foundCells; + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassCount::value) { + ++foundCells; + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassInsert::value) { + layerCells[offset++] = CellSeed(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2); + } else { + static_assert(false, "Unknown mode!"); + } + } + } + } + return foundCells; + }; - // calculate offset table and check if any cells where found - std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0); - auto totalCells{perTrackletCount.back()}; - if (totalCells == 0) { + tbb::parallel_for( + tbb::blocked_range(0, mTrkParams[iteration].CellsPerRoad()), + [&](const tbb::blocked_range& Layers) { + for (int iLayer = Layers.begin(); iLayer < Layers.end(); ++iLayer) { + if (mTimeFrame->getTracklets()[iLayer + 1].empty() || + mTimeFrame->getTracklets()[iLayer].empty()) { continue; } - auto& layerCells = mTimeFrame->getCells()[iLayer]; - layerCells.resize(totalCells); - tbb::parallel_for( - tbb::blocked_range(0, currentLayerTrackletsNum), - [&](const tbb::blocked_range& Tracklets) { - for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) { - if (perTrackletCount[iTracklet] == perTrackletCount[iTracklet + 1]) { - continue; + auto& layerCells = mTimeFrame->getCells()[iLayer]; + const int currentLayerTrackletsNum{static_cast(mTimeFrame->getTracklets()[iLayer].size())}; + bounded_vector perTrackletCount(currentLayerTrackletsNum + 1, 0, mMemoryPool.get()); + if (mTaskArena->max_concurrency() <= 1) { + for (int iTracklet{0}; iTracklet < currentLayerTrackletsNum; ++iTracklet) { + perTrackletCount[iTracklet] = forTrackletCells(PassMode::OnePass{}, iLayer, layerCells, iTracklet); + } + std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0); + } else { + tbb::parallel_for( + tbb::blocked_range(0, currentLayerTrackletsNum), + [&](const tbb::blocked_range& Tracklets) { + for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) { + perTrackletCount[iTracklet] = forTrackletCells(PassMode::TwoPassCount{}, iLayer, layerCells, iTracklet); } + }); - const Tracklet& currentTracklet{mTimeFrame->getTracklets()[iLayer][iTracklet]}; - const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{ - mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{ - mTimeFrame->getTrackletsLookupTable()[iLayer][nextLayerClusterIndex + 1]}; - - int position = perTrackletCount[iTracklet]; - for (int iNextTracklet{nextLayerFirstTrackletIndex}; iNextTracklet < nextLayerLastTrackletIndex; ++iNextTracklet) { - if (mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet].firstClusterIndex != nextLayerClusterIndex) { - break; - } - const Tracklet& nextTracklet{mTimeFrame->getTracklets()[iLayer + 1][iNextTracklet]}; - const float deltaTanLambda{std::abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; - - if (deltaTanLambda / mTrkParams[iteration].CellDeltaTanLambdaSigma < mTrkParams[iteration].NSigmaCut) { - - /// Track seed preparation. Clusters are numbered progressively from the innermost going outward. - const int clusId[3]{ - mTimeFrame->getClusters()[iLayer][currentTracklet.firstClusterIndex].clusterId, - mTimeFrame->getClusters()[iLayer + 1][nextTracklet.firstClusterIndex].clusterId, - mTimeFrame->getClusters()[iLayer + 2][nextTracklet.secondClusterIndex].clusterId}; - const auto& cluster1_glo = mTimeFrame->getUnsortedClusters()[iLayer][clusId[0]]; - const auto& cluster2_glo = mTimeFrame->getUnsortedClusters()[iLayer + 1][clusId[1]]; - const auto& cluster3_tf = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + 2)[clusId[2]]; - auto track{buildTrackSeed(cluster1_glo, cluster2_glo, cluster3_tf)}; - - float chi2{0.f}; - bool good{false}; - for (int iC{2}; iC--;) { - const TrackingFrameInfo& trackingHit = mTimeFrame->getTrackingFrameInfoOnLayer(iLayer + iC)[clusId[iC]]; - - if (!track.rotate(trackingHit.alphaTrackingFrame)) { - break; - } - - if (!track.propagateTo(trackingHit.xTrackingFrame, getBz())) { - break; - } - - if (!track.correctForMaterial(mTrkParams[0].LayerxX0[iLayer + iC], mTrkParams[0].LayerxX0[iLayer] * constants::Radl * constants::Rho, true)) { - break; - } - - const auto predChi2{track.getPredictedChi2Quiet(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)}; - if (!iC && predChi2 > mTrkParams[iteration].MaxChi2ClusterAttachment) { - break; - } - - if (!track.o2::track::TrackParCov::update(trackingHit.positionTrackingFrame, trackingHit.covarianceTrackingFrame)) { - break; - } - - good = !iC; - chi2 += predChi2; - } - if (good) { - layerCells[position++] = CellSeed(iLayer, clusId[0], clusId[1], clusId[2], iTracklet, iNextTracklet, track, chi2); - } + std::exclusive_scan(perTrackletCount.begin(), perTrackletCount.end(), perTrackletCount.begin(), 0); + auto totalCells{perTrackletCount.back()}; + if (totalCells == 0) { + continue; + } + layerCells.resize(totalCells); + + tbb::parallel_for( + tbb::blocked_range(0, currentLayerTrackletsNum), + [&](const tbb::blocked_range& Tracklets) { + for (int iTracklet = Tracklets.begin(); iTracklet < Tracklets.end(); ++iTracklet) { + int offset = perTrackletCount[iTracklet]; + if (offset == perTrackletCount[iTracklet + 1]) { + continue; } + forTrackletCells(PassMode::TwoPassInsert{}, iLayer, layerCells, iTracklet, offset); } - } - }); + }); + } if (iLayer > 0) { auto& lut = mTimeFrame->getCellsLookupTable()[iLayer - 1]; From 250001fe88f8b7f8c4d13520d1a5c2548d5dc54e Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Tue, 3 Jun 2025 13:35:29 +0200 Subject: [PATCH 4/6] ITS: recover single threaded performance in findCellsNeighbours Signed-off-by: Felix Schlepper --- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 186 +++++++++--------- 1 file changed, 95 insertions(+), 91 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 7e6f43a3989dc..a06c3b2f485ec 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -444,128 +444,132 @@ void TrackerTraits::findCellsNeighbours(const int iteration) #ifdef OPTIMISATION_OUTPUT std::ofstream off(std::format("cellneighs{}.txt", iteration)); #endif - for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrame->getCells()[iLayer + 1].size())}; - deepVectorClear(mTimeFrame->getCellsNeighbours()[iLayer]); - deepVectorClear(mTimeFrame->getCellsNeighboursLUT()[iLayer]); - if (mTimeFrame->getCells()[iLayer + 1].empty() || - mTimeFrame->getCellsLookupTable()[iLayer].empty()) { - continue; - } - mTaskArena->execute([&] { - int layerCellsNum{static_cast(mTimeFrame->getCells()[iLayer].size())}; + struct Neighbor { + int cell{-1}, nextCell{-1}, level{-1}; + }; - bounded_vector perCellCount(layerCellsNum + 1, 0, mMemoryPool.get()); - tbb::parallel_for( - tbb::blocked_range(0, layerCellsNum), - [&](const tbb::blocked_range& Cells) { - for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { - const auto& currentCellSeed{mTimeFrame->getCells()[iLayer][iCell]}; - const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()}; - const int nextLayerFirstCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex]}; - const int nextLayerLastCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex + 1]}; - - int foundNextCells{0}; - for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) { - auto nextCellSeed{mTimeFrame->getCells()[iLayer + 1][iNextCell]}; /// copy - if (nextCellSeed.getFirstTrackletIndex() != nextLayerTrackletIndex) { - break; - } + mTaskArena->execute([&] { + for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { + deepVectorClear(mTimeFrame->getCellsNeighbours()[iLayer]); + deepVectorClear(mTimeFrame->getCellsNeighboursLUT()[iLayer]); + if (mTimeFrame->getCells()[iLayer + 1].empty() || + mTimeFrame->getCellsLookupTable()[iLayer].empty()) { + continue; + } - if (!nextCellSeed.rotate(currentCellSeed.getAlpha()) || - !nextCellSeed.propagateTo(currentCellSeed.getX(), getBz())) { - continue; - } - float chi2 = currentCellSeed.getPredictedChi2(nextCellSeed); /// TODO: switch to the chi2 wrt cluster to avoid correlation + int nCells{static_cast(mTimeFrame->getCells()[iLayer].size())}; + bounded_vector cellsNeighbours(mMemoryPool.get()); + + auto forCellNeighbour = [&](auto Tag, int iCell, int offset = 0) -> int { + const auto& currentCellSeed{mTimeFrame->getCells()[iLayer][iCell]}; + const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()}; + const int nextLayerFirstCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex]}; + const int nextLayerLastCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex + 1]}; + int foundNextCells{0}; + for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) { + auto nextCellSeed{mTimeFrame->getCells()[iLayer + 1][iNextCell]}; /// copy + if (nextCellSeed.getFirstTrackletIndex() != nextLayerTrackletIndex) { + break; + } + + if (!nextCellSeed.rotate(currentCellSeed.getAlpha()) || + !nextCellSeed.propagateTo(currentCellSeed.getX(), getBz())) { + continue; + } + float chi2 = currentCellSeed.getPredictedChi2(nextCellSeed); /// TODO: switch to the chi2 wrt cluster to avoid correlation #ifdef OPTIMISATION_OUTPUT - bool good{mTimeFrame->getCellsLabel(iLayer)[iCell] == mTimeFrame->getCellsLabel(iLayer + 1)[iNextCell]}; - off << std::format("{}\t{:d}\t{}", iLayer, good, chi2) << std::endl; + bool good{mTimeFrame->getCellsLabel(iLayer)[iCell] == mTimeFrame->getCellsLabel(iLayer + 1)[iNextCell]}; + off << std::format("{}\t{:d}\t{}", iLayer, good, chi2) << std::endl; #endif - if (chi2 > mTrkParams[0].MaxChi2ClusterAttachment) { - continue; - } - ++foundNextCells; - } - perCellCount[iCell] = foundNextCells; + if (chi2 > mTrkParams[0].MaxChi2ClusterAttachment) { + continue; } - }); - - std::exclusive_scan(perCellCount.begin(), perCellCount.end(), perCellCount.begin(), 0); - int totalCellNeighbours = perCellCount.back(); - if (totalCellNeighbours == 0) { - deepVectorClear(mTimeFrame->getCellsNeighbours()[iLayer]); - return; - } - struct Neighbor { - int cell{-1}, nextCell{-1}, level{-1}; + if constexpr (decltype(Tag)::value == PassMode::OnePass::value) { + cellsNeighbours.emplace_back(iCell, iNextCell, currentCellSeed.getLevel() + 1); + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassCount::value) { + ++foundNextCells; + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassInsert::value) { + cellsNeighbours[offset++] = {iCell, iNextCell, currentCellSeed.getLevel() + 1}; + } else { + static_assert(false, "Unknown mode!"); + } + } + return foundNextCells; }; - bounded_vector cellsNeighbours(mMemoryPool.get()); - cellsNeighbours.resize(totalCellNeighbours); - tbb::parallel_for( - tbb::blocked_range(0, layerCellsNum), - [&](const tbb::blocked_range& Cells) { - for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { - if (perCellCount[iCell] == perCellCount[iCell + 1]) { - continue; + if (mTaskArena->max_concurrency() <= 1) { + for (int iCell{0}; iCell < nCells; ++iCell) { + forCellNeighbour(PassMode::OnePass{}, iCell); + } + } else { + bounded_vector perCellCount(nCells + 1, 0, mMemoryPool.get()); + tbb::parallel_for( + tbb::blocked_range(0, nCells), + [&](const tbb::blocked_range& Cells) { + for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { + perCellCount[iCell] = forCellNeighbour(PassMode::TwoPassCount{}, iCell); } - const auto& currentCellSeed{mTimeFrame->getCells()[iLayer][iCell]}; - const int nextLayerTrackletIndex{currentCellSeed.getSecondTrackletIndex()}; - const int nextLayerFirstCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex]}; - const int nextLayerLastCellIndex{mTimeFrame->getCellsLookupTable()[iLayer][nextLayerTrackletIndex + 1]}; - - int position = perCellCount[iCell]; - for (int iNextCell{nextLayerFirstCellIndex}; iNextCell < nextLayerLastCellIndex; ++iNextCell) { - auto nextCellSeed{mTimeFrame->getCells()[iLayer + 1][iNextCell]}; /// copy - if (nextCellSeed.getFirstTrackletIndex() != nextLayerTrackletIndex) { - break; - } + }); - if (!nextCellSeed.rotate(currentCellSeed.getAlpha()) || - !nextCellSeed.propagateTo(currentCellSeed.getX(), getBz())) { - continue; - } - - float chi2 = currentCellSeed.getPredictedChi2(nextCellSeed); /// TODO: switch to the chi2 wrt cluster to avoid correlation - if (chi2 > mTrkParams[0].MaxChi2ClusterAttachment) { + std::exclusive_scan(perCellCount.begin(), perCellCount.end(), perCellCount.begin(), 0); + int totalCellNeighbours = perCellCount.back(); + if (totalCellNeighbours == 0) { + deepVectorClear(mTimeFrame->getCellsNeighbours()[iLayer]); + continue; + } + cellsNeighbours.resize(totalCellNeighbours); + + tbb::parallel_for( + tbb::blocked_range(0, nCells), + [&](const tbb::blocked_range& Cells) { + for (int iCell = Cells.begin(); iCell < Cells.end(); ++iCell) { + int offset = perCellCount[iCell]; + if (offset == perCellCount[iCell + 1]) { continue; } - - cellsNeighbours[position++] = {iCell, iNextCell, currentCellSeed.getLevel() + 1}; + forCellNeighbour(PassMode::TwoPassInsert{}, iCell, offset); } - } - }); + }); + } + + if (cellsNeighbours.empty()) { + continue; + } tbb::parallel_sort(cellsNeighbours.begin(), cellsNeighbours.end(), [](const auto& a, const auto& b) { return a.nextCell < b.nextCell; }); auto& cellsNeighbourLUT = mTimeFrame->getCellsNeighboursLUT()[iLayer]; - cellsNeighbourLUT.assign(nextLayerCellsNum, 0); + cellsNeighbourLUT.assign(mTimeFrame->getCells()[iLayer + 1].size(), 0); for (const auto& neigh : cellsNeighbours) { ++cellsNeighbourLUT[neigh.nextCell]; } std::inclusive_scan(cellsNeighbourLUT.begin(), cellsNeighbourLUT.end(), cellsNeighbourLUT.begin()); - mTimeFrame->getCellsNeighbours()[iLayer].reserve(totalCellNeighbours); + mTimeFrame->getCellsNeighbours()[iLayer].reserve(cellsNeighbours.size()); std::ranges::transform(cellsNeighbours, std::back_inserter(mTimeFrame->getCellsNeighbours()[iLayer]), [](const auto& neigh) { return neigh.cell; }); auto it = cellsNeighbours.begin(); - while (it != cellsNeighbours.end()) { - const int current_nextCell = it->nextCell; - auto group_end = std::find_if_not(it, cellsNeighbours.end(), - [current_nextCell](const auto& nb) { return nb.nextCell == current_nextCell; }); - const auto max_level_it = std::max_element(it, group_end, - [](const auto& a, const auto& b) { return a.level < b.level; }); - mTimeFrame->getCells()[iLayer + 1][current_nextCell].setLevel(max_level_it->level); - it = group_end; + int current = it->nextCell; + int maxLvl = it->level; + ++it; + for (; it != cellsNeighbours.end(); ++it) { + if (it->nextCell == current) { + maxLvl = std::max(maxLvl, it->level); + } else { + mTimeFrame->getCells()[iLayer + 1][current].setLevel(maxLvl); + current = it->nextCell; + maxLvl = it->level; + } } - }); - } + mTimeFrame->getCells()[iLayer + 1][current].setLevel(maxLvl); + } + }); } template From ef91595ff55f9ee4b2204c7e011a472d9001f7fc Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Wed, 4 Jun 2025 15:36:08 +0200 Subject: [PATCH 5/6] ITS: recover single threaded performance in findRoads Signed-off-by: Felix Schlepper --- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 116 ++++++++++-------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index a06c3b2f485ec..2cc9a8bd1c312 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -761,65 +761,73 @@ void TrackerTraits::findRoads(const int iteration) bounded_vector tracks(mMemoryPool.get()); mTaskArena->execute([&] { - bounded_vector perSeedCount(trackSeeds.size() + 1, 0, mMemoryPool.get()); - tbb::parallel_for( - tbb::blocked_range(0, (int)trackSeeds.size()), - [&](const tbb::blocked_range& Seeds) { - for (int iSeed = Seeds.begin(); iSeed < Seeds.end(); ++iSeed) { - const CellSeed& seed{trackSeeds[iSeed]}; - TrackITSExt temporaryTrack{seed}; - temporaryTrack.resetCovariance(); - temporaryTrack.setChi2(0); - for (int iL{0}; iL < 7; ++iL) { - temporaryTrack.setExternalClusterIndex(iL, seed.getCluster(iL), seed.getCluster(iL) != constants::UnusedIndex); - } + auto forSeed = [&](auto Tag, int iSeed, int offset = 0) { + const CellSeed& seed{trackSeeds[iSeed]}; + TrackITSExt temporaryTrack{seed}; + temporaryTrack.resetCovariance(); + temporaryTrack.setChi2(0); + for (int iL{0}; iL < 7; ++iL) { + temporaryTrack.setExternalClusterIndex(iL, seed.getCluster(iL), seed.getCluster(iL) != constants::UnusedIndex); + } - bool fitSuccess = fitTrack(temporaryTrack, 0, mTrkParams[0].NLayers, 1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF); - if (!fitSuccess) { - continue; - } - temporaryTrack.getParamOut() = temporaryTrack.getParamIn(); - temporaryTrack.resetCovariance(); - temporaryTrack.setChi2(0); - fitSuccess = fitTrack(temporaryTrack, mTrkParams[0].NLayers - 1, -1, -1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF, 50.f); - if (!fitSuccess || temporaryTrack.getPt() < mTrkParams[iteration].MinPt[mTrkParams[iteration].NLayers - temporaryTrack.getNClusters()]) { - continue; - } - ++perSeedCount[iSeed]; - } - }); - std::exclusive_scan(perSeedCount.begin(), perSeedCount.end(), perSeedCount.begin(), 0); - auto totalTracks{perSeedCount.back()}; - if (totalTracks == 0) { - return; - } - tracks.resize(totalTracks); + bool fitSuccess = fitTrack(temporaryTrack, 0, mTrkParams[0].NLayers, 1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF); + if (!fitSuccess) { + return 0; + } - tbb::parallel_for( - tbb::blocked_range(0, (int)trackSeeds.size()), - [&](const tbb::blocked_range& Seeds) { - for (int iSeed = Seeds.begin(); iSeed < Seeds.end(); ++iSeed) { - if (perSeedCount[iSeed] == perSeedCount[iSeed + 1]) { - continue; - } - const CellSeed& seed{trackSeeds[iSeed]}; - auto& trk = tracks[perSeedCount[iSeed]] = TrackITSExt(seed); - trk.resetCovariance(); - trk.setChi2(0); - for (int iL{0}; iL < 7; ++iL) { - trk.setExternalClusterIndex(iL, seed.getCluster(iL), seed.getCluster(iL) != constants::UnusedIndex); + temporaryTrack.getParamOut() = temporaryTrack.getParamIn(); + temporaryTrack.resetCovariance(); + temporaryTrack.setChi2(0); + fitSuccess = fitTrack(temporaryTrack, mTrkParams[0].NLayers - 1, -1, -1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF, 50.f); + if (!fitSuccess || temporaryTrack.getPt() < mTrkParams[iteration].MinPt[mTrkParams[iteration].NLayers - temporaryTrack.getNClusters()]) { + return 0; + } + + if constexpr (decltype(Tag)::value == PassMode::OnePass::value) { + tracks.push_back(temporaryTrack); + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassCount::value) { + // nothing to do + } else if constexpr (decltype(Tag)::value == PassMode::TwoPassInsert::value) { + tracks[offset] = temporaryTrack; + } else { + static_assert(false, "Unknown mode!"); + } + return 1; + }; + + const int nSeeds = static_cast(trackSeeds.size()); + if (mTaskArena->max_concurrency() <= 1) { + for (int iSeed{0}; iSeed < nSeeds; ++iSeed) { + forSeed(PassMode::OnePass{}, iSeed); + } + } else { + bounded_vector perSeedCount(nSeeds + 1, 0, mMemoryPool.get()); + tbb::parallel_for( + tbb::blocked_range(0, nSeeds), + [&](const tbb::blocked_range& Seeds) { + for (int iSeed = Seeds.begin(); iSeed < Seeds.end(); ++iSeed) { + perSeedCount[iSeed] = forSeed(PassMode::TwoPassCount{}, iSeed); } + }); - bool fitSuccess = fitTrack(trk, 0, mTrkParams[0].NLayers, 1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF); - if (!fitSuccess) { - continue; + std::exclusive_scan(perSeedCount.begin(), perSeedCount.end(), perSeedCount.begin(), 0); + auto totalTracks{perSeedCount.back()}; + if (totalTracks == 0) { + return; + } + tracks.resize(totalTracks); + + tbb::parallel_for( + tbb::blocked_range(0, nSeeds), + [&](const tbb::blocked_range& Seeds) { + for (int iSeed = Seeds.begin(); iSeed < Seeds.end(); ++iSeed) { + if (perSeedCount[iSeed] == perSeedCount[iSeed + 1]) { + continue; + } + forSeed(PassMode::TwoPassInsert{}, iSeed, perSeedCount[iSeed]); } - trk.getParamOut() = trk.getParamIn(); - trk.resetCovariance(); - trk.setChi2(0); - fitTrack(trk, mTrkParams[0].NLayers - 1, -1, -1, mTrkParams[0].MaxChi2ClusterAttachment, mTrkParams[0].MaxChi2NDF, 50.f); - } - }); + }); + } deepVectorClear(trackSeeds); tbb::parallel_sort(tracks.begin(), tracks.end(), [](const auto& a, const auto& b) { From 92d5d19b2d5d93523fa99082007a4350972cf744 Mon Sep 17 00:00:00 2001 From: Felix Schlepper Date: Sun, 8 Jun 2025 23:45:32 +0200 Subject: [PATCH 6/6] ITS: just use one arena call in computeLayerTracklets Signed-off-by: Felix Schlepper --- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 67 ++++++++----------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index 2cc9a8bd1c312..8dd6b9870115c 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -71,14 +71,15 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROF gsl::span diamondSpan(&diamondVert, 1); int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; int endROF{o2::gpu::GPUCommonMath::Min(mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrame->getNrof(), mTimeFrame->getNrof())}; - for (int rof0{startROF}; rof0 < endROF; ++rof0) { - gsl::span primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : mTimeFrame->getPrimaryVertices(rof0); - const int startVtx{iVertex >= 0 ? iVertex : 0}; - const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); - int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); - mTaskArena->execute([&] { + mTaskArena->execute([&] { + for (int rof0{startROF}; rof0 < endROF; ++rof0) { + gsl::span primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : mTimeFrame->getPrimaryVertices(rof0); + const int startVtx{iVertex >= 0 ? iVertex : 0}; + const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); + int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); + tbb::parallel_for( tbb::blocked_range(0, mTrkParams[iteration].TrackletsPerRoad()), [&](const tbb::blocked_range& Layers) { @@ -197,49 +198,39 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROF } } }); - }); - } - - auto sortTracklets = [](const Tracklet& a, const Tracklet& b) -> bool { - return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); - }; - auto equalTracklets = [](const Tracklet& a, const Tracklet& b) -> bool { - return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; - }; + } - mTaskArena->execute([&] { tbb::parallel_for( - tbb::blocked_range(0, mTrkParams[iteration].CellsPerRoad()), + tbb::blocked_range(0, mTrkParams[iteration].TrackletsPerRoad()), [&](const tbb::blocked_range& Layers) { for (int iLayer = Layers.begin(); iLayer < Layers.end(); ++iLayer) { /// Sort tracklets - auto& trkl{mTimeFrame->getTracklets()[iLayer + 1]}; - tbb::parallel_sort(trkl.begin(), trkl.end(), sortTracklets); + auto& trkl{mTimeFrame->getTracklets()[iLayer]}; + tbb::parallel_sort(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) -> bool { + return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); + }); /// Remove duplicates - trkl.erase(std::unique(trkl.begin(), trkl.end(), equalTracklets), trkl.end()); + trkl.erase(std::unique(trkl.begin(), trkl.end(), [](const Tracklet& a, const Tracklet& b) -> bool { + return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; + }), + trkl.end()); trkl.shrink_to_fit(); - /// recalculate lut - auto& lut{mTimeFrame->getTrackletsLookupTable()[iLayer]}; - std::fill(lut.begin(), lut.end(), 0); - if (trkl.empty()) { - return; - } - for (const auto& tkl : trkl) { - lut[tkl.firstClusterIndex]++; + if (iLayer > 0) { /// recalculate lut + auto& lut{mTimeFrame->getTrackletsLookupTable()[iLayer - 1]}; + std::fill(lut.begin(), lut.end(), 0); + if (trkl.empty()) { + return; + } + for (const auto& tkl : trkl) { + lut[tkl.firstClusterIndex]++; + } + std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0); + lut.push_back(trkl.size()); } - std::exclusive_scan(lut.begin(), lut.end(), lut.begin(), 0); - lut.push_back(trkl.size()); } }); }); - /// Layer 0 is done outside the loop - // in-place deduplication - auto& trklt0 = mTimeFrame->getTracklets()[0]; - mTaskArena->execute([&] { tbb::parallel_sort(trklt0.begin(), trklt0.end(), sortTracklets); }); - trklt0.erase(std::unique(trklt0.begin(), trklt0.end(), equalTracklets), trklt0.end()); - trklt0.shrink_to_fit(); - /// Create tracklets labels if (mTimeFrame->hasMCinformation()) { for (int iLayer{0}; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) {