From dfb9fba1ab7b935d30a94487bdd0fdca27011274 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 13 Mar 2025 11:41:02 +0100 Subject: [PATCH] GPU TPC: Simplify host code for driving the extrapolation tracking --- GPU/GPUTracking/Base/GPUReconstruction.h | 2 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTracking.h | 4 +- .../Global/GPUChainTrackingSectorTracker.cxx | 116 +++++------------- 4 files changed, 33 insertions(+), 91 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index a0248180a5e2c..93310284d7564 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -200,7 +200,7 @@ class GPUReconstruction void SetOutputControl(void* ptr, size_t size); void SetInputControl(void* ptr, size_t size); GPUOutputControl& OutputControl() { return mOutputControl; } - int32_t NStreams() const { return mNStreams; } + uint32_t NStreams() const { return mNStreams; } const void* DeviceMemoryBase() const { return mDeviceMemoryBase; } RecoStepField GetRecoSteps() const { return mRecoSteps.steps; } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index c186f916891ba..6dcb6f1d7e514 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -260,7 +260,7 @@ bool GPUChainTracking::ValidateSettings() GPUError("configured max time bin exceeds 256 orbits"); return false; } - if ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) && std::max(GetProcessingSettings().nTPCClustererLanes + 1, GetProcessingSettings().nTPCClustererLanes * 2) + (GetProcessingSettings().doublePipeline ? 1 : 0) > mRec->NStreams()) { + if ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) && std::max(GetProcessingSettings().nTPCClustererLanes + 1, GetProcessingSettings().nTPCClustererLanes * 2) + (GetProcessingSettings().doublePipeline ? 1 : 0) > (int32_t)mRec->NStreams()) { GPUError("NStreams (%d) must be > nTPCClustererLanes (%d)", mRec->NStreams(), (int32_t)GetProcessingSettings().nTPCClustererLanes); return false; } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 194573981838e..492ee65d1c9c1 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -220,7 +220,7 @@ class GPUChainTracking : public GPUChain GPUChainTracking(GPUReconstruction* rec, uint32_t maxTPCHits = GPUCA_MAX_CLUSTERS, uint32_t maxTRDTracklets = GPUCA_MAX_TRD_TRACKLETS); - int32_t ExtrapolationTracking(uint32_t iSector, int32_t threadId, bool synchronizeOutput = true); + int32_t ExtrapolationTracking(uint32_t iSector, bool blocking); int32_t PrepareProfile(); int32_t DoProfile(); @@ -278,7 +278,6 @@ class GPUChainTracking : public GPUChain // Synchronization and Locks eventStruct* mEvents = nullptr; - volatile int32_t mSectorSelectorReady = 0; std::array mExtrapolationTrackingDone; std::vector mOutputQueue; @@ -299,6 +298,7 @@ class GPUChainTracking : public GPUChain void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType); void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); + uint32_t StreamForSector(uint32_t sector) const; std::mutex mMutexUpdateCalib; std::unique_ptr mPipelineFinalizationCtx; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 522ccbad47e59..dd71a797f2744 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -24,12 +24,18 @@ using namespace o2::gpu; -int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSector, int32_t threadId, bool synchronizeOutput) +uint32_t GPUChainTracking::StreamForSector(uint32_t sector) const { - runKernel({GetGridBlk(256, iSector % mRec->NStreams()), {iSector}}); - TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[iSector].MemoryResCommon(), iSector % mRec->NStreams()); - if (synchronizeOutput) { - SynchronizeStream(iSector % mRec->NStreams()); + return sector % mRec->NStreams(); +} + +int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSector, bool blocking) +{ + const uint32_t stream = StreamForSector(iSector); + runKernel({GetGridBlk(256, stream), {iSector}}); + TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[iSector].MemoryResCommon(), stream); + if (blocking) { + SynchronizeStream(stream); } return (0); } @@ -153,7 +159,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t iSector) { GPUTPCTracker& trk = processors()->tpcTrackers[iSector]; GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSector] : trk; - int32_t useStream = (iSector % mRec->NStreams()); + int32_t useStream = StreamForSector(iSector); if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Creating Sector Data (Sector %d)", iSector); @@ -234,102 +240,38 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() } if (doGPU || GetProcessingSettings().debugLevel >= 1) { - if (doGPU) { - ReleaseEvent(mEvents->init); - } - - mSectorSelectorReady = 0; - - std::array transferRunning; - transferRunning.fill(true); - if (doGPU && !(GetRecoStepsGPU() & RecoStep::TPCMerging)) { // TODO: This seems pretty obsolete code path, can probably be removed. - if (param().rec.tpc.extrapolationTracking) { - mExtrapolationTrackingDone.fill(0); - } - - uint32_t tmpSector = 0; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Transfering Tracks from GPU to Host"); - } - - if (tmpSector == iSector) { - SynchronizeEvents(&mEvents->sector[iSector]); - } - while (tmpSector < NSECTORS && (tmpSector == iSector || IsEventDone(&mEvents->sector[tmpSector]))) { - ReleaseEvent(mEvents->sector[tmpSector]); - if (*processors()->tpcTrackers[tmpSector].NTracks() > 0) { - TransferMemoryResourceLinkToHost(RecoStep::TPCSectorTracking, processors()->tpcTrackers[tmpSector].MemoryResOutput(), streamMap[tmpSector], &mEvents->sector[tmpSector]); - } else { - transferRunning[tmpSector] = false; - } - tmpSector++; - } - - if (GetProcessingSettings().keepAllMemory) { - TransferMemoryResourcesToHost(RecoStep::TPCSectorTracking, &processors()->tpcTrackers[iSector], -1, true); - } - - if (transferRunning[iSector]) { - SynchronizeEvents(&mEvents->sector[iSector]); - } - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Tracks Transfered: %d / %d", *processors()->tpcTrackers[iSector].NTracks(), *processors()->tpcTrackers[iSector].NTrackHits()); - } - - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Data ready for sector %d", iSector); - } - mSectorSelectorReady = iSector; - - if (param().rec.tpc.extrapolationTracking) { - for (uint32_t tmpSector2a = 0; tmpSector2a <= iSector; tmpSector2a++) { - uint32_t tmpSector2 = GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorOrder(tmpSector2a); - uint32_t sectorLeft, sectorRight; - GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector2, sectorLeft, sectorRight); - - if (tmpSector2 <= iSector && sectorLeft <= iSector && sectorRight <= iSector && mExtrapolationTrackingDone[tmpSector2] == 0) { - ExtrapolationTracking(tmpSector2, 0); - mExtrapolationTrackingDone[tmpSector2] = 1; - } - } - } - } - } if (param().rec.tpc.extrapolationTracking) { std::vector blocking(NSECTORS * mRec->NStreams()); - for (int32_t i = 0; i < NSECTORS; i++) { - for (int32_t j = 0; j < mRec->NStreams(); j++) { - blocking[i * mRec->NStreams() + j] = i % mRec->NStreams() == j; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + for (uint32_t iStream = 0; iStream < mRec->NStreams(); iStream++) { + blocking[iSector * mRec->NStreams() + iStream] = StreamForSector(iSector) == iStream; } } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { uint32_t tmpSector = GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorOrder(iSector); - if (!(doGPU && !(GetRecoStepsGPU() & RecoStep::TPCMerging))) { - uint32_t sectorLeft, sectorRight; - GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector, sectorLeft, sectorRight); - if (doGPU && !blocking[tmpSector * mRec->NStreams() + sectorLeft % mRec->NStreams()]) { - StreamWaitForEvents(tmpSector % mRec->NStreams(), &mEvents->sector[sectorLeft]); - blocking[tmpSector * mRec->NStreams() + sectorLeft % mRec->NStreams()] = true; - } - if (doGPU && !blocking[tmpSector * mRec->NStreams() + sectorRight % mRec->NStreams()]) { - StreamWaitForEvents(tmpSector % mRec->NStreams(), &mEvents->sector[sectorRight]); - blocking[tmpSector * mRec->NStreams() + sectorRight % mRec->NStreams()] = true; - } + uint32_t sectorLeft, sectorRight; + GPUTPCExtrapolationTracking::ExtrapolationTrackingSectorLeftRight(tmpSector, sectorLeft, sectorRight); + if (doGPU && !blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorLeft)]) { + StreamWaitForEvents(StreamForSector(tmpSector), &mEvents->sector[sectorLeft]); + blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorLeft)] = true; + } + if (doGPU && !blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorRight)]) { + StreamWaitForEvents(StreamForSector(tmpSector), &mEvents->sector[sectorRight]); + blocking[tmpSector * mRec->NStreams() + StreamForSector(sectorRight)] = true; } - ExtrapolationTracking(tmpSector, 0, false); + ExtrapolationTracking(tmpSector, false); } } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - if (doGPU && transferRunning[iSector]) { + if (doGPU) { + ReleaseEvent(mEvents->init); + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { ReleaseEvent(mEvents->sector[iSector]); } } } else { - mSectorSelectorReady = NSECTORS; mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t iSector) { if (param().rec.tpc.extrapolationTracking) { - ExtrapolationTracking(iSector, 0); + ExtrapolationTracking(iSector, true); } }); mRec->SetNActiveThreadsOuterLoop(1);