From 2ca3ec9412446bade57b4b0225ef81e132f32889 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 10:12:57 +0200 Subject: [PATCH 1/7] GPU TPC: Remove some obsolete code and track members (leftover from Run 2 by Sergey and totally forgotten), which were wasting performance --- GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h | 9 -------- GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 21 +------------------ 2 files changed, 1 insertion(+), 29 deletions(-) diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h b/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h index 578fe1eeb4ca7..6ef2ed2ede668 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h @@ -41,9 +41,6 @@ class GPUTPCGMMergedTrack { return mAlpha; } - GPUd() float LastX() const { return mLastX; } - GPUd() float LastY() const { return mLastY; } - GPUd() float LastZ() const { return mLastZ; } GPUd() bool OK() const { return mFlags & 0x01; } GPUd() bool Looper() const { return mFlags & 0x02; } GPUd() bool CSide() const { return mFlags & 0x04; } @@ -55,9 +52,6 @@ class GPUTPCGMMergedTrack GPUd() void SetFirstClusterRef(int32_t v) { mFirstClusterRef = v; } GPUd() void SetParam(const GPUTPCGMTrackParam& v) { mParam = v; } GPUd() void SetAlpha(float v) { mAlpha = v; } - GPUd() void SetLastX(float v) { mLastX = v; } - GPUd() void SetLastY(float v) { mLastY = v; } - GPUd() void SetLastZ(float v) { mLastZ = v; } GPUd() void SetOK(bool v) { if (v) { @@ -110,9 +104,6 @@ class GPUTPCGMMergedTrack gputpcgmmergertypes::GPUTPCOuterParam mOuterParam; //* outer param float mAlpha; //* alpha angle - float mLastX; //* outer X - float mLastY; //* outer Y - float mLastZ; //* outer Z uint32_t mFirstClusterRef; //* index of the first track cluster in corresponding cluster arrays // TODO: Change to 8 bit uint32_t mNClusters; //* number of track clusters diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx index 0d8547263207b..366f75cb05e56 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx @@ -1143,26 +1143,7 @@ GPUd() void GPUTPCGMTrackParam::RefitTrack(GPUTPCGMMergedTrack& GPUrestrict() tr track.Param() = t; track.Alpha() = Alpha; - if (track.OK()) { - int32_t ind = track.FirstClusterRef(); - const GPUParam& GPUrestrict() param = merger->Param(); - float alphaa = param.Alpha(merger->Clusters()[ind].sector); - float xx, yy, zz; - if (merger->Param().par.earlyTpcTransform) { - xx = merger->ClustersXYZ()[ind].x; - yy = merger->ClustersXYZ()[ind].y; - zz = merger->ClustersXYZ()[ind].z - track.Param().GetTZOffset(); - } else { - const ClusterNative& GPUrestrict() cl = merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[merger->Clusters()[ind].num]; - merger->GetConstantMem()->calibObjects.fastTransformHelper->Transform(merger->Clusters()[ind].sector, merger->Clusters()[ind].row, cl.getPad(), cl.getTime(), xx, yy, zz, track.Param().GetTZOffset()); - } - float sinA, cosA; - CAMath::SinCos(alphaa - track.Alpha(), sinA, cosA); - track.SetLastX(xx * cosA - yy * sinA); - track.SetLastY(xx * sinA + yy * cosA); - track.SetLastZ(zz); - // merger->DebugRefitMergedTrack(track); - } + // if (track.OK()) merger->DebugRefitMergedTrack(track); } GPUd() void GPUTPCGMTrackParam::Rotate(float alpha) From 625d88f7962a6643f202cb9d975e89c19c0e255d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 10:17:56 +0200 Subject: [PATCH 2/7] GPU TPC: Rename some variables with misleading name --- ...GPUReconstructionCUDAKernelsSpecialize.inc | 4 +-- .../DataCompression/GPUTPCCompression.cxx | 2 +- .../GPUTPCCompressionKernels.cxx | 10 +++--- .../GPUChainTrackingDebugAndProfiling.cxx | 4 +-- .../Global/GPUChainTrackingMerger.cxx | 12 +++---- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 32 +++++++++---------- GPU/GPUTracking/Merger/GPUTPCGMMerger.h | 4 +-- GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx | 14 ++++---- GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx | 2 +- GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx | 2 +- .../Merger/GPUTPCGMTracksToTPCSeeds.cxx | 6 ++-- .../Merger/GPUTPCGlobalDebugSortKernels.cxx | 4 +-- 12 files changed, 48 insertions(+), 48 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index 8796f063abdc5..d3dd561dcea2f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -109,13 +109,13 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed template <> inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NOutputTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); } template <> diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx index 82834a694d0ba..ec1636dfe7f59 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx @@ -125,7 +125,7 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io) mMaxClusterFactorBase1024 = mMaxClusters > 100000000 ? mRec->MemoryScalers()->NTPCUnattachedHitsBase1024(mRec->GetParam().rec.tpc.rejectionStrategy) : 1024; mMaxClustersInCache = mMaxClusters * mMaxClusterFactorBase1024 / 1024; mMaxTrackClusters = mRec->GetConstantMem().tpcMerger.NOutputTrackClusters(); // TODO: Why is this not using ioPtrs? Could remove GPUConstantMem.h include - mMaxTracks = mRec->GetConstantMem().tpcMerger.NOutputTracks(); + mMaxTracks = mRec->GetConstantMem().tpcMerger.NMergedTracks(); if (mMaxClusters % 16) { mMaxClusters += 16 - (mMaxClusters % 16); } diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx index bba97e9eace9b..73b195e8f4fe4 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx @@ -201,7 +201,7 @@ GPUdii() void GPUTPCCompressionKernels::ThreadclusterOffset[iSector][iRow]; - const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024; + const uint32_t idOffsetOut = clusters->clusterOffset[iSector][iRow] * compressor.mMaxClusterFactorBase1024 / 1024; // 32 bit enough for number of clusters per row * 1024 const uint32_t idOffsetOutMax = ((const uint32_t*)clusters->clusterOffset[iSector])[iRow + 1] * compressor.mMaxClusterFactorBase1024 / 1024; // Array out of bounds access is ok, since it goes to the correct nClustersTotal if (iThread == nThreads - 1) { smem.nCount = 0; @@ -214,7 +214,7 @@ GPUdii() void GPUTPCCompressionKernels::Thread(clusters->nClusters[iSector][iRow]); for (uint32_t i = iThread; i < nn + nThreads; i += nThreads) { const int32_t idx = idOffset + i; - int32_t cidx = 0; + int32_t storeCluster = 0; do { if (i >= clusters->nClusters[iSector][iRow]) { break; @@ -239,13 +239,13 @@ GPUdii() void GPUTPCCompressionKernels::ThreadtpcTrackers[i].NTrackHits(), processors()->tpcTrackers[i].NMaxTrackHits()); } addToMap("TPC Clusterer Clusters", usageMap, mRec->MemoryScalers()->nTPCHits, mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits)); - addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NOutputTracks(), processors()->tpcMerger.NMaxTracks()); + addToMap("TPC Tracks", usageMap, processors()->tpcMerger.NMergedTracks(), processors()->tpcMerger.NMaxTracks()); addToMap("TPC TrackHits", usageMap, processors()->tpcMerger.NOutputTrackClusters(), processors()->tpcMerger.NMaxOutputTrackClusters()); if (mRec->GetProcessingSettings().createO2Output) { @@ -181,7 +181,7 @@ void GPUChainTracking::PrintMemoryRelations() GPUInfo("MEMREL SectorTracks NCl %d NTrk %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTracks()); GPUInfo("MEMREL SectorTrackHits NCl %d NTrkH %d", processors()->tpcTrackers[i].NHitsTotal(), *processors()->tpcTrackers[i].NTrackHits()); } - GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTracks()); + GPUInfo("MEMREL Tracks NCl %d NTrk %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NMergedTracks()); GPUInfo("MEMREL TrackHitss NCl %d NTrkH %d", processors()->tpcMerger.NMaxClusters(), processors()->tpcMerger.NOutputTrackClusters()); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index 6e86be03e7950..bd1fa7796dadf 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -220,7 +220,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) mOutputQueue.clear(); } - runKernel(doGPU ? GetGrid(Merger.NOutputTracks(), 0) : GetGridAuto(0), mergerSortTracks ? 1 : 0); + runKernel(doGPU ? GetGrid(Merger.NMergedTracks(), 0) : GetGridAuto(0), mergerSortTracks ? 1 : 0); if (param().rec.tpc.retryRefit == 1) { runKernel(GetGridAuto(0), -1); } @@ -233,7 +233,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridAuto(0, deviceType)); runKernel(GetGridAuto(0, deviceType)); if (param().rec.tpc.mergeLoopersAfterburner) { - runKernel(doGPU ? GetGrid(Merger.NOutputTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); + runKernel(doGPU ? GetGrid(Merger.NMergedTracks(), 0, deviceType) : GetGridAuto(0, deviceType)); if (doGPU) { TransferMemoryResourceLinkToHost(RecoStep::TPCMerging, Merger.MemoryResMemory(), 0); SynchronizeStream(0); // TODO: could probably synchronize on an event after runKernel @@ -255,10 +255,10 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) throw std::runtime_error("QA Scratch buffer exceeded"); } } - GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracks(), MergerShadowAll.OutputTracks(), Merger.NOutputTracks() * sizeof(*Merger.OutputTracks()), outputStream, 0, nullptr, waitEvent); + GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracks(), MergerShadowAll.OutputTracks(), Merger.NMergedTracks() * sizeof(*Merger.OutputTracks()), outputStream, 0, nullptr, waitEvent); waitEvent = nullptr; if (param().dodEdxEnabled) { - GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracksdEdx(), MergerShadowAll.OutputTracksdEdx(), Merger.NOutputTracks() * sizeof(*Merger.OutputTracksdEdx()), outputStream, 0); + GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracksdEdx(), MergerShadowAll.OutputTracksdEdx(), Merger.NMergedTracks() * sizeof(*Merger.OutputTracksdEdx()), outputStream, 0); } GPUMemCpy(RecoStep::TPCMerging, Merger.Clusters(), MergerShadowAll.Clusters(), Merger.NOutputTrackClusters() * sizeof(*Merger.Clusters()), outputStream, 0); if (param().par.earlyTpcTransform) { @@ -326,7 +326,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) } mIOPtrs.mergedTracks = Merger.OutputTracks(); - mIOPtrs.nMergedTracks = Merger.NOutputTracks(); + mIOPtrs.nMergedTracks = Merger.NMergedTracks(); mIOPtrs.mergedTrackHits = Merger.Clusters(); mIOPtrs.mergedTrackHitsXYZ = Merger.ClustersXYZ(); mIOPtrs.nMergedTrackHits = Merger.NOutputTrackClusters(); @@ -340,7 +340,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) if (doGPU) { processorsShadow()->ioPtrs.mergedTracks = MergerShadow.OutputTracks(); - processorsShadow()->ioPtrs.nMergedTracks = Merger.NOutputTracks(); + processorsShadow()->ioPtrs.nMergedTracks = Merger.NMergedTracks(); processorsShadow()->ioPtrs.mergedTrackHits = MergerShadow.Clusters(); processorsShadow()->ioPtrs.mergedTrackHitsXYZ = MergerShadow.ClustersXYZ(); processorsShadow()->ioPtrs.nMergedTrackHits = Merger.NOutputTrackClusters(); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index d2aba503be6a6..e96bbeee774bf 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -415,7 +415,7 @@ int32_t GPUTPCGMMerger::CheckSectors() GPUd() void GPUTPCGMMerger::ClearTrackLinks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, bool output) { - const int32_t n = output ? mMemory->nOutputTracks : SectorTrackInfoLocalTotal(); + const int32_t n = output ? mMemory->nMergedTracks : SectorTrackInfoLocalTotal(); for (int32_t i = iBlock * nThreads + iThread; i < n; i += nThreads * nBlocks) { mTrackLinks[i] = -1; } @@ -1271,7 +1271,7 @@ GPUd() void GPUTPCGMMerger::MergeCEFill(const GPUTPCGMSectorTrack* track, const GPUd() void GPUTPCGMMerger::MergeCE(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { const ClusterNative* cls = Param().par.earlyTpcTransform ? nullptr : mConstantMem->ioPtrs.clustersNative->clustersLinear; - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { if (mOutputTracks[i].CSide() == 0 && mTrackLinks[i] >= 0) { if (mTrackLinks[mTrackLinks[i]] != (int32_t)i) { continue; @@ -1392,7 +1392,7 @@ GPUd() void GPUTPCGMMerger::MergeCE(int32_t nBlocks, int32_t nThreads, int32_t i } } - // for (int32_t i = 0;i < mMemory->nOutputTracks;i++) {if (mOutputTracks[i].CCE() == false) {mOutputTracks[i].SetNClusters(0);mOutputTracks[i].SetOK(false);}} //Remove all non-CE tracks + // for (int32_t i = 0;i < mMemory->nMergedTracks;i++) {if (mOutputTracks[i].CCE() == false) {mOutputTracks[i].SetNClusters(0);mOutputTracks[i].SetOK(false);}} //Remove all non-CE tracks } namespace o2::gpu::internal @@ -1533,7 +1533,7 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread nHits = 0; for (int32_t ipart = 0; ipart < nParts; ipart++) { const GPUTPCGMSectorTrack* t = trackParts[ipart]; - CADEBUG(printf("Collect Track %d Part %d QPt %f DzDs %f\n", mMemory->nOutputTracks, ipart, t->QPt(), t->DzDs())); + CADEBUG(printf("Collect Track %d Part %d QPt %f DzDs %f\n", mMemory->nMergedTracks, ipart, t->QPt(), t->DzDs())); int32_t nTrackHits = t->NClusters(); trackCluster* c2 = trackClusters + nHits + nTrackHits - 1; for (int32_t i = 0; i < nTrackHits; i++, c2--) { @@ -1678,10 +1678,10 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread cl[i].leg = trackClusters[i].leg; } - uint32_t iOutputTrack = CAMath::AtomicAdd(&mMemory->nOutputTracks, 1u); + uint32_t iOutputTrack = CAMath::AtomicAdd(&mMemory->nMergedTracks, 1u); if (iOutputTrack >= mNMaxTracks) { raiseError(GPUErrors::ERROR_MERGER_TRACK_OVERFLOW, iOutputTrack, mNMaxTracks); - CAMath::AtomicExch(&mMemory->nOutputTracks, mNMaxTracks); + CAMath::AtomicExch(&mMemory->nMergedTracks, mNMaxTracks); continue; } @@ -1718,9 +1718,9 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread p1.QPt() = 100.f / Param().rec.bz0Pt10MeV; } - // if (nParts > 1) printf("Merged %d: QPt %f %d parts %d hits\n", mMemory->nOutputTracks, p1.QPt(), nParts, nHits); + // if (nParts > 1) printf("Merged %d: QPt %f %d parts %d hits\n", mMemory->nMergedTracks, p1.QPt(), nParts, nHits); - /*if (GPUQA::QAAvailable() && mRec->GetQA() && mRec->GetQA()->SuppressTrack(mMemory->nOutputTracks)) + /*if (GPUQA::QAAvailable() && mRec->GetQA() && mRec->GetQA()->SuppressTrack(mMemory->nMergedTracks)) { mergedTrack.SetOK(0); mergedTrack.SetNClusters(0); @@ -1742,14 +1742,14 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread GPUd() void GPUTPCGMMerger::SortTracksPrepare(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { mTrackOrderProcess[i] = i; } } GPUd() void GPUTPCGMMerger::PrepareClustersForFit0(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nBlocks * nThreads) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nBlocks * nThreads) { mTrackSort[i] = i; } } @@ -1784,7 +1784,7 @@ GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_ ) // clang-format on }; - GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nOutputTracks, comp); + GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, comp); #endif } @@ -1810,13 +1810,13 @@ GPUd() void GPUTPCGMMerger::SortTracksQPt(int32_t nBlocks, int32_t nThreads, int ) // clang-format on }; - GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nOutputTracks, comp); + GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, comp); #endif } GPUd() void GPUTPCGMMerger::PrepareClustersForFit1(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nBlocks * nThreads) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nBlocks * nThreads) { mTrackOrderAttach[mTrackSort[i]] = i; const GPUTPCGMMergedTrack& trk = mOutputTracks[i]; if (trk.OK()) { @@ -1848,7 +1848,7 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit2(int32_t nBlocks, int32_t nThr GPUd() void GPUTPCGMMerger::Finalize0(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { mTrackSort[mTrackOrderAttach[i]] = i; } for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTrackClusters; i += nThreads * nBlocks) { @@ -1858,7 +1858,7 @@ GPUd() void GPUTPCGMMerger::Finalize0(int32_t nBlocks, int32_t nThreads, int32_t GPUd() void GPUTPCGMMerger::Finalize1(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { - for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nOutputTracks; i += nThreads * nBlocks) { + for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { const GPUTPCGMMergedTrack& trk = mOutputTracks[i]; if (!trk.OK() || trk.NClusters() == 0) { continue; @@ -1893,7 +1893,7 @@ GPUd() void GPUTPCGMMerger::Finalize2(int32_t nBlocks, int32_t nThreads, int32_t GPUd() void GPUTPCGMMerger::MergeLoopersInit(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { const float lowPtThresh = Param().rec.tpc.rejectQPtB5 * 1.1f; // Might need to merge tracks above the threshold with parts below the threshold - for (uint32_t i = get_global_id(0); i < mMemory->nOutputTracks; i += get_global_size(0)) { + for (uint32_t i = get_global_id(0); i < mMemory->nMergedTracks; i += get_global_size(0)) { const auto& trk = mOutputTracks[i]; const auto& p = trk.GetParam(); const float qptabs = CAMath::Abs(p.GetQPt()); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index 6c6e0e02a2dc2..6c9c14b557798 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -69,7 +69,7 @@ class GPUTPCGMMerger : public GPUProcessor GPUAtomic(uint32_t) nRetryRefit; GPUAtomic(uint32_t) nLoopData; GPUAtomic(uint32_t) nUnpackedTracks; - GPUAtomic(uint32_t) nOutputTracks; + GPUAtomic(uint32_t) nMergedTracks; GPUAtomic(uint32_t) nOutputTrackClusters; GPUAtomic(uint32_t) nO2Tracks; GPUAtomic(uint32_t) nO2ClusRefs; @@ -103,7 +103,7 @@ class GPUTPCGMMerger : public GPUProcessor void* SetPointersOutputState(void* mem); void* SetPointersMemory(void* mem); - GPUhdi() int32_t NOutputTracks() const { return mMemory->nOutputTracks; } + GPUhdi() int32_t NMergedTracks() const { return mMemory->nMergedTracks; } GPUhdi() const GPUTPCGMMergedTrack* OutputTracks() const { return mOutputTracks; } GPUhdi() GPUTPCGMMergedTrack* OutputTracks() { return mOutputTracks; } GPUhdi() const GPUdEdxInfo* OutputTracksdEdx() const { return mOutputTracksdEdx; } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index ac55f423b1c42..02d0ac98b05b0 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -94,7 +94,7 @@ void GPUTPCGMMerger::DumpMergeRanges(std::ostream& out, int32_t withinSector, in void GPUTPCGMMerger::DumpTrackLinks(std::ostream& out, bool output, const char* type) const { out << "\nTPC Merger Links " << type << "\n"; - const int32_t n = output ? mMemory->nOutputTracks : SectorTrackInfoLocalTotal(); + const int32_t n = output ? mMemory->nMergedTracks : SectorTrackInfoLocalTotal(); for (int32_t i = 0; i < n; i++) { if (mTrackLinks[i] != -1) { out << " " << i << ": " << mTrackLinks[i] << "\n"; @@ -138,7 +138,7 @@ void GPUTPCGMMerger::DumpCollected(std::ostream& out) const std::streamsize ss = out.precision(); out << std::setprecision(2); out << "\nTPC Merger Collected Tracks\n"; - for (uint32_t i = 0; i < mMemory->nOutputTracks; i++) { + for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { const auto& trk = mOutputTracks[i]; const auto& p = trk.GetParam(); out << " Track " << i << ": Loop " << trk.Looper() << " Alpha " << trk.GetAlpha() << " X " << p.GetX() << " offset " << p.GetTZOffset() << " Y " << p.GetY() << " Z " << p.GetZ() << " SPhi " << p.GetSinPhi() << " Tgl " << p.GetDzDs() << " QPt " << p.GetQPt() << " NCl " << trk.NClusters() << "\n"; @@ -150,7 +150,7 @@ void GPUTPCGMMerger::DumpMergeCE(std::ostream& out) const { DumpTrackLinks(out, true, " for CE merging"); out << "\nTPC Merger Merge CE\n"; - for (uint32_t i = 0; i < mMemory->nOutputTracks; i++) { + for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { const auto& trk = mOutputTracks[i]; if (trk.CCE()) { out << " Track " << i << ": CCE\n"; @@ -162,11 +162,11 @@ void GPUTPCGMMerger::DumpFitPrepare(std::ostream& out) const { out << "\nTPC Merger Refit Prepare\n"; out << " Sort\n"; - for (uint32_t i = 0; i < mMemory->nOutputTracks; i++) { + for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { out << " " << i << ": " << mTrackOrderAttach[i] << "\n"; } out << " Clusters\n"; - for (uint32_t j = 0; j < mMemory->nOutputTracks; j++) { + for (uint32_t j = 0; j < mMemory->nMergedTracks; j++) { const auto& trk = mOutputTracks[j]; out << " Track " << j << ": "; for (uint32_t i = trk.FirstClusterRef(); i < trk.FirstClusterRef() + trk.NClusters(); i++) { @@ -195,7 +195,7 @@ void GPUTPCGMMerger::DumpRefit(std::ostream& out) const std::streamsize ss = out.precision(); out << std::setprecision(2); out << "\nTPC Merger Refit\n"; - for (uint32_t i = 0; i < mMemory->nOutputTracks; i++) { + for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { const auto& trk = mOutputTracks[i]; if (trk.NClusters() == 0) { continue; @@ -212,7 +212,7 @@ void GPUTPCGMMerger::DumpRefit(std::ostream& out) const void GPUTPCGMMerger::DumpFinal(std::ostream& out) const { out << "\nTPC Merger Finalized\n"; - for (uint32_t j = 0; j < mMemory->nOutputTracks; j++) { + for (uint32_t j = 0; j < mMemory->nMergedTracks; j++) { const auto& trk = mOutputTracks[j]; if (trk.NClusters() == 0) { continue; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx index d72d59a6250e7..68763b3549547 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx @@ -21,7 +21,7 @@ using namespace o2::gpu; template <> GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger, int32_t mode) { - const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NOutputTracks(); + const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NMergedTracks(); GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), int32_t, ii, iEnd, { const int32_t i = mode == -1 ? merger.RetryRefitIds()[ii] : mode ? merger.TrackOrderProcess()[ii] : ii; GPUTPCGMTrackParam::RefitTrack(merger.OutputTracks()[i], i, &merger, mode == -1); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 9ead17ea5c7c0..72e9f63e5da83 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -38,7 +38,7 @@ template <> GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { const GPUTPCGMMergedTrack* tracks = merger.OutputTracks(); - const uint32_t nTracks = merger.NOutputTracks(); + const uint32_t nTracks = merger.NMergedTracks(); const GPUTPCGMMergedTrackHit* trackClusters = merger.Clusters(); const GPUdEdxInfo* tracksdEdx = merger.OutputTracksdEdx(); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx index 78eea63edecdd..ebc9d22560524 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx @@ -34,7 +34,7 @@ void GPUTPCGMTracksToTPCSeeds::CreateSeedsFromHLTTracks(TObjArray* seeds, AliTPC } seeds->Clear(); int32_t index = 0; - for (int32_t i = 0; i < merger->NOutputTracks(); i++) { + for (int32_t i = 0; i < merger->NMergedTracks(); i++) { const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; if (!track.OK()) { continue; @@ -112,7 +112,7 @@ void GPUTPCGMTracksToTPCSeeds::UpdateParamsOuter(TObjArray* seeds) return; } int32_t index = 0; - for (int32_t i = 0; i < merger->NOutputTracks(); i++) { + for (int32_t i = 0; i < merger->NMergedTracks(); i++) { const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; if (!track.OK()) { continue; @@ -134,7 +134,7 @@ void GPUTPCGMTracksToTPCSeeds::UpdateParamsInner(TObjArray* seeds) return; } int32_t index = 0; - for (int32_t i = 0; i < merger->NOutputTracks(); i++) { + for (int32_t i = 0; i < merger->NMergedTracks(); i++) { const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; if (!track.OK()) { continue; diff --git a/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx b/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx index a21593b7ba9e9..e63bb82a9b09e 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx @@ -100,7 +100,7 @@ GPUdii() void GPUTPCGlobalDebugSortKernels::Thread Date: Tue, 13 May 2025 10:25:11 +0200 Subject: [PATCH 3/7] GPU: Add additional optional debbug dumps for validation --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- .../Global/GPUChainTrackingDebug.h | 45 ++++++++++--------- .../Global/GPUChainTrackingMerger.cxx | 7 +-- .../Global/GPUChainTrackingSectorTracker.cxx | 4 +- GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h | 1 + GPU/GPUTracking/Merger/GPUTPCGMMerger.h | 1 + GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx | 15 ++++++- 7 files changed, 47 insertions(+), 28 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 9e0aa32155f0d..9400a429fca81 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -295,7 +295,7 @@ AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidat AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking") AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))") AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)") -AddOption(debugMask, uint32_t, 262143, "", 0, "Mask for debug output dumps to file") +AddOption(debugMask, uint32_t, (1 << 18) - 1, "debugMask", 0, "Mask for debug output dumps to file") AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6") AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures") AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h index 810f40a1d8654..6c995f65f3dd3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h +++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h @@ -23,28 +23,29 @@ namespace o2::gpu { // NOTE: Values below 262144 are activated by default with --debug 6 in GPUSettingsList.h::debugMask enum GPUChainTrackingDebugFlags : uint32_t { - TPCSectorTrackingData = 1, - TPCPreLinks = 2, - TPCLinks = 4, - TPCStartHits = 8, - TPCTracklets = 16, - TPCSectorTracks = 32, - TPCHitWeights = 256, - TPCCompressedClusters = 512, - TPCDecompressedClusters = 1024, - TPCMergingRanges = 2048, - TPCMergingSectorTracks = 4096, - TPCMergingMergedTracks = 8192, - TPCMergingCollectedTracks = 16384, - TPCMergingCE = 32768, - TPCMergingRefit = 65536, - TPCClustererClusters = 131072, - TPCClusterer = 262144, - TPCClustererDigits = 262144 << 1, - TPCClustererPeaks = 262144 << 2, - TPCClustererSuppressedPeaks = 262144 << 3, - TPCClustererChargeMap = 262144 << 4, - TPCClustererZeroedCharges = 262144 << 5 + TPCSectorTrackingData = 1 << 0, + TPCPreLinks = 1 << 1, + TPCLinks = 1 << 2, + TPCStartHits = 1 << 3, + TPCTracklets = 1 << 4, + TPCSectorTracks = 1 << 5, + TPCHitWeights = 1 << 6, + TPCMergingRanges = 1 << 7, + TPCMergingSectorTracks = 1 << 8, + TPCMergingMatching = 1 << 9, + TPCMergingCollectedTracks = 1 << 10, + TPCMergingCE = 1 << 11, + TPCMergingPrepareFit = 1 << 12, + TPCMergingRefit = 1 << 13, + TPCMergingLoopers = 1 << 14, + TPCCompressedClusters = 1 << 15, + TPCDecompressedClusters = 1 << 16, + TPCClustererClusters = 1 << 17, + TPCClustererDigits = 1 << 18, + TPCClustererPeaks = 1 << 19, + TPCClustererSuppressedPeaks = 1 << 20, + TPCClustererChargeMap = 1 << 21, + TPCClustererZeroedCharges = 1 << 22 }; template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index bd1fa7796dadf..df80eabfb8761 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -143,7 +143,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridAuto(0, deviceType)); RunTPCTrackingMerger_MergeBorderTracks(1, 0, deviceType); RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMergedTracks, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSectors, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMatching, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSectors, *mDebugFile); runKernel(GetGridAuto(0, deviceType), false); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSECTORS * sizeof(*MergerShadowAll.TmpCounter())); @@ -158,7 +158,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridBlk(std::max(2u, numBlocks), 0, deviceType), 0, 1, 1); RunTPCTrackingMerger_MergeBorderTracks(0, -1, deviceType); RunTPCTrackingMerger_Resolve(0, 1, deviceType); - DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMergedTracks, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSectors, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMatching, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSectors, *mDebugFile); runKernel({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSECTORS * sizeof(*MergerShadowAll.TmpCounter())); @@ -202,7 +202,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) runKernel(GetGridAuto(0, deviceType)); runKernel(GetGridAuto(0, deviceType)); - DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRefit, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); + DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingPrepareFit, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile); if (doGPU) { CondWaitEvent(waitForTransfer, &mEvents->single); @@ -240,6 +240,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) } runKernel(GetGridAuto(0, deviceType)); runKernel(doGPU ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType)); + DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingLoopers, Merger, &GPUTPCGMMerger::DumpLoopers, *mDebugFile); } DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRefit, doGPU, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index ef38d53173c2b..67ef402961a20 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -176,7 +176,9 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() } if (GetProcessingSettings().debugLevel >= 6) { - *mDebugFile << "\n\nReconstruction: Sector " << iSector << "/" << NSECTORS << std::endl; + if ((GetProcessingSettings().debugMask & 63)) { + *mDebugFile << "\n\nReconstruction: Sector " << iSector << "/" << NSECTORS << std::endl; + } if (GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCSectorTrackingData) { if (doGPU) { TransferMemoryResourcesToHost(RecoStep::TPCSectorTracking, &trk, -1, true); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h b/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h index 6ef2ed2ede668..73b14ba1b2fdf 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergedTrack.h @@ -95,6 +95,7 @@ class GPUTPCGMMergedTrack GPUd() void SetFlags(uint8_t v) { mFlags = v; } GPUd() void SetLegs(uint8_t v) { mLegs = v; } GPUd() uint8_t Legs() const { return mLegs; } + GPUd() uint8_t Flags() const { return mFlags; } GPUd() const gputpcgmmergertypes::GPUTPCOuterParam& OuterParam() const { return mOuterParam; } GPUd() gputpcgmmergertypes::GPUTPCOuterParam& OuterParam() { return mOuterParam; } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index 6c9c14b557798..ae85f20b17b48 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -201,6 +201,7 @@ class GPUTPCGMMerger : public GPUProcessor void DumpFitPrepare(std::ostream& out) const; void DumpRefit(std::ostream& out) const; void DumpFinal(std::ostream& out) const; + void DumpLoopers(std::ostream& out) const; template void MergedTrackStreamerInternal(const GPUTPCGMBorderTrack& b1, const GPUTPCGMBorderTrack& b2, const char* name, int32_t sector1, int32_t sector2, int32_t mergeMode, float weight, float frac) const; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index 02d0ac98b05b0..3be32a2d87610 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -204,11 +204,24 @@ void GPUTPCGMMerger::DumpRefit(std::ostream& out) const const auto& po = trk.OuterParam(); out << " Track " << i << ": OK " << trk.OK() << " Alpha " << trk.GetAlpha() << " X " << p.GetX() << " offset " << p.GetTZOffset() << " Y " << p.GetY() << " Z " << p.GetZ() << " SPhi " << p.GetSinPhi() << " Tgl " << p.GetDzDs() << " QPt " << p.GetQPt() << " NCl " << trk.NClusters() << " / " << trk.NClustersFitted() << " Cov " << p.GetErr2Y() << "/" << p.GetErr2Z() << " dEdx " << (trk.OK() && Param().dodEdxEnabled ? mOutputTracksdEdx[i].dEdxTotTPC : -1.f) << "/" << (trk.OK() && Param().dodEdxEnabled ? mOutputTracksdEdx[i].dEdxMaxTPC : -1.f) - << " Outer " << po.P[0] << "/" << po.P[1] << "/" << po.P[2] << "/" << po.P[3] << "/" << po.P[4] << "\n"; + << " Outer " << po.P[0] << "/" << po.P[1] << "/" << po.P[2] << "/" << po.P[3] << "/" << po.P[4] + << " NFitted " << trk.NClustersFitted() << " legs " << (int)trk.Legs() << " flags " << (int)trk.Flags() << "\n"; } out << std::setprecision(ss); } +void GPUTPCGMMerger::DumpLoopers(std::ostream& out) const +{ + out << "\n TPC Merger Looper Afterburner\n"; + for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { + if (i && i % 100 == 0) { + out << "\n"; + } + out << (int)mOutputTracks[i].MergedLooper() << " "; + } + out << "\n"; +} + void GPUTPCGMMerger::DumpFinal(std::ostream& out) const { out << "\nTPC Merger Finalized\n"; From bbca3bb88304ad30253ecf1a75e5ecf4975ccc53 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 10:25:47 +0200 Subject: [PATCH 4/7] GPU TPC: Fix deterministic mode for TPC cluster compression / decompression / looper merging afterburner --- .../Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc | 4 ++-- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 2 +- GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index d3dd561dcea2f..1d633eb5e748f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -82,14 +82,14 @@ struct GPUTPCGMMergerSortTracksQPt_comp { struct GPUTPCGMMergerMergeLoopers_comp { GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b) { - return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); + return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); } }; struct GPUTPCGMO2OutputSort_comp { GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) { - return (a.y > b.y); + return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); } }; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index e96bbeee774bf..99ef548b2d78e 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -1948,7 +1948,7 @@ GPUd() void GPUTPCGMMerger::MergeLoopersSort(int32_t nBlocks, int32_t nThreads, if (iThread || iBlock) { return; } - auto comp = [](const MergeLooperParam& a, const MergeLooperParam& b) { return CAMath::Abs(a.refz) < CAMath::Abs(b.refz); }; + auto comp = [](const MergeLooperParam& a, const MergeLooperParam& b) { return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); }; GPUCommonAlgorithm::sortDeviceDynamic(mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates, comp); #endif } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 72e9f63e5da83..624c9ab487c8d 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -88,7 +88,7 @@ GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, return; } GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2(); - auto comp = [](const auto& a, const auto& b) { return (a.y > b.y); }; + auto comp = [](const auto& a, const auto& b) { return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); }; GPUCommonAlgorithm::sortDeviceDynamic(trackSort, trackSort + merger.Memory()->nO2Tracks, comp); #endif } From 309851071924418a624bc62ee15b6dcbaff7fd91 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 10:51:13 +0200 Subject: [PATCH 5/7] GPU: Deduplicate sort comparisons: Use structs, since both hipcub and rocthrust do not work with lambdas for some reason --- ...GPUReconstructionCUDAKernelsSpecialize.inc | 82 ---------- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 143 ++++++++++-------- GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx | 20 ++- 3 files changed, 98 insertions(+), 147 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index 1d633eb5e748f..44cde3d4ac48a 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -14,88 +14,6 @@ #if defined(GPUCA_SPECIALIZE_THRUST_SORTS) && !defined(GPUCA_GPUCODE_COMPILEKERNELS) -namespace o2::gpu::internal -{ -namespace // anonymous -{ -struct MergeBorderTracks_compMax { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); - } -}; -struct MergeBorderTracks_compMin { - GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) - { - return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); - } -}; - -struct GPUTPCGMMergerSortTracks_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - if (a.CCE() != b.CCE()) { - return a.CCE() > b.CCE(); - } - if (a.Legs() != b.Legs()) { - return a.Legs() > b.Legs(); - } - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (a.NClusters() != b.NClusters()) { - return a.NClusters() > b.NClusters(); - } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return aa > bb; - , // !GPUCA_DETERMINISTIC_CODE - return a.NClusters() > b.NClusters(); - ) // clang-format on - } -}; - -struct GPUTPCGMMergerSortTracksQPt_comp { - const GPUTPCGMMergedTrack* const mCmp; - GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} - GPUd() bool operator()(const int32_t aa, const int32_t bb) - { - const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return a.GetParam().GetZ() > b.GetParam().GetZ(); - , // !GPUCA_DETERMINISTIC_CODE - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - ) // clang-format on - } -}; - -struct GPUTPCGMMergerMergeLoopers_comp { - GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b) - { - return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); - } -}; - -struct GPUTPCGMO2OutputSort_comp { - GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) - { - return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); - } -}; - -} // anonymous namespace -} // namespace o2::gpu::internal - template <> inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz, GPUTPCGMBorderRange* const& range, int32_t const& N, int32_t const& cmpMax) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index 99ef548b2d78e..b12375a10023a 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -59,17 +59,13 @@ #include "SimulationDataFormat/MCCompLabel.h" #endif -namespace o2::gpu::internal -{ -} +static constexpr int32_t kMaxParts = 400; +static constexpr int32_t kMaxClusters = GPUCA_MERGER_MAX_TRACK_CLUSTERS; + using namespace o2::gpu; -using namespace o2::gpu::internal; using namespace o2::tpc; using namespace gputpcgmmergertypes; -static constexpr int32_t kMaxParts = 400; -static constexpr int32_t kMaxClusters = GPUCA_MERGER_MAX_TRACK_CLUSTERS; - namespace o2::gpu::internal { struct MergeLooperParam { @@ -78,8 +74,79 @@ struct MergeLooperParam { float y; uint32_t id; }; + +struct MergeBorderTracks_compMax { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); + } +}; +struct MergeBorderTracks_compMin { + GPUd() bool operator()(const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) + { + return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); + } +}; + +struct GPUTPCGMMergerSortTracks_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracks_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + if (a.CCE() != b.CCE()) { + return a.CCE() > b.CCE(); + } + if (a.Legs() != b.Legs()) { + return a.Legs() > b.Legs(); + } + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (a.NClusters() != b.NClusters()) { + return a.NClusters() > b.NClusters(); + } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return aa > bb; + , // !GPUCA_DETERMINISTIC_CODE + return a.NClusters() > b.NClusters(); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerSortTracksQPt_comp { + const GPUTPCGMMergedTrack* const mCmp; + GPUhd() GPUTPCGMMergerSortTracksQPt_comp(GPUTPCGMMergedTrack* cmp) : mCmp(cmp) {} + GPUd() bool operator()(const int32_t aa, const int32_t bb) + { + const GPUTPCGMMergedTrack& GPUrestrict() a = mCmp[aa]; + const GPUTPCGMMergedTrack& GPUrestrict() b = mCmp[bb]; + GPUCA_DETERMINISTIC_CODE( // clang-format off + if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + } if (a.GetParam().GetY() != b.GetParam().GetY()) { + return a.GetParam().GetY() > b.GetParam().GetY(); + } + return a.GetParam().GetZ() > b.GetParam().GetZ(); + , // !GPUCA_DETERMINISTIC_CODE + return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); + ) // clang-format on + } +}; + +struct GPUTPCGMMergerMergeLoopers_comp { + GPUd() bool operator()(const MergeLooperParam& a, const MergeLooperParam& b) + { + return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); + } +}; + } // namespace o2::gpu::internal +using namespace o2::gpu::internal; + #ifndef GPUCA_GPUCODE #include "GPUQA.h" @@ -742,11 +809,11 @@ template <> GPUd() void GPUTPCGMMerger::MergeBorderTracks<3>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUTPCGMBorderRange* range, int32_t N, int32_t cmpMax) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS - if (iThread == 0) { + if (iThread == 0 && iBlock == 0) { if (cmpMax) { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMax != b.fMax) ? (a.fMax < b.fMax) : (a.fId < b.fId), a.fMax < b.fMax); }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, MergeBorderTracks_compMax()); } else { - GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, [](const GPUTPCGMBorderRange& a, const GPUTPCGMBorderRange& b) { return GPUCA_DETERMINISTIC_CODE((a.fMin != b.fMin) ? (a.fMin < b.fMin) : (a.fId < b.fId), a.fMin < b.fMin); }); + GPUCommonAlgorithm::sortDeviceDynamic(range, range + N, MergeBorderTracks_compMin()); } } #endif @@ -1757,60 +1824,18 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit0(int32_t nBlocks, int32_t nThr GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS - if (iThread || iBlock) { - return; + if (iThread == 0 && iBlock == 0) { + GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, GPUTPCGMMergerSortTracks_comp(mOutputTracks)); } - // TODO: Fix this: Have to duplicate sort comparison: Thrust cannot use the Lambda but OpenCL cannot use the object - auto comp = [cmp = mOutputTracks](const int32_t aa, const int32_t bb) { - const GPUTPCGMMergedTrack& GPUrestrict() a = cmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = cmp[bb]; - if (a.CCE() != b.CCE()) { - return a.CCE() > b.CCE(); - } - if (a.Legs() != b.Legs()) { - return a.Legs() > b.Legs(); - } - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (a.NClusters() != b.NClusters()) { - return a.NClusters() > b.NClusters(); - } if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return aa > bb; - , // !GPUCA_DETERMINISTIC_CODE - return a.NClusters() > b.NClusters(); - ) // clang-format on - }; - - GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, comp); #endif } GPUd() void GPUTPCGMMerger::SortTracksQPt(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS - if (iThread || iBlock) { - return; + if (iThread == 0 && iBlock == 0) { + GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, GPUTPCGMMergerSortTracksQPt_comp(mOutputTracks)); } - // TODO: Fix this: Have to duplicate sort comparison: Thrust cannot use the Lambda but OpenCL cannot use the object - auto comp = [cmp = mOutputTracks](const int32_t aa, const int32_t bb) { - const GPUTPCGMMergedTrack& GPUrestrict() a = cmp[aa]; - const GPUTPCGMMergedTrack& GPUrestrict() b = cmp[bb]; - GPUCA_DETERMINISTIC_CODE( // clang-format off - if (CAMath::Abs(a.GetParam().GetQPt()) != CAMath::Abs(b.GetParam().GetQPt())) { - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - } if (a.GetParam().GetY() != b.GetParam().GetY()) { - return a.GetParam().GetY() > b.GetParam().GetY(); - } - return a.GetParam().GetZ() > b.GetParam().GetZ(); - , // !GPUCA_DETERMINISTIC_CODE - return CAMath::Abs(a.GetParam().GetQPt()) > CAMath::Abs(b.GetParam().GetQPt()); - ) // clang-format on - }; - - GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, comp); #endif } @@ -1945,11 +1970,9 @@ GPUd() void GPUTPCGMMerger::MergeLoopersInit(int32_t nBlocks, int32_t nThreads, GPUd() void GPUTPCGMMerger::MergeLoopersSort(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS - if (iThread || iBlock) { - return; + if (iThread == 0 && iBlock == 0) { + GPUCommonAlgorithm::sortDeviceDynamic(mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates, GPUTPCGMMergerMergeLoopers_comp()); } - auto comp = [](const MergeLooperParam& a, const MergeLooperParam& b) { return GPUCA_DETERMINISTIC_CODE(CAMath::Abs(a.refz) != CAMath::Abs(b.refz) ? CAMath::Abs(a.refz) < CAMath::Abs(b.refz) : a.id < b.id, CAMath::Abs(a.refz) < CAMath::Abs(b.refz)); }; - GPUCommonAlgorithm::sortDeviceDynamic(mLooperCandidates, mLooperCandidates + mMemory->nLooperMatchCandidates, comp); #endif } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 624c9ab487c8d..1e08058fb22dd 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -34,6 +34,18 @@ using namespace o2::tpc::constants; GPUdi() static constexpr uint8_t getFlagsReject() { return GPUTPCGMMergedTrackHit::flagReject | GPUTPCGMMergedTrackHit::flagNotFit; } GPUdi() static uint32_t getFlagsRequired(const GPUSettingsRec& rec) { return rec.tpc.dropSecondaryLegsInOutput ? gputpcgmmergertypes::attachGoodLeg : gputpcgmmergertypes::attachZero; } +namespace o2::gpu::internal +{ + +struct GPUTPCGMO2OutputSort_comp { + GPUd() bool operator()(const GPUTPCGMMerger::tmpSort& a, const GPUTPCGMMerger::tmpSort& b) + { + return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); + } +}; + +} // namespace o2::gpu::internal + template <> GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { @@ -84,12 +96,10 @@ template <> GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS - if (iThread || iBlock) { - return; + if (iThread == 0 && iBlock == 0) { + GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2(); + GPUCommonAlgorithm::sortDeviceDynamic(trackSort, trackSort + merger.Memory()->nO2Tracks, internal::GPUTPCGMO2OutputSort_comp()); } - GPUTPCGMMerger::tmpSort* GPUrestrict() trackSort = merger.TrackSortO2(); - auto comp = [](const auto& a, const auto& b) { return GPUCA_DETERMINISTIC_CODE(a.y != b.y ? a.y > b.y : a.x > b.x, a.y > b.y); }; - GPUCommonAlgorithm::sortDeviceDynamic(trackSort, trackSort + merger.Memory()->nO2Tracks, comp); #endif } From 8b28ac24d84f2a668a3fabfaec2b5763d7be6e34 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 11:04:35 +0200 Subject: [PATCH 6/7] GPU: Remove obsolete files used for tests in Run 2 --- .../Merger/GPUTPCGMTracksToTPCSeeds.cxx | 149 ------------------ .../Merger/GPUTPCGMTracksToTPCSeeds.h | 29 ---- 2 files changed, 178 deletions(-) delete mode 100644 GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx delete mode 100644 GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.h diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx deleted file mode 100644 index ebc9d22560524..0000000000000 --- a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.cxx +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUTPCGMTracksToTPCSeeds.cxx -/// \author David Rohr - -#include "GPUTPCGMTracksToTPCSeeds.h" -#include "GPUTPCGlobalMergerComponent.h" -#include "GPUTPCGMMergerTypes.h" -#include "GPUTPCGMMerger.h" -#include "GPULogging.h" -#include "AliTPCtracker.h" -#include "AliTPCtrack.h" -#include "AliTPCseed.h" -#include "AliTPCtrackerSector.h" -#include "TObjArray.h" -#include "AliTPCclusterMI.h" - -using namespace o2::gpu; - -void GPUTPCGMTracksToTPCSeeds::CreateSeedsFromHLTTracks(TObjArray* seeds, AliTPCtracker* tpctracker) -{ - const GPUTPCGMMerger* merger = GPUTPCGlobalMergerComponent::GetCurrentMerger(); - if (merger == nullptr) { - return; - } - seeds->Clear(); - int32_t index = 0; - for (int32_t i = 0; i < merger->NMergedTracks(); i++) { - const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; - if (!track.OK()) { - continue; - } - - AliTPCtrack tr; - tr.Set(track.GetParam().GetX(), track.GetAlpha(), track.GetParam().GetPar(), track.GetParam().GetCov()); - AliTPCseed* seed = new (tpctracker->NextFreeSeed()) AliTPCseed(tr); - for (int32_t j = 0; j < GPUCA_ROW_COUNT; j++) { - seed->SetClusterPointer(j, nullptr); - seed->SetClusterIndex(j, -1); - } - int32_t ncls = 0; - int32_t lastrow = -1; - int32_t lastleg = -1; - for (int32_t j = track.NClusters() - 1; j >= 0; j--) { - const GPUTPCGMMergedTrackHit& cls = merger->Clusters()[track.FirstClusterRef() + j]; - if (cls.state & GPUTPCGMMergedTrackHit::flagReject) { - continue; - } - if (lastrow != -1 && (cls.row < lastrow || cls.leg != lastleg)) { - break; - } - if (cls.row == lastrow) { - continue; - } - - AliTPCtrackerRow& row = tpctracker->GetRow(cls.sector % 18, cls.row); - uint32_t clIndexOffline = 0; - AliTPCclusterMI* clOffline = row.FindNearest2(cls.y, cls.z, 0.01f, 0.01f, clIndexOffline); - if (!clOffline) { - continue; - } - clIndexOffline = row.GetIndex(clIndexOffline); - - clOffline->Use(10); - seed->SetClusterPointer(cls.row, clOffline); - seed->SetClusterIndex2(cls.row, clIndexOffline); - - lastrow = cls.row; - lastleg = cls.leg; - ncls++; - } - - seed->SetRelativeSector(track.GetAlpha() / (M_PI / 9.f)); - seed->SetNumberOfClusters(ncls); - seed->SetNFoundable(ncls); - seed->SetChi2(track.GetParam().GetChi2()); - - float alpha = seed->GetAlpha(); - if (alpha >= 2.f * M_PI) { - alpha -= 2.f * M_PI; - } - if (alpha < 0) { - alpha += 2.f * M_PI; - } - seed->SetRelativeSector(track.GetAlpha() / (M_PI / 9.f)); - - seed->SetPoolID(tpctracker->GetLastSeedId()); - seed->SetIsSeeding(kTRUE); - seed->SetSeed1(GPUCA_ROW_COUNT - 1); - seed->SetSeed2(GPUCA_ROW_COUNT - 2); - seed->SetSeedType(0); - seed->SetFirstPoint(-1); - seed->SetLastPoint(-1); - seeds->AddLast(seed); // note, track is seed, don't free the seed - index++; - } -} - -void GPUTPCGMTracksToTPCSeeds::UpdateParamsOuter(TObjArray* seeds) -{ - const GPUTPCGMMerger* merger = GPUTPCGlobalMergerComponent::GetCurrentMerger(); - if (merger == nullptr) { - return; - } - int32_t index = 0; - for (int32_t i = 0; i < merger->NMergedTracks(); i++) { - const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; - if (!track.OK()) { - continue; - } - if (index > seeds->GetEntriesFast()) { - GPUError("Invalid number of offline seeds"); - return; - } - AliTPCseed* seed = (AliTPCseed*)seeds->UncheckedAt(index++); - const gputpcgmmergertypes::GPUTPCOuterParam& param = track.OuterParam(); - seed->Set(param.X, param.alpha, param.P, param.C); - } -} - -void GPUTPCGMTracksToTPCSeeds::UpdateParamsInner(TObjArray* seeds) -{ - const GPUTPCGMMerger* merger = GPUTPCGlobalMergerComponent::GetCurrentMerger(); - if (merger == nullptr) { - return; - } - int32_t index = 0; - for (int32_t i = 0; i < merger->NMergedTracks(); i++) { - const GPUTPCGMMergedTrack& track = merger->OutputTracks()[i]; - if (!track.OK()) { - continue; - } - if (index > seeds->GetEntriesFast()) { - GPUError("Invalid number of offline seeds"); - return; - } - AliTPCseed* seed = (AliTPCseed*)seeds->UncheckedAt(index++); - seed->Set(track.GetParam().GetX(), track.GetAlpha(), track.GetParam().GetPar(), track.GetParam().GetCov()); - } -} diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.h b/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.h deleted file mode 100644 index 029cb108d4119..0000000000000 --- a/GPU/GPUTracking/Merger/GPUTPCGMTracksToTPCSeeds.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file GPUTPCGMTracksToTPCSeeds.h -/// \author David Rohr - -#ifndef GPUTPCGMTRACKSTOTPCSEEDS_H -#define GPUTPCGMTRACKSTOTPCSEEDS_H - -class TObjArray; -class AliTPCtracker; - -class GPUTPCGMTracksToTPCSeeds -{ - public: - static void CreateSeedsFromHLTTracks(TObjArray* seeds, AliTPCtracker* tpctracker); - static void UpdateParamsOuter(TObjArray* seeds); - static void UpdateParamsInner(TObjArray* seeds); -}; - -#endif From 0c7120e754d1b38602413525e16ee19ab3cec874 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 13 May 2025 11:04:56 +0200 Subject: [PATCH 7/7] GPU TPC: Some more member variable renaming --- ...GPUReconstructionCUDAKernelsSpecialize.inc | 4 +- .../Global/GPUChainTrackingMerger.cxx | 8 ++-- .../Global/GPUChainTrackingRefit.cxx | 2 +- GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx | 38 +++++++++---------- GPU/GPUTracking/Merger/GPUTPCGMMerger.h | 18 ++++----- GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx | 14 +++---- GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx | 2 +- GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx | 10 ++--- GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 12 +++--- .../Merger/GPUTPCGlobalDebugSortKernels.cxx | 10 ++--- 10 files changed, 59 insertions(+), 59 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc index 44cde3d4ac48a..85567d70d70d6 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAKernelsSpecialize.inc @@ -27,13 +27,13 @@ inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed template <> inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackOrderProcess(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracks_comp(mProcessorsShadow->tpcMerger.MergedTracks())); } template <> inline void GPUCA_M_CAT(GPUReconstruction, GPUCA_GPUTYPE)::runKernelBackendTimed(const krnlSetupTime& _xyz) { - GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.OutputTracks())); + GPUCommonAlgorithm::sortOnDevice(this, _xyz.x.stream, mProcessorsShadow->tpcMerger.TrackSort(), processors()->tpcMerger.NMergedTracks(), GPUTPCGMMergerSortTracksQPt_comp(mProcessorsShadow->tpcMerger.MergedTracks())); } template <> diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx index df80eabfb8761..2b3d719a27dea 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx @@ -256,10 +256,10 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) throw std::runtime_error("QA Scratch buffer exceeded"); } } - GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracks(), MergerShadowAll.OutputTracks(), Merger.NMergedTracks() * sizeof(*Merger.OutputTracks()), outputStream, 0, nullptr, waitEvent); + GPUMemCpy(RecoStep::TPCMerging, Merger.MergedTracks(), MergerShadowAll.MergedTracks(), Merger.NMergedTracks() * sizeof(*Merger.MergedTracks()), outputStream, 0, nullptr, waitEvent); waitEvent = nullptr; if (param().dodEdxEnabled) { - GPUMemCpy(RecoStep::TPCMerging, Merger.OutputTracksdEdx(), MergerShadowAll.OutputTracksdEdx(), Merger.NMergedTracks() * sizeof(*Merger.OutputTracksdEdx()), outputStream, 0); + GPUMemCpy(RecoStep::TPCMerging, Merger.MergedTracksdEdx(), MergerShadowAll.MergedTracksdEdx(), Merger.NMergedTracks() * sizeof(*Merger.MergedTracksdEdx()), outputStream, 0); } GPUMemCpy(RecoStep::TPCMerging, Merger.Clusters(), MergerShadowAll.Clusters(), Merger.NOutputTrackClusters() * sizeof(*Merger.Clusters()), outputStream, 0); if (param().par.earlyTpcTransform) { @@ -326,7 +326,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) mRec->ReturnVolatileDeviceMemory(); } - mIOPtrs.mergedTracks = Merger.OutputTracks(); + mIOPtrs.mergedTracks = Merger.MergedTracks(); mIOPtrs.nMergedTracks = Merger.NMergedTracks(); mIOPtrs.mergedTrackHits = Merger.Clusters(); mIOPtrs.mergedTrackHitsXYZ = Merger.ClustersXYZ(); @@ -340,7 +340,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput) mIOPtrs.outputTracksTPCO2MC = Merger.OutputTracksTPCO2MC(); if (doGPU) { - processorsShadow()->ioPtrs.mergedTracks = MergerShadow.OutputTracks(); + processorsShadow()->ioPtrs.mergedTracks = MergerShadow.MergedTracks(); processorsShadow()->ioPtrs.nMergedTracks = Merger.NMergedTracks(); processorsShadow()->ioPtrs.mergedTrackHits = MergerShadow.Clusters(); processorsShadow()->ioPtrs.mergedTrackHitsXYZ = MergerShadow.ClustersXYZ(); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx index 4662b5464f710..5ca20a39d0462 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingRefit.cxx @@ -33,7 +33,7 @@ int32_t GPUChainTracking::RunRefit() SetupGPUProcessor(&Refit, false); RefitShadow.SetPtrsFromGPUConstantMem(processorsShadow(), doGPU ? &processorsDevice()->param : nullptr); RefitShadow.SetPropagator(doGPU ? processorsShadow()->calibObjects.o2Propagator : GetO2Propagator()); - RefitShadow.mPTracks = (doGPU ? processorsShadow() : processors())->tpcMerger.OutputTracks(); + RefitShadow.mPTracks = (doGPU ? processorsShadow() : processors())->tpcMerger.MergedTracks(); WriteToConstantMemory(RecoStep::Refit, (char*)&processors()->trackingRefit - (char*)processors(), &RefitShadow, sizeof(RefitShadow), 0); // TransferMemoryResourcesToGPU(RecoStep::Refit, &Refit, 0); if (param().rec.trackingRefitGPUModel) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx index b12375a10023a..f1a0816529c3a 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.cxx @@ -365,11 +365,11 @@ void* GPUTPCGMMerger::SetPointersRefitScratch(void* mem) void* GPUTPCGMMerger::SetPointersOutput(void* mem) { - computePointerWithAlignment(mem, mOutputTracks, mNMaxTracks); + computePointerWithAlignment(mem, mMergedTracks, mNMaxTracks); if (mRec->GetParam().dodEdxEnabled) { - computePointerWithAlignment(mem, mOutputTracksdEdx, mNMaxTracks); + computePointerWithAlignment(mem, mMergedTracksdEdx, mNMaxTracks); if (mRec->GetParam().rec.tpc.dEdxClusterRejectionFlagMask != mRec->GetParam().rec.tpc.dEdxClusterRejectionFlagMaskAlt) { - computePointerWithAlignment(mem, mOutputTracksdEdxAlt, mNMaxTracks); + computePointerWithAlignment(mem, mMergedTracksdEdxAlt, mNMaxTracks); } } computePointerWithAlignment(mem, mClusters, mNMaxOutputTrackClusters); @@ -1318,7 +1318,7 @@ GPUd() void GPUTPCGMMerger::MergeCEFill(const GPUTPCGMSectorTrack* track, const const float x0 = GPUTPCGeometry::Row2X(attempt == 0 ? 63 : cls.row); if (track->TransportToX(this, x0, Param().bzCLight, b, GPUCA_MAX_SIN_PHI_LOW)) { b.SetTrackID(itr); - b.SetNClusters(mOutputTracks[itr].NClusters()); + b.SetNClusters(mMergedTracks[itr].NClusters()); if (CAMath::Abs(b.Cov()[4]) >= 0.5f) { b.SetCov(4, 0.5f); // TODO: Is this needed and better than the cut in BorderTrack? } @@ -1339,11 +1339,11 @@ GPUd() void GPUTPCGMMerger::MergeCE(int32_t nBlocks, int32_t nThreads, int32_t i { const ClusterNative* cls = Param().par.earlyTpcTransform ? nullptr : mConstantMem->ioPtrs.clustersNative->clustersLinear; for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { - if (mOutputTracks[i].CSide() == 0 && mTrackLinks[i] >= 0) { + if (mMergedTracks[i].CSide() == 0 && mTrackLinks[i] >= 0) { if (mTrackLinks[mTrackLinks[i]] != (int32_t)i) { continue; } - GPUTPCGMMergedTrack* trk[2] = {&mOutputTracks[i], &mOutputTracks[mTrackLinks[i]]}; + GPUTPCGMMergedTrack* trk[2] = {&mMergedTracks[i], &mMergedTracks[mTrackLinks[i]]}; if (!trk[1]->OK() || trk[1]->CCE()) { continue; @@ -1459,7 +1459,7 @@ GPUd() void GPUTPCGMMerger::MergeCE(int32_t nBlocks, int32_t nThreads, int32_t i } } - // for (int32_t i = 0;i < mMemory->nMergedTracks;i++) {if (mOutputTracks[i].CCE() == false) {mOutputTracks[i].SetNClusters(0);mOutputTracks[i].SetOK(false);}} //Remove all non-CE tracks + // for (int32_t i = 0;i < mMemory->nMergedTracks;i++) {if (mMergedTracks[i].CCE() == false) {mMergedTracks[i].SetNClusters(0);mMergedTracks[i].SetOK(false);}} //Remove all non-CE tracks } namespace o2::gpu::internal @@ -1752,7 +1752,7 @@ GPUd() void GPUTPCGMMerger::CollectMergedTracks(int32_t nBlocks, int32_t nThread continue; } - GPUTPCGMMergedTrack& mergedTrack = mOutputTracks[iOutputTrack]; + GPUTPCGMMergedTrack& mergedTrack = mMergedTracks[iOutputTrack]; mergedTrack.SetFlags(0); mergedTrack.SetOK(1); @@ -1825,7 +1825,7 @@ GPUd() void GPUTPCGMMerger::SortTracks(int32_t nBlocks, int32_t nThreads, int32_ { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS if (iThread == 0 && iBlock == 0) { - GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, GPUTPCGMMergerSortTracks_comp(mOutputTracks)); + GPUCommonAlgorithm::sortDeviceDynamic(mTrackOrderProcess, mTrackOrderProcess + mMemory->nMergedTracks, GPUTPCGMMergerSortTracks_comp(mMergedTracks)); } #endif } @@ -1834,7 +1834,7 @@ GPUd() void GPUTPCGMMerger::SortTracksQPt(int32_t nBlocks, int32_t nThreads, int { #ifndef GPUCA_SPECIALIZE_THRUST_SORTS if (iThread == 0 && iBlock == 0) { - GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, GPUTPCGMMergerSortTracksQPt_comp(mOutputTracks)); + GPUCommonAlgorithm::sortDeviceDynamic(mTrackSort, mTrackSort + mMemory->nMergedTracks, GPUTPCGMMergerSortTracksQPt_comp(mMergedTracks)); } #endif } @@ -1843,7 +1843,7 @@ GPUd() void GPUTPCGMMerger::PrepareClustersForFit1(int32_t nBlocks, int32_t nThr { for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nBlocks * nThreads) { mTrackOrderAttach[mTrackSort[i]] = i; - const GPUTPCGMMergedTrack& trk = mOutputTracks[i]; + const GPUTPCGMMergedTrack& trk = mMergedTracks[i]; if (trk.OK()) { for (uint32_t j = 0; j < trk.NClusters(); j++) { mClusterAttachment[mClusters[trk.FirstClusterRef() + j].num] = attachAttached | attachGood; @@ -1884,7 +1884,7 @@ GPUd() void GPUTPCGMMerger::Finalize0(int32_t nBlocks, int32_t nThreads, int32_t GPUd() void GPUTPCGMMerger::Finalize1(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread) { for (uint32_t i = iBlock * nThreads + iThread; i < mMemory->nMergedTracks; i += nThreads * nBlocks) { - const GPUTPCGMMergedTrack& trk = mOutputTracks[i]; + const GPUTPCGMMergedTrack& trk = mMergedTracks[i]; if (!trk.OK() || trk.NClusters() == 0) { continue; } @@ -1919,7 +1919,7 @@ GPUd() void GPUTPCGMMerger::MergeLoopersInit(int32_t nBlocks, int32_t nThreads, { const float lowPtThresh = Param().rec.tpc.rejectQPtB5 * 1.1f; // Might need to merge tracks above the threshold with parts below the threshold for (uint32_t i = get_global_id(0); i < mMemory->nMergedTracks; i += get_global_size(0)) { - const auto& trk = mOutputTracks[i]; + const auto& trk = mMergedTracks[i]; const auto& p = trk.GetParam(); const float qptabs = CAMath::Abs(p.GetQPt()); if (trk.NClusters() && qptabs * Param().qptB5Scaler > 5.f && qptabs * Param().qptB5Scaler <= lowPtThresh) { @@ -1983,7 +1983,7 @@ GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, #if GPUCA_MERGE_LOOPER_MC && !defined(GPUCA_GPUCODE) std::vector paramLabels(mMemory->nLooperMatchCandidates); for (uint32_t i = 0; i < mMemory->nLooperMatchCandidates; i++) { - paramLabels[i] = GetTrackLabel(mOutputTracks[params[i].id]); + paramLabels[i] = GetTrackLabel(mMergedTracks[params[i].id]); } /*std::vector dropped(mMemory->nLooperMatchCandidates); std::vector droppedMC(mMemory->nLooperMatchCandidates); @@ -2005,8 +2005,8 @@ GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, // bs |= 1; continue; } - const auto& trk1 = mOutputTracks[params[i].id]; - const auto& trk2 = mOutputTracks[params[j].id]; + const auto& trk1 = mMergedTracks[params[i].id]; + const auto& trk2 = mMergedTracks[params[j].id]; const auto& param1 = trk1.GetParam(); const auto& param2 = trk2.GetParam(); if (CAMath::Abs(param1.GetDzDs()) > 0.03f && CAMath::Abs(param2.GetDzDs()) > 0.03f && param1.GetDzDs() * param2.GetDzDs() * param1.GetQPt() * param2.GetQPt() < 0) { @@ -2045,7 +2045,7 @@ GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, const int64_t label2 = paramLabels[j]; bool labelEQ = label1 != -1 && label1 == label2; if (1 || EQ || labelEQ) { - // printf("Matching track %d/%d %u-%u (%ld/%ld): dist %f side %d %d, tgl %f %f, qpt %f %f, x %f %f, y %f %f\n", (int32_t)EQ, (int32_t)labelEQ, i, j, label1, label2, d, (int32_t)mOutputTracks[params[i].id].CSide(), (int32_t)mOutputTracks[params[j].id].CSide(), params[i].tgl, params[j].tgl, params[i].qpt, params[j].qpt, params[i].x, params[j].x, params[i].y, params[j].y); + // printf("Matching track %d/%d %u-%u (%ld/%ld): dist %f side %d %d, tgl %f %f, qpt %f %f, x %f %f, y %f %f\n", (int32_t)EQ, (int32_t)labelEQ, i, j, label1, label2, d, (int32_t)mMergedTracks[params[i].id].CSide(), (int32_t)mMergedTracks[params[j].id].CSide(), params[i].tgl, params[j].tgl, params[i].qpt, params[j].qpt, params[i].x, params[j].x, params[i].y, params[j].y); static auto& tup = GPUROOTDump::get("mergeloopers", "labeleq:sides:d2xy:tgl1:tgl2:qpt1:qpt2:dz:dzcorr:dtgl:dqpt:dznorm:bs"); tup.Fill((float)labelEQ, (trk1.CSide() ? 1 : 0) | (trk2.CSide() ? 2 : 0), d2xy, param1.GetDzDs(), param2.GetDzDs(), param1.GetQPt(), param2.GetQPt(), CAMath::Abs(params[j].refz) - CAMath::Abs(params[i].refz), dzcorr, dtgl, dqpt, dznorm, bs); static auto tup2 = GPUROOTDump::getNew("mergeloopers2", "labeleq:refz1:refz2:tgl1:tgl2:qpt1:qpt2:snp1:snp2:a1:a2:dzn:phasecor:phasedir:dzcorr"); @@ -2063,9 +2063,9 @@ GPUd() void GPUTPCGMMerger::MergeLoopersMain(int32_t nBlocks, int32_t nThreads, }*/ #endif if (EQ) { - mOutputTracks[params[j].id].SetMergedLooper(true); + mMergedTracks[params[j].id].SetMergedLooper(true); if (CAMath::Abs(param2.GetQPt() * Param().qptB5Scaler) >= Param().rec.tpc.rejectQPtB5) { - mOutputTracks[params[i].id].SetMergedLooper(true); + mMergedTracks[params[i].id].SetMergedLooper(true); } } } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h index ae85f20b17b48..4487b6d937dc2 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMerger.h +++ b/GPU/GPUTracking/Merger/GPUTPCGMMerger.h @@ -104,12 +104,12 @@ class GPUTPCGMMerger : public GPUProcessor void* SetPointersMemory(void* mem); GPUhdi() int32_t NMergedTracks() const { return mMemory->nMergedTracks; } - GPUhdi() const GPUTPCGMMergedTrack* OutputTracks() const { return mOutputTracks; } - GPUhdi() GPUTPCGMMergedTrack* OutputTracks() { return mOutputTracks; } - GPUhdi() const GPUdEdxInfo* OutputTracksdEdx() const { return mOutputTracksdEdx; } - GPUhdi() GPUdEdxInfo* OutputTracksdEdx() { return mOutputTracksdEdx; } - GPUhdi() const GPUdEdxInfo* OutputTracksdEdxAlt() const { return mOutputTracksdEdxAlt; } - GPUhdi() GPUdEdxInfo* OutputTracksdEdxAlt() { return mOutputTracksdEdxAlt; } + GPUhdi() const GPUTPCGMMergedTrack* MergedTracks() const { return mMergedTracks; } + GPUhdi() GPUTPCGMMergedTrack* MergedTracks() { return mMergedTracks; } + GPUhdi() const GPUdEdxInfo* MergedTracksdEdx() const { return mMergedTracksdEdx; } + GPUhdi() GPUdEdxInfo* MergedTracksdEdx() { return mMergedTracksdEdx; } + GPUhdi() const GPUdEdxInfo* MergedTracksdEdxAlt() const { return mMergedTracksdEdxAlt; } + GPUhdi() GPUdEdxInfo* MergedTracksdEdxAlt() { return mMergedTracksdEdxAlt; } GPUhdi() uint32_t NClusters() const { return mNClusters; } GPUhdi() uint32_t NMaxClusters() const { return mNMaxClusters; } GPUhdi() uint32_t NMaxTracks() const { return mNMaxTracks; } @@ -262,9 +262,9 @@ class GPUTPCGMMerger : public GPUProcessor uint16_t mMemoryResOutputO2Scratch = (uint16_t)-1; int32_t mNClusters = 0; // Total number of incoming clusters (from sector tracks) - GPUTPCGMMergedTrack* mOutputTracks = nullptr; //* array of output merged tracks - GPUdEdxInfo* mOutputTracksdEdx = nullptr; //* dEdx information - GPUdEdxInfo* mOutputTracksdEdxAlt = nullptr; //* dEdx alternative information + GPUTPCGMMergedTrack* mMergedTracks = nullptr; //* array of output merged tracks + GPUdEdxInfo* mMergedTracksdEdx = nullptr; //* dEdx information + GPUdEdxInfo* mMergedTracksdEdxAlt = nullptr; //* dEdx alternative information GPUTPCGMSectorTrack* mSectorTrackInfos = nullptr; //* additional information for sector tracks int32_t* mSectorTrackInfoIndex = nullptr; GPUTPCGMMergedTrackHit* mClusters = nullptr; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx index 3be32a2d87610..9c924e74ec519 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerDump.cxx @@ -139,7 +139,7 @@ void GPUTPCGMMerger::DumpCollected(std::ostream& out) const out << std::setprecision(2); out << "\nTPC Merger Collected Tracks\n"; for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { - const auto& trk = mOutputTracks[i]; + const auto& trk = mMergedTracks[i]; const auto& p = trk.GetParam(); out << " Track " << i << ": Loop " << trk.Looper() << " Alpha " << trk.GetAlpha() << " X " << p.GetX() << " offset " << p.GetTZOffset() << " Y " << p.GetY() << " Z " << p.GetZ() << " SPhi " << p.GetSinPhi() << " Tgl " << p.GetDzDs() << " QPt " << p.GetQPt() << " NCl " << trk.NClusters() << "\n"; } @@ -151,7 +151,7 @@ void GPUTPCGMMerger::DumpMergeCE(std::ostream& out) const DumpTrackLinks(out, true, " for CE merging"); out << "\nTPC Merger Merge CE\n"; for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { - const auto& trk = mOutputTracks[i]; + const auto& trk = mMergedTracks[i]; if (trk.CCE()) { out << " Track " << i << ": CCE\n"; } @@ -167,7 +167,7 @@ void GPUTPCGMMerger::DumpFitPrepare(std::ostream& out) const } out << " Clusters\n"; for (uint32_t j = 0; j < mMemory->nMergedTracks; j++) { - const auto& trk = mOutputTracks[j]; + const auto& trk = mMergedTracks[j]; out << " Track " << j << ": "; for (uint32_t i = trk.FirstClusterRef(); i < trk.FirstClusterRef() + trk.NClusters(); i++) { out << j << "/" << (i - trk.FirstClusterRef()) << ": " << mClusters[i].num << "/" << (int32_t)mClusters[i].state << ", "; @@ -196,14 +196,14 @@ void GPUTPCGMMerger::DumpRefit(std::ostream& out) const out << std::setprecision(2); out << "\nTPC Merger Refit\n"; for (uint32_t i = 0; i < mMemory->nMergedTracks; i++) { - const auto& trk = mOutputTracks[i]; + const auto& trk = mMergedTracks[i]; if (trk.NClusters() == 0) { continue; } const auto& p = trk.GetParam(); const auto& po = trk.OuterParam(); out << " Track " << i << ": OK " << trk.OK() << " Alpha " << trk.GetAlpha() << " X " << p.GetX() << " offset " << p.GetTZOffset() << " Y " << p.GetY() << " Z " << p.GetZ() << " SPhi " << p.GetSinPhi() << " Tgl " << p.GetDzDs() << " QPt " << p.GetQPt() << " NCl " << trk.NClusters() << " / " << trk.NClustersFitted() << " Cov " << p.GetErr2Y() << "/" << p.GetErr2Z() - << " dEdx " << (trk.OK() && Param().dodEdxEnabled ? mOutputTracksdEdx[i].dEdxTotTPC : -1.f) << "/" << (trk.OK() && Param().dodEdxEnabled ? mOutputTracksdEdx[i].dEdxMaxTPC : -1.f) + << " dEdx " << (trk.OK() && Param().dodEdxEnabled ? mMergedTracksdEdx[i].dEdxTotTPC : -1.f) << "/" << (trk.OK() && Param().dodEdxEnabled ? mMergedTracksdEdx[i].dEdxMaxTPC : -1.f) << " Outer " << po.P[0] << "/" << po.P[1] << "/" << po.P[2] << "/" << po.P[3] << "/" << po.P[4] << " NFitted " << trk.NClustersFitted() << " legs " << (int)trk.Legs() << " flags " << (int)trk.Flags() << "\n"; } @@ -217,7 +217,7 @@ void GPUTPCGMMerger::DumpLoopers(std::ostream& out) const if (i && i % 100 == 0) { out << "\n"; } - out << (int)mOutputTracks[i].MergedLooper() << " "; + out << (int)mMergedTracks[i].MergedLooper() << " "; } out << "\n"; } @@ -226,7 +226,7 @@ void GPUTPCGMMerger::DumpFinal(std::ostream& out) const { out << "\nTPC Merger Finalized\n"; for (uint32_t j = 0; j < mMemory->nMergedTracks; j++) { - const auto& trk = mOutputTracks[j]; + const auto& trk = mMergedTracks[j]; if (trk.NClusters() == 0) { continue; } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx index 68763b3549547..1631777d80482 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx @@ -24,7 +24,7 @@ GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int32_t nBlocks, int32_t nThread const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NMergedTracks(); GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), int32_t, ii, iEnd, { const int32_t i = mode == -1 ? merger.RetryRefitIds()[ii] : mode ? merger.TrackOrderProcess()[ii] : ii; - GPUTPCGMTrackParam::RefitTrack(merger.OutputTracks()[i], i, &merger, mode == -1); + GPUTPCGMTrackParam::RefitTrack(merger.MergedTracks()[i], i, &merger, mode == -1); }); } diff --git a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx index 1e08058fb22dd..eb22ca49e9242 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMO2Output.cxx @@ -49,10 +49,10 @@ struct GPUTPCGMO2OutputSort_comp { template <> GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { - const GPUTPCGMMergedTrack* tracks = merger.OutputTracks(); + const GPUTPCGMMergedTrack* tracks = merger.MergedTracks(); const uint32_t nTracks = merger.NMergedTracks(); const GPUTPCGMMergedTrackHit* trackClusters = merger.Clusters(); - const GPUdEdxInfo* tracksdEdx = merger.OutputTracksdEdx(); + const GPUdEdxInfo* tracksdEdx = merger.MergedTracksdEdx(); constexpr uint8_t flagsReject = getFlagsReject(); const uint32_t flagsRequired = getFlagsRequired(merger.Param().rec); @@ -107,9 +107,9 @@ template <> GPUdii() void GPUTPCGMO2Output::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { constexpr float MinDelta = 0.1f; - const GPUTPCGMMergedTrack* tracks = merger.OutputTracks(); - GPUdEdxInfo* tracksdEdx = merger.OutputTracksdEdx(); - GPUdEdxInfo* tracksdEdxAlt = merger.OutputTracksdEdxAlt(); + const GPUTPCGMMergedTrack* tracks = merger.MergedTracks(); + GPUdEdxInfo* tracksdEdx = merger.MergedTracksdEdx(); + GPUdEdxInfo* tracksdEdxAlt = merger.MergedTracksdEdxAlt(); const int32_t nTracks = merger.NOutputTracksTPCO2(); const GPUTPCGMMergedTrackHit* trackClusters = merger.Clusters(); constexpr uint8_t flagsReject = getFlagsReject(); diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx index 366f75cb05e56..4b616fce83f5f 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx @@ -94,10 +94,10 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ storeOuter = 0; if (iWay == nWays - 1) { StoreOuter(outerParam, prop, 0); - if (merger->OutputTracks()[iTrk].Looper()) { + if (merger->MergedTracks()[iTrk].Looper()) { storeOuter = 1; } - } else if (iWay == nWays - 2 && merger->OutputTracks()[iTrk].Looper()) { + } else if (iWay == nWays - 2 && merger->MergedTracks()[iTrk].Looper()) { storeOuter = 2; } } @@ -435,9 +435,9 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int32_ // TODO: we have looping tracks here with 0 accepted clusters in the primary leg. In that case we should refit the track using only the primary leg. if (param.par.dodEdx && param.dodEdxEnabled) { - dEdx.computedEdx(merger->OutputTracksdEdx()[iTrk], param); + dEdx.computedEdx(merger->MergedTracksdEdx()[iTrk], param); if GPUCA_RTC_CONSTEXPR (param.rec.tpc.dEdxClusterRejectionFlagMask != param.rec.tpc.dEdxClusterRejectionFlagMaskAlt) { - dEdxAlt.computedEdx(merger->OutputTracksdEdxAlt()[iTrk], param); + dEdxAlt.computedEdx(merger->MergedTracksdEdxAlt()[iTrk], param); } } Alpha = prop.GetAlpha(); @@ -596,7 +596,7 @@ GPUd() float GPUTPCGMTrackParam::AttachClusters(const GPUTPCGMMerger* GPUrestric return -1e6f; } - const float zOffset = Merger->Param().par.earlyTpcTransform ? ((Merger->OutputTracks()[iTrack].CSide() ^ (sector >= 18)) ? -mTZOffset : mTZOffset) : Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(sector, mTZOffset, Merger->Param().continuousMaxTimeBin); + const float zOffset = Merger->Param().par.earlyTpcTransform ? ((Merger->MergedTracks()[iTrack].CSide() ^ (sector >= 18)) ? -mTZOffset : mTZOffset) : Merger->GetConstantMem()->calibObjects.fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(sector, mTZOffset, Merger->Param().continuousMaxTimeBin); const float y0 = row.Grid().YMin(); const float stepY = row.HstepY(); const float z0 = row.Grid().ZMin() - zOffset; // We can use our own ZOffset, since this is only used temporarily anyway @@ -1136,7 +1136,7 @@ GPUd() void GPUTPCGMTrackParam::RefitTrack(GPUTPCGMMergedTrack& GPUrestrict() tr t.QPt() = 1.e-4f; } - CADEBUG(if (t.GetX() > 250) { printf("ERROR, Track %d at impossible X %f, Pt %f, Looper %d\n", iTrk, t.GetX(), CAMath::Abs(1.f / t.QPt()), (int32_t)merger->OutputTracks()[iTrk].Looper()); }); + CADEBUG(if (t.GetX() > 250) { printf("ERROR, Track %d at impossible X %f, Pt %f, Looper %d\n", iTrk, t.GetX(), CAMath::Abs(1.f / t.QPt()), (int32_t)merger->MergedTracks()[iTrk].Looper()); }); track.SetOK(ok); track.SetNClustersFitted(nTrackHits); diff --git a/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx b/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx index e63bb82a9b09e..5af3ebb51b9d6 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGlobalDebugSortKernels.cxx @@ -105,8 +105,8 @@ GPUdii() void GPUTPCGlobalDebugSortKernels::Thread= 0) { int32_t firstIdx = j; - auto firstItem = merger.OutputTracks()[firstIdx]; + auto firstItem = merger.MergedTracks()[firstIdx]; int32_t currIdx = firstIdx; int32_t sourceIdx = tmp[currIdx]; tmp2[sourceIdx] = currIdx; do { tmp[currIdx] = -1; - merger.OutputTracks()[currIdx] = merger.OutputTracks()[sourceIdx]; + merger.MergedTracks()[currIdx] = merger.MergedTracks()[sourceIdx]; currIdx = sourceIdx; sourceIdx = tmp[currIdx]; tmp2[sourceIdx] = currIdx; } while (sourceIdx != firstIdx); tmp[currIdx] = -1; - merger.OutputTracks()[currIdx] = firstItem; + merger.MergedTracks()[currIdx] = firstItem; } } }