From 4555ac431347a05d7f92906a005288a3a6d76595 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 3 Dec 2025 21:06:04 +0100 Subject: [PATCH 1/6] GPU QA: Add pad row vs occupancy histogram --- GPU/GPUTracking/qa/GPUQA.cxx | 68 +++++++++++++++++++++++------------- GPU/GPUTracking/qa/GPUQA.h | 6 ++-- 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 689dc20cb1606..28b603f77e2ff 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -152,6 +152,7 @@ static constexpr float PT_MIN_CLUST = 0.01; static constexpr float PT_MAX = 20; static constexpr float ETA_MAX = 1.5; static constexpr float ETA_MAX2 = 0.9; +static constexpr int32_t PADROW_CHECK_MINCLS = 50; static constexpr bool CLUST_HIST_INT_SUM = false; @@ -525,9 +526,10 @@ int32_t GPUQA::InitQACreateHistograms() createHist(mClusters[i], name, name, AXIS_BINS[4], binsPt.get()); } - createHist(mPadRow[0], "padrow0", "padrow0", GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); - createHist(mPadRow[1], "padrow1", "padrow1", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); - createHist(mPadRow[2], "padrow2", "padrow2", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); + createHist(mPadRow[0], "padrow0", "padrow0", GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[1], "padrow1", "padrow1", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[2], "padrow2", "padrow2", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[3], "padrow3", "padrow3", 100.f, 0, 300000, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); } if (mQATasks & taskTrackStatistics) { @@ -968,7 +970,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx nClusters++; uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; if (hitId >= GetNMCLabels()) { - GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); + GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), clNative ?
clNative->nClustersTotal : 0); throw std::runtime_error("qa error"); } acc.addLabel(hitId); @@ -1069,7 +1071,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } } - if ((mQATasks & taskClusterAttach)) { + if ((mQATasks & taskClusterAttach) && !tracksExternal) { std::vector lowestPadRow(mTracking->mIOPtrs.nMergedTracks); // fill cluster adjacent status if (mTracking->mIOPtrs.mergedTrackHitAttachment) { @@ -1096,12 +1098,12 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } } - if (mTracking->mIOPtrs.nMergedTracks && mTracking->mIOPtrs.clustersNative) { + if (mTracking->mIOPtrs.nMergedTracks && clNative) { std::fill(lowestPadRow.begin(), lowestPadRow.end(), 255); for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { - for (uint32_t iCl = 0; iCl < mTracking->mIOPtrs.clustersNative->nClusters[iSector][iRow]; iCl++) { - int32_t i = mTracking->mIOPtrs.clustersNative->clusterOffset[iSector][iRow] + iCl; + for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { + int32_t i = clNative->clusterOffset[iSector][iRow] + iCl; for (int32_t j = 0; j < GetMCLabelNID(i); j++) { uint32_t trackId = GetMCTrackObj(mTrackMCLabelsReverse, GetMCLabel(i, j)); if (trackId < lowestPadRow.size() && lowestPadRow[trackId] > iRow) { @@ -1113,12 +1115,21 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } for (uint32_t i = 0; i < mTracking->mIOPtrs.nMergedTracks; i++) { const auto& trk = mTracking->mIOPtrs.mergedTracks[i]; - if (trk.OK() && lowestPadRow[i] != 255 && trk.NClustersFitted() > 70 && CAMath::Abs(trk.GetParam().GetQPt()) < 0.5) { - int32_t lowestRow = CAMath::Min(mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()].row, mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1].row); + if (trk.OK() && lowestPadRow[i] != 255 && trk.NClustersFitted() >= PADROW_CHECK_MINCLS && CAMath::Abs(trk.GetParam().GetQPt()) < 1.0) { + const auto& lowestCl = mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()].row < mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1].row ? 
mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()] : mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1]; + const int32_t lowestRow = lowestCl.row; mPadRow[0]->Fill(lowestPadRow[i], lowestRow, 1.f); mPadRow[1]->Fill(CAMath::ATan2(trk.GetParam().GetY(), trk.GetParam().GetX()), lowestRow, 1.f); - if (lowestPadRow[i] == 0 && lowestRow != 0) { - mPadRow[2]->Fill(CAMath::ATan2(trk.GetParam().GetY(), trk.GetParam().GetX()), lowestRow, 1.f); + if (lowestPadRow[i] < 10 && lowestRow > lowestPadRow[i] + 3) { + const auto& cl = clNative->clustersLinear[lowestCl.num]; + float x, y, z; + mTracking->GetTPCTransformHelper()->Transform(lowestCl.sector, lowestCl.row, cl.getPad(), cl.getTime(), x, y, z, trk.GetParam().GetTOffset()); + float phi = CAMath::ATan2(y, x); + mPadRow[2]->Fill(phi, lowestRow, 1.f); + if (CAMath::Abs(phi) < 0.15) { + const float time = cl.getTime(); + mPadRow[3]->Fill(mTracking->GetParam().GetUnscaledMult(time), lowestRow, 1.f); + } } } } @@ -1485,7 +1496,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } - if (mQATasks & taskClusterAttach) { + if ((mQATasks & taskClusterAttach) && !tracksExternal) { // Fill cluster histograms for (uint32_t iTrk = 0; iTrk < nReconstructedTracks; iTrk++) { const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[iTrk]; @@ -1715,7 +1726,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx GPUWarning("No MC information available, only running partial TPC QA!"); } // mcAvail - if (mQATasks & taskTrackStatistics) { + if ((mQATasks & taskTrackStatistics) && !tracksExternal) { // Fill track statistic histograms std::vector> clusterAttachCounts; if (mcAvail) { @@ -1815,8 +1826,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx if (mQATasks & taskClusterCounts) { for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { - for (uint32_t iCl = 0; iCl < mTracking->mIOPtrs.clustersNative->nClusters[iSector][iRow]; iCl++) { - uint32_t i = mTracking->mIOPtrs.clustersNative->clusterOffset[iSector][iRow] + iCl; + for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { + uint32_t i = clNative->clusterOffset[iSector][iRow] + iCl; int32_t attach = mTracking->mIOPtrs.mergedTrackHitAttachment[i]; const auto& r = checkClusterState(attach, &mClusterCounts); @@ -1873,8 +1884,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx if (r.unattached) { mClusterCounts.nUnattached++; } - if (mTracking && mTracking->mIOPtrs.clustersNative) { - const auto& cl = mTracking->mIOPtrs.clustersNative->clustersLinear[i]; + if (mTracking && clNative) { + const auto& cl = clNative->clustersLinear[i]; mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); if (!r.unattached && !r.protect) { mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); @@ -1895,7 +1906,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx GPUInfo("QA Time: Cluster Counts:\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); } - if (mConfig.dumpToROOT) { + if (mConfig.dumpToROOT && !tracksExternal) { if (!clNative || !mTracking || !mTracking->mIOPtrs.mergedTrackHitAttachment || !mTracking->mIOPtrs.mergedTracks) { throw std::runtime_error("Cannot dump non o2::tpc::clusterNative clusters, need also hit attachmend and GPU tracks"); } @@ -2273,7 +2284,7 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mPClRejP = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); 
mPClRejP->Draw(); - for (int32_t i = 0; i < 3; i++) { + for (int32_t i = 0; i < 4; i++) { snprintf(name, 2048, "cpadrow%d", i); mCPadRow[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. / 3.); mCPadRow[i]->cd(); @@ -2842,19 +2853,28 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) } } - for (int32_t i = 0; i < 3; i++) { + for (int32_t i = 0; i < 4; i++) { auto* e = mPadRow[i]; if (tout && !mConfig.inputHistogramsOnly) { e->Write(); } mPPadRow[i]->cd(); e->SetOption("colz"); - e->SetTitle(i == 2 ? "First Track Pad Row (row_{MC} = 0, row_{trk} > 0)" : "First Track Pad Row"); - e->GetXaxis()->SetTitle(i ? "#Phi (sector)" : "First MC Pad Row"); + std::string title = "First Track Pad Row (p_{T} > 1GeV, N_{Cl} #geq " + std::to_string(PADROW_CHECK_MINCLS); + if (i >= 2) { + title += ", row_{trk} > row_{MC} + 3, row_{MC} < 10"; + } + if (i >= 3) { + title += ", #Phi_{Cl} < 0.15"; + } + title += ")"; + + e->SetTitle(title.c_str()); + e->GetXaxis()->SetTitle(i == 3 ? "Local Occupancy" : (i ? "#Phi_{Cl} (sector)" : "First MC Pad Row")); e->GetYaxis()->SetTitle("First Pad Row"); e->Draw(); mCPadRow[i]->cd(); - static const constexpr char* PADROW_NAMES[3] = {"MC", "Phi", "Phi1"}; + static const constexpr char* PADROW_NAMES[4] = {"MC", "Phi", "Phi1", "Occ"}; mCPadRow[i]->Print(Form("%s/padRow%s.pdf", mConfig.plotsDir.c_str(), PADROW_NAMES[i])); if (mConfig.writeFileExt != "") { mCPadRow[i]->Print(Form("%s/padRow%s.%s", mConfig.plotsDir.c_str(), PADROW_NAMES[i], mConfig.writeFileExt.c_str())); diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index 54d1ceed9d365..7303ed62a9562 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -323,9 +323,9 @@ class GPUQA TPad* mPClRej[3]; TPad* mPClRejP; - TH2F* mPadRow[3]; - TCanvas* mCPadRow[3]; - TPad* mPPadRow[3]; + TH2F* mPadRow[4]; + TCanvas* mCPadRow[4]; + TPad* mPPadRow[4]; std::vector mHistClusterCount; From 6804ed289026bb0534ddf7d48068d6ea7f685fc4 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 4 Dec 2025 12:55:58 +0100 Subject: [PATCH 2/6] GPU QA Standalone: By default write histograms to output root file in plots folder --- GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index ca26f26d32612..857803d913372 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -197,6 +197,9 @@ int32_t ReadConfiguration(int argc, char** argv) printf("Can only produce QA pdf output when input files are specified!\n"); return 1; } + if (configStandalone.QA.enableLocalOutput && !configStandalone.QA.inputHistogramsOnly && configStandalone.QA.output == "" && configStandalone.QA.plotsDir != "") { + configStandalone.QA.output = configStandalone.QA.plotsDir + "/output.root"; + } if (configStandalone.QA.inputHistogramsOnly) { configStandalone.rundEdx = false; configStandalone.noEvents = true; From 329f043432b1eb83a3fcbaeca03acdc1599e0354 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 3 Dec 2025 21:13:34 +0100 Subject: [PATCH 3/6] GPU: Remove non-working MI100 serialization workaround and obsolete StuckProtection --- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 -- .../Base/opencl/GPUReconstructionOCL.cxx | 21 ------------------- .../Base/opencl/GPUReconstructionOCL.h | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 -- GPU/GPUTracking/Global/GPUChain.h | 2 -- 
.../Global/GPUChainTrackingSectorTracker.cxx | 3 --- prodtests/full-system-test/dpl-workflow.sh | 2 -- 7 files changed, 33 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index a78a482db4e7a..d621d45fcd92b 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -88,8 +88,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface int32_t ExitDevice() override; int32_t GetThread(); - virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; } - // Pointers to tracker classes GPUProcessorProcessors mProcShadow; // Host copy of tracker objects that will be used on the GPU GPUConstantMem*& mProcessorsShadow = mProcShadow.mProcessorsProc; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index 271fe494860cd..6954cfb3d6211 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -470,27 +470,6 @@ void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEve void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } -int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event) -{ - if (GetProcessingSettings().stuckProtection) { - cl_int tmp = 0; - for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection / 50; i++) { - usleep(50); - clGetEventInfo(event.get(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr); - if (tmp == CL_COMPLETE) { - break; - } - } - if (tmp != CL_COMPLETE) { - mGPUStuck = 1; - GPUErrorReturn("GPU Stuck, future processing in this component is disabled, skipping event (GPU Event State %d)", (int32_t)tmp); - } - } else { - clFinish(mInternals->command_queue[stream]); - } - return 0; -} - void GPUReconstructionOCL::SynchronizeGPU() { for (int32_t i = 0; i < mNStreams; i++) { diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 958d5186bf41a..a52db1f2a737a 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -43,7 +43,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; void SynchronizeGPU() override; - int32_t DoStuckProtection(int32_t stream, deviceEvent event) override; int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override; void SynchronizeStream(int32_t stream) override; void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) override; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5a075bf7f9a02..d70fac115eab7 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -301,7 +301,6 @@ BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Proces AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)") AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. 
no CPU for OpenCL)") AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances") -AddOption(stuckProtection, int32_t, 0, "", 0, "Timeout in us, When AMD GPU is stuck, just continue processing and skip tracking, do not crash or stall the chain") AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidates for single input track during propagation") AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking") AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))") @@ -383,7 +382,6 @@ AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run th AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder") AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB") AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump") -AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug") AddOption(memoryStat, bool, false, "", 0, "Print memory statistics") AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 9ce3da1092e83..6831fbd15080a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -224,8 +224,6 @@ class GPUChain inline GPUChain* GetNextChainInQueue() { return mRec->GetNextChainInQueue(); } - virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; } - template bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 122eb709b4356..e2d68f10819fb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -149,9 +149,6 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() GPUTPCTracker& trk = processors()->tpcTrackers[iSector]; GPUTPCTracker& trkShadow = doGPU ? 
processorsShadow()->tpcTrackers[iSector] : trk; int32_t useStream = StreamForSector(iSector); - if (GetProcessingSettings().amdMI100SerializationWorkaround) { - SynchronizeStream(useStream); // TODO: Remove this workaround once fixed on MI100 - } if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Creating Sector Data (Sector %d)", iSector); diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index ce5607d361cbe..754349c87eecc 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -284,8 +284,6 @@ if [[ $GPUTYPE == "HIP" ]]; then if [[ ${EPN_NODE_MI100:-} == "1" && ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]]; then if [[ -n ${OPTIMIZED_PARALLEL_ASYNC:-} ]] || [[ $EPNSYNCMODE == 1 && ${FULL_MI100_SERIALIZATION:-0} == 1 ]]; then GPU_CONFIG_KEY+="GPU_proc.serializeGPU=3;" - elif [[ $EPNSYNCMODE == 1 ]]; then - GPU_CONFIG_KEY+="GPU_proc.amdMI100SerializationWorkaround=1;" fi fi #export HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 From 64845251e1c06bb76c308f935812572bf24a049d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 8 Dec 2025 09:32:03 +0100 Subject: [PATCH 4/6] GPU QA: Also dump text output to the output folder --- GPU/GPUTracking/qa/GPUQA.cxx | 11 ++++++++++- GPU/GPUTracking/qa/GPUQA.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 28b603f77e2ff..3c176031dec08 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -3141,7 +3141,9 @@ void GPUQA::PrintClusterCount(int32_t mode, int32_t& num, const char* name, uint createHist(mHistClusterCount[num], name2, name, 1000, 0, mConfig.histMaxNClusters, 1000, 0, 100); } else if (mode == 0) { if (normalization && mConfig.enableLocalOutput) { - printf("\t%40s: %'12" PRIu64 " (%6.2f%%)\n", name, n, 100.f * n / normalization); + for (uint32_t i = 0; i < 1 + (mTextDump != nullptr); i++) { + fprintf(i ? mTextDump : stdout, "\t%40s: %'12" PRIu64 " (%6.2f%%)\n", name, n, 100.f * n / normalization); + } } if (mConfig.clusterRejectionHistograms) { float ratio = 100.f * n / std::max(normalization, 1); @@ -3153,6 +3155,9 @@ int32_t GPUQA::DoClusterCounts(uint64_t* attachClusterCounts, int32_t mode) { + if (mConfig.enableLocalOutput && !mConfig.inputHistogramsOnly && mConfig.plotsDir != "") { + mTextDump = fopen((mConfig.plotsDir + "/clusterCounts.txt").c_str(), "w+"); + } int32_t num = 0; if (mcPresent() && (mQATasks & taskClusterAttach) && attachClusterCounts) { for (int32_t i = 0; i < N_CLS_HIST; i++) { // TODO: Check that these counts are still printed correctly!
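// Aside on the change above (illustrative, not part of the patch): the new loop
// in PrintClusterCount() tees each counter line to stdout and, when the dump file
// is open, to clusterCounts.txt as well; the bound 1 + (mTextDump != nullptr)
// yields one or two iterations. A minimal standalone sketch of the same pattern,
// with a hypothetical helper name and a plain %llu in place of the %' PRIu64 format:
#include <cstdio>

static void teePrint(FILE* dump, const char* name, unsigned long long n, double percent)
{
  // Runs once when dump == nullptr, twice otherwise; iteration 0 writes to stdout.
  for (int i = 0; i < 1 + (dump != nullptr); i++) {
    fprintf(i ? dump : stdout, "\t%40s: %12llu (%6.2f%%)\n", name, n, percent);
  }
}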
@@ -3191,6 +3196,10 @@ int32_t GPUQA::DoClusterCounts(uint64_t* attachClusterCounts, int32_t mode) PrintClusterCount(mode, num, "Correctly Attached all-trk normalized", mClusterCounts.nCorrectlyAttachedNormalized, mClusterCounts.nTotal); PrintClusterCount(mode, num, "Correctly Attached non-fake normalized", mClusterCounts.nCorrectlyAttachedNormalizedNonFake, mClusterCounts.nTotal); } + if (mTextDump) { + fclose(mTextDump); + mTextDump = nullptr; + } return num; } diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index 7303ed62a9562..b42fa804c6212 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -62,6 +62,7 @@ class GPUQA #else #include "GPUTPCDef.h" +#include #include #include #include @@ -365,6 +366,7 @@ class GPUQA int32_t mMCTrackMin = -1, mMCTrackMax = -1; const o2::tpc::ClusterNativeAccess* mClNative = nullptr; + FILE* mTextDump = nullptr; }; inline bool GPUQA::SuppressTrack(int32_t iTrack) const { return (mConfig.matchMCLabels.size() && !mGoodTracks[mNEvents][iTrack]); } From 448f6da3e29f2472e3d0360d9c9d45ad01a0aafb Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 10 Dec 2025 19:16:24 +0100 Subject: [PATCH 5/6] GPU QA: Fix some task number inconsistencies --- GPU/GPUTracking/Global/GPUChainTracking.cxx | 4 +- GPU/GPUTracking/qa/GPUQA.cxx | 223 ++++++++++---------- GPU/GPUTracking/qa/GPUQA.h | 16 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 2 +- 4 files changed, 131 insertions(+), 114 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 14d0e04eb4dd3..0e7d4bc4f436e 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -475,7 +475,7 @@ int32_t GPUChainTracking::ForceInitQA() qa.reset(new GPUQA(this)); } if (!GetQA()->IsInitialized()) { - return GetQA()->InitQA(); + return GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic); } return 0; } @@ -690,7 +690,7 @@ int32_t GPUChainTracking::RunChain() } const bool needQA = GPUQA::QAAvailable() && (GetProcessingSettings().runQA || (GetProcessingSettings().eventDisplay && (mIOPtrs.nMCInfosTPC || GetProcessingSettings().runMC))); if (needQA && GetQA()->IsInitialized() == false) { - if (GetQA()->InitQA(GetProcessingSettings().runQA ? -GetProcessingSettings().runQA : -1)) { + if (GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? 
-GetProcessingSettings().runQA : GPUQA::tasksAutomatic)) { return 1; } } diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 3c176031dec08..852ac5c1feefb 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -544,7 +544,8 @@ int32_t GPUQA::InitQACreateHistograms() createHist(mT0[0], "tracks_t0", "tracks_t0", (maxTime + 1) / 10, 0, maxTime); createHist(mT0[1], "tracks_t0_res", "tracks_t0_res", 1000, -100, 100); createHist(mClXY, "clXY", "clXY", 1000, -250, 250, 1000, -250, 250); // TODO: Pass name only once - + } + if (mQATasks & taskClusterRejection) { const int padCount = GPUTPCGeometry::NPads(GPUCA_ROW_COUNT - 1); for (int32_t i = 0; i < 3; i++) { snprintf(name, 2048, "clrej_%d", i); @@ -577,8 +578,8 @@ int32_t GPUQA::InitQACreateHistograms() int32_t GPUQA::loadHistograms(std::vector& i1, std::vector& i2, std::vector& i3, std::vector& i4, int32_t tasks) { - if (tasks == -1) { - tasks = taskDefaultPostprocess; + if (tasks == tasksAutomatic) { + tasks = tasksDefaultPostprocess; } if (mQAInitialized && (!mHaveExternalHists || tasks != mQATasks)) { throw std::runtime_error("QA not initialized or initialized with different task array"); @@ -593,7 +594,7 @@ int32_t GPUQA::loadHistograms(std::vector& i1, std::vector& i2, std: mHistGraph_pos.clear(); mHaveExternalHists = true; if (mConfig.noMC) { - tasks &= tasksNoQC; + tasks &= tasksAllNoQC; } mQATasks = tasks; if (InitQACreateHistograms()) { @@ -806,8 +807,8 @@ int32_t GPUQA::InitQA(int32_t tasks) if (mQAInitialized) { throw std::runtime_error("QA already initialized"); } - if (tasks == -1) { - tasks = taskDefault; + if (tasks == tasksAutomatic) { + tasks = tasksDefault; } mHist1D = new std::vector; @@ -815,7 +816,7 @@ int32_t GPUQA::InitQA(int32_t tasks) mHist1Dd = new std::vector; mHistGraph = new std::vector; if (mConfig.noMC) { - tasks &= tasksNoQC; + tasks &= tasksAllNoQC; } mQATasks = tasks; @@ -1823,7 +1824,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx uint32_t nCl = clNative ? 
clNative->nClustersTotal : mTracking->GetProcessors()->tpcMerger.NMaxClusters(); mClusterCounts.nTotal += nCl; - if (mQATasks & taskClusterCounts) { + if (mQATasks & (taskClusterCounts | taskClusterRejection)) { for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { @@ -1831,64 +1832,68 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx int32_t attach = mTracking->mIOPtrs.mergedTrackHitAttachment[i]; const auto& r = checkClusterState(attach, &mClusterCounts); - if (mcAvail) { - float totalWeight = 0, weight400 = 0, weight40 = 0; - for (int32_t j = 0; j < GetMCLabelNID(i); j++) { - const auto& label = GetMCLabel(i, j); - if (GetMCLabelID(label) >= 0) { - totalWeight += GetMCLabelWeight(label); - if (GetMCTrackObj(mMCParam, label).pt >= 0.4) { - weight400 += GetMCLabelWeight(label); - } - if (GetMCTrackObj(mMCParam, label).pt <= 0.04) { - weight40 += GetMCLabelWeight(label); + if (mQATasks & taskClusterRejection) { + if (mcAvail) { + float totalWeight = 0, weight400 = 0, weight40 = 0; + for (int32_t j = 0; j < GetMCLabelNID(i); j++) { + const auto& label = GetMCLabel(i, j); + if (GetMCLabelID(label) >= 0) { + totalWeight += GetMCLabelWeight(label); + if (GetMCTrackObj(mMCParam, label).pt >= 0.4) { + weight400 += GetMCLabelWeight(label); + } + if (GetMCTrackObj(mMCParam, label).pt <= 0.04) { + weight40 += GetMCLabelWeight(label); + } } } - } - if (totalWeight > 0 && 10.f * weight400 >= totalWeight) { - if (!r.unattached && !r.protect && !r.physics) { - mClusterCounts.nFakeRemove400++; - int32_t totalFake = weight400 < 0.9f * totalWeight; - if (totalFake) { - mClusterCounts.nFullFakeRemove400++; - } - /*printf("Fake removal (%d): Hit %7d, attached %d lowPt %d looper %d tube200 %d highIncl %d tube %d bad %d recPt %7.2f recLabel %6d", totalFake, i, (int32_t) (mClusterParam[i].attached || mClusterParam[i].fakeAttached), - (int32_t) lowPt, (int32_t) ((attach & gputpcgmmergertypes::attachGoodLeg) == 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) && mev200), - (int32_t) ((attach & gputpcgmmergertypes::attachHighIncl) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachGood) == 0), - fabsf(qpt) > 0 ? 1.f / qpt : 0.f, id); - for (int32_t j = 0;j < GetMCLabelNID(i);j++) - { - //if (GetMCLabelID(i, j) < 0) break; - printf(" - label%d %6d weight %5d", j, GetMCLabelID(i, j), (int32_t) GetMCLabelWeight(i, j)); - if (GetMCLabelID(i, j) >= 0) printf(" - pt %7.2f", mMCParam[GetMCLabelID(i, j)].pt); - else printf(" "); + if (totalWeight > 0 && 10.f * weight400 >= totalWeight) { + if (!r.unattached && !r.protect && !r.physics) { + mClusterCounts.nFakeRemove400++; + int32_t totalFake = weight400 < 0.9f * totalWeight; + if (totalFake) { + mClusterCounts.nFullFakeRemove400++; + } + /*printf("Fake removal (%d): Hit %7d, attached %d lowPt %d looper %d tube200 %d highIncl %d tube %d bad %d recPt %7.2f recLabel %6d", totalFake, i, (int32_t) (mClusterParam[i].attached || mClusterParam[i].fakeAttached), + (int32_t) lowPt, (int32_t) ((attach & gputpcgmmergertypes::attachGoodLeg) == 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) && mev200), + (int32_t) ((attach & gputpcgmmergertypes::attachHighIncl) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachGood) == 0), + fabsf(qpt) > 0 ? 
1.f / qpt : 0.f, id); + for (int32_t j = 0;j < GetMCLabelNID(i);j++) + { + //if (GetMCLabelID(i, j) < 0) break; + printf(" - label%d %6d weight %5d", j, GetMCLabelID(i, j), (int32_t) GetMCLabelWeight(i, j)); + if (GetMCLabelID(i, j) >= 0) printf(" - pt %7.2f", mMCParam[GetMCLabelID(i, j)].pt); + else printf(" "); + } + printf("\n");*/ } - printf("\n");*/ + mClusterCounts.nAbove400++; } - mClusterCounts.nAbove400++; - } - if (totalWeight > 0 && weight40 >= 0.9 * totalWeight) { - mClusterCounts.nBelow40++; - if (r.protect || r.physics) { - mClusterCounts.nFakeProtect40++; + if (totalWeight > 0 && weight40 >= 0.9 * totalWeight) { + mClusterCounts.nBelow40++; + if (r.protect || r.physics) { + mClusterCounts.nFakeProtect40++; + } } } - } - if (r.physics) { - mClusterCounts.nPhysics++; - } - if (r.protect) { - mClusterCounts.nProt++; - } - if (r.unattached) { - mClusterCounts.nUnattached++; + if (r.physics) { + mClusterCounts.nPhysics++; + } + if (r.protect) { + mClusterCounts.nProt++; + } + if (r.unattached) { + mClusterCounts.nUnattached++; + } } - if (mTracking && clNative) { - const auto& cl = clNative->clustersLinear[i]; - mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); - if (!r.unattached && !r.protect) { - mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + if (mQATasks & taskClusterRejection) { + if (mTracking && clNative) { + const auto& cl = clNative->clustersLinear[i]; + mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + if (!r.unattached && !r.protect) { + mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + } } } } @@ -2271,7 +2276,9 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mCClXY->cd(); mPClXY = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); mPClXY->Draw(); + } + if (mQATasks & taskClusterRejection) { for (int32_t i = 0; i < 3; i++) { snprintf(name, 2048, "cnclrej%d", i); mCClRej[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. / 3.); @@ -2283,7 +2290,9 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mCClRejP->cd(); mPClRejP = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); mPClRejP->Draw(); + } + if (mQATasks & taskClusterAttach) { for (int32_t i = 0; i < 4; i++) { snprintf(name, 2048, "cpadrow%d", i); mCPadRow[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. 
/ 3.); @@ -3034,7 +3043,7 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) } } - mPClXY->cd(); + mPClXY->cd(); // TODO: This should become a separate task category mClXY->SetOption("colz"); mClXY->Draw(); mCClXY->cd(); @@ -3042,61 +3051,61 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) if (mConfig.writeFileExt != "") { mCClXY->Print(Form("%s/clustersXY.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); } + } - if (mQATasks & taskClusterCounts) { - mClRej[2]->Divide(mClRej[1], mClRej[0]); + if (mQATasks & taskClusterRejection) { + mClRej[2]->Divide(mClRej[1], mClRej[0]); - for (int32_t i = 0; i < 3; i++) { - if (tout && !mConfig.inputHistogramsOnly) { - mClRej[i]->Write(); - } - mPClRej[i]->cd(); - mClRej[i]->SetTitle(REJECTED_NAMES[i]); - mClRej[i]->SetOption("colz"); - mClRej[i]->Draw(); - mCClRej[i]->cd(); - mCClRej[i]->Print(Form("%s/clustersRej%d%s.pdf", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i])); - if (mConfig.writeFileExt != "") { - mCClRej[i]->Print(Form("%s/clustersRej%d%s.%s", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i], mConfig.writeFileExt.c_str())); - } + for (int32_t i = 0; i < 3; i++) { + if (tout && !mConfig.inputHistogramsOnly) { + mClRej[i]->Write(); + } + mPClRej[i]->cd(); + mClRej[i]->SetTitle(REJECTED_NAMES[i]); + mClRej[i]->SetOption("colz"); + mClRej[i]->Draw(); + mCClRej[i]->cd(); + mCClRej[i]->Print(Form("%s/clustersRej%d%s.pdf", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i])); + if (mConfig.writeFileExt != "") { + mCClRej[i]->Print(Form("%s/clustersRej%d%s.%s", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i], mConfig.writeFileExt.c_str())); } + } - mPClRejP->cd(); - for (int32_t k = 0; k < ConfigNumInputs; k++) { - auto* tmp = mClRej[0]; - if (GetHist(tmp, tin, k, nNewInput) == nullptr) { - continue; - } - TH1D* proj1 = tmp->ProjectionY(Form("clrejptmp1%d", k)); // TODO: Clean up names - proj1->SetDirectory(nullptr); - tmp = mClRej[1]; - if (GetHist(tmp, tin, k, nNewInput) == nullptr) { - continue; - } - TH1D* proj2 = tmp->ProjectionY(Form("clrejptmp2%d", k)); - proj2->SetDirectory(nullptr); + mPClRejP->cd(); + for (int32_t k = 0; k < ConfigNumInputs; k++) { + auto* tmp = mClRej[0]; + if (GetHist(tmp, tin, k, nNewInput) == nullptr) { + continue; + } + TH1D* proj1 = tmp->ProjectionY(Form("clrejptmp1%d", k)); // TODO: Clean up names + proj1->SetDirectory(nullptr); + tmp = mClRej[1]; + if (GetHist(tmp, tin, k, nNewInput) == nullptr) { + continue; + } + TH1D* proj2 = tmp->ProjectionY(Form("clrejptmp2%d", k)); + proj2->SetDirectory(nullptr); - auto* e = mClRejP; - if (GetHist(e, tin, k, nNewInput) == nullptr) { - continue; - } - e->Divide(proj2, proj1); - if (tout && !mConfig.inputHistogramsOnly && k == 0) { - e->Write(); - } - delete proj1; - delete proj2; - e->SetMinimum(-0.02); - e->SetMaximum(0.22); - e->SetTitle("Rejected Clusters"); - e->GetXaxis()->SetTitle("Pad Row"); - e->GetYaxis()->SetTitle("Rejected Clusters (fraction)"); - e->Draw(k == 0 ? 
"" : "same"); + auto* e = mClRejP; + if (GetHist(e, tin, k, nNewInput) == nullptr) { + continue; } - mPClRejP->Print(Form("%s/clustersRejProjected.pdf", mConfig.plotsDir.c_str())); - if (mConfig.writeFileExt != "") { - mPClRejP->Print(Form("%s/clustersRejProjected.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); + e->Divide(proj2, proj1); + if (tout && !mConfig.inputHistogramsOnly && k == 0) { + e->Write(); } + delete proj1; + delete proj2; + e->SetMinimum(-0.02); + e->SetMaximum(0.22); + e->SetTitle("Rejected Clusters"); + e->GetXaxis()->SetTitle("Pad Row"); + e->GetYaxis()->SetTitle("Rejected Clusters (fraction)"); + e->Draw(k == 0 ? "" : "same"); + } + mPClRejP->Print(Form("%s/clustersRejProjected.pdf", mConfig.plotsDir.c_str())); + if (mConfig.writeFileExt != "") { + mPClRejP->Print(Form("%s/clustersRejProjected.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); } } diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index b42fa804c6212..3dd49e2ec1373 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -56,6 +56,10 @@ class GPUQA static bool QAAvailable() { return false; } static bool IsInitialized() { return false; } void UpdateChain(GPUChainTracking* chain) {} + + enum QA_TASKS { + tasksAutomatic = 0 + }; }; } // namespace o2::gpu @@ -146,16 +150,20 @@ class GPUQA static constexpr int32_t MC_LABEL_INVALID = -1e9; - enum QA_TASKS { + enum QA_TASKS { // TODO: make this in32_t typed taskTrackingEff = 1, taskTrackingRes = 2, taskTrackingResPull = 4, + tasksAllMC = 8 - 1, taskClusterAttach = 8, taskTrackStatistics = 16, taskClusterCounts = 32, - taskDefault = 63, - taskDefaultPostprocess = 31, - tasksNoQC = 56 + taskClusterRejection = 64, + tasksAll = 128 - 1, + tasksDefault = tasksAll, + tasksDefaultPostprocess = tasksDefault & ~taskClusterCounts, + tasksAllNoQC = tasksAll & ~tasksAllMC, + tasksAutomatic = -1 }; private: diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index d7ea772c31653..fb1d489a8479d 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -180,7 +180,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic) mConfig->configQA.shipToQC = true; if (!mConfig->configProcessing.runQA) { mConfig->configQA.enableLocalOutput = false; - mQATaskMask = (mSpecConfig.processMC ? 15 : 0) | (mConfig->configQA.clusterRejectionHistograms ? 32 : 0); + mQATaskMask = (mSpecConfig.processMC ? 15 : 0) | (mConfig->configQA.clusterRejectionHistograms ? 32 : 0); // TODO: Clean up using numeric flags! 
mConfig->configProcessing.runQA = -mQATaskMask; } } From 547159d37f7a3f15e3b4545c383b421f9f6144f7 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 29 Oct 2025 14:45:23 +0100 Subject: [PATCH 6/6] GPU TPC: Fix deterministic mode in combination with MC label propagation --- GPU/GPUTracking/Global/GPUChainTracking.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 92 ++++++++++++++----- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 8de49cc954e35..4b07aadfad357 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -306,6 +306,7 @@ class GPUChainTracking : public GPUChain void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); void WriteReducedClusters(); + void SortClusters(bool buildNativeGPU, bool propagateMCLabels, o2::tpc::ClusterNativeAccess* clusterAccess, o2::tpc::ClusterNative* clusters); template int32_t RunTRDTrackingInternal(); uint32_t StreamForSector(uint32_t sector) const; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fdce8ef5a127d..c4566ffb968a7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -57,6 +57,8 @@ #include "utils/VcShim.h" #include "utils/strtag.h" #include +#include +#include using namespace o2::gpu; using namespace o2::tpc; @@ -762,14 +764,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) ClusterNative* tmpNativeClusters = nullptr; std::unique_ptr tmpNativeClusterBuffer; - // setup MC Labels - bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; + const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); + const bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output + const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; + const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4); auto* digitsMC = propagateMCLabels ?
processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; - bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); - bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output - mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; if (buildNativeGPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); @@ -1281,21 +1282,20 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr; - if (propagateMCLabels) { - // TODO: write to buffer directly + if (propagateMCLabels) { // TODO: write to buffer directly o2::dataformats::MCTruthContainer mcLabels; std::pair buffer; - if (!GetProcessingSettings().tpcWriteClustersAfterRejection && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) { - if (!mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator) { + auto& labelOutputControl = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]; + if (!GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) { + if (!labelOutputControl->allocator) { throw std::runtime_error("Cluster MC Label buffer missing"); } - ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast(mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator(0)); + ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast(labelOutputControl->allocator(0)); buffer = {&container->first, &container->second}; } else { mIOMem.clusterNativeMCView = std::make_unique(); mIOMem.clusterNativeMCBuffer = std::make_unique(); - buffer.first = mIOMem.clusterNativeMCBuffer.get(); - buffer.second = mIOMem.clusterNativeMCView.get(); + buffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()}; } assert(propagateMCLabels ? 
mcLinearLabels.header.size() == nClsTotal : true); @@ -1350,15 +1350,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU && synchronizeCalibUpdate) { SynchronizeStream(0); } - if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) { - for (uint32_t i = 0; i < NSECTORS; i++) { - for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { - std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]); - } - } - if (buildNativeGPU) { - GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true); - } + if (sortClusters) { + SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters); } mRec->MemoryScalers()->nTPCHits = nClsTotal; mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST")); @@ -1374,3 +1367,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif return 0; } + +void GPUChainTracking::SortClusters(bool buildNativeGPU, bool propagateMCLabels, ClusterNativeAccess* clusterAccess, ClusterNative* clusters) +{ + if (propagateMCLabels) { + std::vector clsOrder(clusterAccess->nClustersTotal); + std::iota(clsOrder.begin(), clsOrder.end(), 0); + std::vector tmpClusters; + for (uint32_t i = 0; i < NSECTORS; i++) { + for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { + const uint32_t offset = clusterAccess->clusterOffset[i][j]; + std::sort(&clsOrder[offset], &clsOrder[offset + clusterAccess->nClusters[i][j]], [&clusters](const uint32_t a, const uint32_t b) { + return clusters[a] < clusters[b]; + }); + tmpClusters.resize(clusterAccess->nClusters[i][j]); + memcpy(tmpClusters.data(), &clusters[offset], clusterAccess->nClusters[i][j] * sizeof(tmpClusters[0])); + for (uint32_t k = 0; k < tmpClusters.size(); k++) { + clusters[offset + k] = tmpClusters[clsOrder[offset + k] - offset]; + } + } + } + tmpClusters.clear(); + + std::pair labelBuffer; + GPUOutputControl* labelOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]; + std::unique_ptr tmpUniqueContainerView; + std::unique_ptr tmpUniqueContainerBuffer; + if (labelOutput && labelOutput->allocator) { + ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* labelContainer = reinterpret_cast(labelOutput->allocator(0)); + labelBuffer = {&labelContainer->first, &labelContainer->second}; + } else { + tmpUniqueContainerView = std::move(mIOMem.clusterNativeMCView); + tmpUniqueContainerBuffer = std::move(mIOMem.clusterNativeMCBuffer); + mIOMem.clusterNativeMCView = std::make_unique(); + mIOMem.clusterNativeMCBuffer = std::make_unique(); + labelBuffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()}; + } + + o2::dataformats::MCLabelContainer tmpContainer; + for (uint32_t i = 0; i < clusterAccess->nClustersTotal; i++) { + for (const auto& element : clusterAccess->clustersMCTruth->getLabels(clsOrder[i])) { + tmpContainer.addElement(i, element); + } + } + tmpContainer.flatten_to(*labelBuffer.first); + *labelBuffer.second = *labelBuffer.first; + clusterAccess->clustersMCTruth = labelBuffer.second; + } else { + for (uint32_t i = 0; i < NSECTORS; i++) { + for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { + std::sort(&clusters[clusterAccess->clusterOffset[i][j]], &clusters[clusterAccess->clusterOffset[i][j] + 
clusterAccess->nClusters[i][j]]); + } + } + } + if (buildNativeGPU) { + GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)clusters, clusterAccess->nClustersTotal * sizeof(clusters[0]), -1, true); + } +}
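The MC-label branch of the new SortClusters() above sorts an index permutation per pad row, gathers the clusters through it, and rebuilds the label container in the same order, so entry k of the cluster array and of the label container stay aligned. A minimal self-contained sketch of this gather-based reordering, under simplified assumptions (plain structs and nested vectors stand in for ClusterNative and MCTruthContainer; the comparator is a stand-in for ClusterNative's ordering):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

struct Cluster {
  float pad, time;
  bool operator<(const Cluster& o) const { return time != o.time ? time < o.time : pad < o.pad; }
};

// Sort clusters[offset .. offset + n) and apply the same permutation to the
// per-cluster label lists, so clusters[k] and labels[k] stay aligned.
void sortRangeWithLabels(std::vector<Cluster>& clusters, std::vector<std::vector<int32_t>>& labels, uint32_t offset, uint32_t n)
{
  std::vector<uint32_t> order(n);
  std::iota(order.begin(), order.end(), 0); // identity permutation, like clsOrder above
  std::sort(order.begin(), order.end(), [&](uint32_t a, uint32_t b) { return clusters[offset + a] < clusters[offset + b]; });
  std::vector<Cluster> tmp(clusters.begin() + offset, clusters.begin() + offset + n);
  std::vector<std::vector<int32_t>> tmpLabels(labels.begin() + offset, labels.begin() + offset + n);
  for (uint32_t k = 0; k < n; k++) { // gather: position k receives old element order[k]
    clusters[offset + k] = tmp[order[k]];
    labels[offset + k] = std::move(tmpLabels[order[k]]);
  }
}

Sorting the index array instead of the payload keeps the cluster-to-label association explicit and avoids having to co-sort two containers on a shared key.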