From 4555ac431347a05d7f92906a005288a3a6d76595 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 3 Dec 2025 21:06:04 +0100 Subject: [PATCH 1/6] GPU QA: Add pad row vs occupancy histogram --- GPU/GPUTracking/qa/GPUQA.cxx | 68 +++++++++++++++++++++++------------- GPU/GPUTracking/qa/GPUQA.h | 6 ++-- 2 files changed, 47 insertions(+), 27 deletions(-) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 689dc20cb1606..28b603f77e2ff 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -152,6 +152,7 @@ static constexpr float PT_MIN_CLUST = 0.01; static constexpr float PT_MAX = 20; static constexpr float ETA_MAX = 1.5; static constexpr float ETA_MAX2 = 0.9; +static constexpr int32_t PADROW_CHECK_MINCLS = 50; static constexpr bool CLUST_HIST_INT_SUM = false; @@ -525,9 +526,10 @@ int32_t GPUQA::InitQACreateHistograms() createHist(mClusters[i], name, name, AXIS_BINS[4], binsPt.get()); } - createHist(mPadRow[0], "padrow0", "padrow0", GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); - createHist(mPadRow[1], "padrow1", "padrow1", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); - createHist(mPadRow[2], "padrow2", "padrow2", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT, 0, GPUCA_ROW_COUNT - 1); + createHist(mPadRow[0], "padrow0", "padrow0", GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[1], "padrow1", "padrow1", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[2], "padrow2", "padrow2", 100.f, -0.2f, 0.2f, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); + createHist(mPadRow[3], "padrow3", "padrow3", 100.f, 0, 300000, GPUCA_ROW_COUNT - PADROW_CHECK_MINCLS, 0, GPUCA_ROW_COUNT - 1 - PADROW_CHECK_MINCLS); } if (mQATasks & taskTrackStatistics) { @@ -968,7 +970,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx nClusters++; uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; if (hitId >= GetNMCLabels()) { - GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); + GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), clNative ?
clNative->nClustersTotal : 0); throw std::runtime_error("qa error"); } acc.addLabel(hitId); @@ -1069,7 +1071,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } } - if ((mQATasks & taskClusterAttach)) { + if ((mQATasks & taskClusterAttach) && !tracksExternal) { std::vector lowestPadRow(mTracking->mIOPtrs.nMergedTracks); // fill cluster adjacent status if (mTracking->mIOPtrs.mergedTrackHitAttachment) { @@ -1096,12 +1098,12 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } } - if (mTracking->mIOPtrs.nMergedTracks && mTracking->mIOPtrs.clustersNative) { + if (mTracking->mIOPtrs.nMergedTracks && clNative) { std::fill(lowestPadRow.begin(), lowestPadRow.end(), 255); for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { - for (uint32_t iCl = 0; iCl < mTracking->mIOPtrs.clustersNative->nClusters[iSector][iRow]; iCl++) { - int32_t i = mTracking->mIOPtrs.clustersNative->clusterOffset[iSector][iRow] + iCl; + for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { + int32_t i = clNative->clusterOffset[iSector][iRow] + iCl; for (int32_t j = 0; j < GetMCLabelNID(i); j++) { uint32_t trackId = GetMCTrackObj(mTrackMCLabelsReverse, GetMCLabel(i, j)); if (trackId < lowestPadRow.size() && lowestPadRow[trackId] > iRow) { @@ -1113,12 +1115,21 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } for (uint32_t i = 0; i < mTracking->mIOPtrs.nMergedTracks; i++) { const auto& trk = mTracking->mIOPtrs.mergedTracks[i]; - if (trk.OK() && lowestPadRow[i] != 255 && trk.NClustersFitted() > 70 && CAMath::Abs(trk.GetParam().GetQPt()) < 0.5) { - int32_t lowestRow = CAMath::Min(mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()].row, mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1].row); + if (trk.OK() && lowestPadRow[i] != 255 && trk.NClustersFitted() >= PADROW_CHECK_MINCLS && CAMath::Abs(trk.GetParam().GetQPt()) < 1.0) { + const auto& lowestCl = mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()].row < mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1].row ? 
mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef()] : mTracking->mIOPtrs.mergedTrackHits[trk.FirstClusterRef() + trk.NClusters() - 1]; + const int32_t lowestRow = lowestCl.row; mPadRow[0]->Fill(lowestPadRow[i], lowestRow, 1.f); mPadRow[1]->Fill(CAMath::ATan2(trk.GetParam().GetY(), trk.GetParam().GetX()), lowestRow, 1.f); - if (lowestPadRow[i] == 0 && lowestRow != 0) { - mPadRow[2]->Fill(CAMath::ATan2(trk.GetParam().GetY(), trk.GetParam().GetX()), lowestRow, 1.f); + if (lowestPadRow[i] < 10 && lowestRow > lowestPadRow[i] + 3) { + const auto& cl = clNative->clustersLinear[lowestCl.num]; + float x, y, z; + mTracking->GetTPCTransformHelper()->Transform(lowestCl.sector, lowestCl.row, cl.getPad(), cl.getTime(), x, y, z, trk.GetParam().GetTOffset()); + float phi = CAMath::ATan2(y, x); + mPadRow[2]->Fill(phi, lowestRow, 1.f); + if (CAMath::Abs(phi) < 0.15) { + const float time = cl.getTime(); + mPadRow[3]->Fill(mTracking->GetParam().GetUnscaledMult(time), lowestRow, 1.f); + } } } } @@ -1485,7 +1496,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } } - if (mQATasks & taskClusterAttach) { + if ((mQATasks & taskClusterAttach) && !tracksExternal) { // Fill cluster histograms for (uint32_t iTrk = 0; iTrk < nReconstructedTracks; iTrk++) { const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[iTrk]; @@ -1715,7 +1726,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx GPUWarning("No MC information available, only running partial TPC QA!"); } // mcAvail - if (mQATasks & taskTrackStatistics) { + if ((mQATasks & taskTrackStatistics) && !tracksExternal) { // Fill track statistic histograms std::vector> clusterAttachCounts; if (mcAvail) { @@ -1815,8 +1826,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx if (mQATasks & taskClusterCounts) { for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { - for (uint32_t iCl = 0; iCl < mTracking->mIOPtrs.clustersNative->nClusters[iSector][iRow]; iCl++) { - uint32_t i = mTracking->mIOPtrs.clustersNative->clusterOffset[iSector][iRow] + iCl; + for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { + uint32_t i = clNative->clusterOffset[iSector][iRow] + iCl; int32_t attach = mTracking->mIOPtrs.mergedTrackHitAttachment[i]; const auto& r = checkClusterState(attach, &mClusterCounts); @@ -1873,8 +1884,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx if (r.unattached) { mClusterCounts.nUnattached++; } - if (mTracking && mTracking->mIOPtrs.clustersNative) { - const auto& cl = mTracking->mIOPtrs.clustersNative->clustersLinear[i]; + if (mTracking && clNative) { + const auto& cl = clNative->clustersLinear[i]; mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); if (!r.unattached && !r.protect) { mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); @@ -1895,7 +1906,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx GPUInfo("QA Time: Cluster Counts:\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); } - if (mConfig.dumpToROOT) { + if (mConfig.dumpToROOT && !tracksExternal) { if (!clNative || !mTracking || !mTracking->mIOPtrs.mergedTrackHitAttachment || !mTracking->mIOPtrs.mergedTracks) { throw std::runtime_error("Cannot dump non o2::tpc::clusterNative clusters, need also hit attachmend and GPU tracks"); } @@ -2273,7 +2284,7 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mPClRejP = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); 
mPClRejP->Draw(); - for (int32_t i = 0; i < 3; i++) { + for (int32_t i = 0; i < 4; i++) { snprintf(name, 2048, "cpadrow%d", i); mCPadRow[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. / 3.); mCPadRow[i]->cd(); @@ -2842,19 +2853,28 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) } } - for (int32_t i = 0; i < 3; i++) { + for (int32_t i = 0; i < 4; i++) { auto* e = mPadRow[i]; if (tout && !mConfig.inputHistogramsOnly) { e->Write(); } mPPadRow[i]->cd(); e->SetOption("colz"); - e->SetTitle(i == 2 ? "First Track Pad Row (row_{MC} = 0, row_{trk} > 0)" : "First Track Pad Row"); - e->GetXaxis()->SetTitle(i ? "#Phi (sector)" : "First MC Pad Row"); + std::string title = "First Track Pad Row (p_{T} > 1GeV, N_{Cl} #geq " + std::to_string(PADROW_CHECK_MINCLS); + if (i >= 2) { + title += ", row_{trk} > row_{MC} + 3, row_{MC} < 10"; + } + if (i >= 3) { + title += ", #Phi_{Cl} < 0.15"; + } + title += ")"; + + e->SetTitle(title.c_str()); + e->GetXaxis()->SetTitle(i == 3 ? "Local Occupancy" : (i ? "#Phi_{Cl} (sector)" : "First MC Pad Row")); e->GetYaxis()->SetTitle("First Pad Row"); e->Draw(); mCPadRow[i]->cd(); - static const constexpr char* PADROW_NAMES[3] = {"MC", "Phi", "Phi1"}; + static const constexpr char* PADROW_NAMES[4] = {"MC", "Phi", "Phi1", "Occ"}; mCPadRow[i]->Print(Form("%s/padRow%s.pdf", mConfig.plotsDir.c_str(), PADROW_NAMES[i])); if (mConfig.writeFileExt != "") { mCPadRow[i]->Print(Form("%s/padRow%s.%s", mConfig.plotsDir.c_str(), PADROW_NAMES[i], mConfig.writeFileExt.c_str())); diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index 54d1ceed9d365..7303ed62a9562 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -323,9 +323,9 @@ class GPUQA TPad* mPClRej[3]; TPad* mPClRejP; - TH2F* mPadRow[3]; - TCanvas* mCPadRow[3]; - TPad* mPPadRow[3]; + TH2F* mPadRow[4]; + TCanvas* mCPadRow[4]; + TPad* mPPadRow[4]; std::vector mHistClusterCount; From 6804ed289026bb0534ddf7d48068d6ea7f685fc4 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 4 Dec 2025 12:55:58 +0100 Subject: [PATCH 2/6] GPU QA Standalone: By default write histograms to output root file in plots folder --- GPU/GPUTracking/Standalone/Benchmark/standalone.cxx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index ca26f26d32612..857803d913372 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -197,6 +197,9 @@ int32_t ReadConfiguration(int argc, char** argv) printf("Can only produce QA pdf output when input files are specified!\n"); return 1; } + if (configStandalone.QA.enableLocalOutput && !configStandalone.QA.inputHistogramsOnly && configStandalone.QA.output == "" && configStandalone.QA.plotsDir != "") { + configStandalone.QA.output = configStandalone.QA.plotsDir + "/output.root"; + } if (configStandalone.QA.inputHistogramsOnly) { configStandalone.rundEdx = false; configStandalone.noEvents = true; From 329f043432b1eb83a3fcbaeca03acdc1599e0354 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 3 Dec 2025 21:13:34 +0100 Subject: [PATCH 3/6] GPU: Remove non-working MI100 serialization workaround and obsolete StuckProtection --- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 -- .../Base/opencl/GPUReconstructionOCL.cxx | 21 ------------------- .../Base/opencl/GPUReconstructionOCL.h | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 -- GPU/GPUTracking/Global/GPUChain.h | 2 -- 
.../Global/GPUChainTrackingSectorTracker.cxx | 3 --- prodtests/full-system-test/dpl-workflow.sh | 2 -- 7 files changed, 33 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index a78a482db4e7a..d621d45fcd92b 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -88,8 +88,6 @@ class GPUReconstructionCPU : public GPUReconstructionProcessing::KernelInterface int32_t ExitDevice() override; int32_t GetThread(); - virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; } - // Pointers to tracker classes GPUProcessorProcessors mProcShadow; // Host copy of tracker objects that will be used on the GPU GPUConstantMem*& mProcessorsShadow = mProcShadow.mProcessorsProc; diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index 271fe494860cd..6954cfb3d6211 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -470,27 +470,6 @@ void GPUReconstructionOCL::ReleaseEvent(deviceEvent ev) { GPUChkErr(clReleaseEve void GPUReconstructionOCL::RecordMarker(deviceEvent* ev, int32_t stream) { GPUChkErr(clEnqueueMarkerWithWaitList(mInternals->command_queue[stream], 0, nullptr, ev->getEventList())); } -int32_t GPUReconstructionOCL::DoStuckProtection(int32_t stream, deviceEvent event) -{ - if (GetProcessingSettings().stuckProtection) { - cl_int tmp = 0; - for (int32_t i = 0; i <= GetProcessingSettings().stuckProtection / 50; i++) { - usleep(50); - clGetEventInfo(event.get(), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(tmp), &tmp, nullptr); - if (tmp == CL_COMPLETE) { - break; - } - } - if (tmp != CL_COMPLETE) { - mGPUStuck = 1; - GPUErrorReturn("GPU Stuck, future processing in this component is disabled, skipping event (GPU Event State %d)", (int32_t)tmp); - } - } else { - clFinish(mInternals->command_queue[stream]); - } - return 0; -} - void GPUReconstructionOCL::SynchronizeGPU() { for (int32_t i = 0; i < mNStreams; i++) { diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h index 958d5186bf41a..a52db1f2a737a 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.h @@ -43,7 +43,6 @@ class GPUReconstructionOCL : public GPUReconstructionProcessing::KernelInterface virtual int32_t GPUChkErrInternal(const int64_t error, const char* file, int32_t line) const override; void SynchronizeGPU() override; - int32_t DoStuckProtection(int32_t stream, deviceEvent event) override; int32_t GPUDebug(const char* state = "UNKNOWN", int32_t stream = -1, bool force = false) override; void SynchronizeStream(int32_t stream) override; void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) override; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5a075bf7f9a02..d70fac115eab7 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -301,7 +301,6 @@ BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Proces AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)") AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. 
no CPU for OpenCL)") AddOption(globalInitMutex, bool, false, "", 0, "Use global mutex to synchronize initialization of multiple GPU instances") -AddOption(stuckProtection, int32_t, 0, "", 0, "Timeout in us, When AMD GPU is stuck, just continue processing and skip tracking, do not crash or stall the chain") AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidates for single input track during propagation") AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking") AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))") @@ -383,7 +382,6 @@ AddOption(debugOnFailureMaxN, uint32_t, 1, "", 0, "Max number of times to run th AddOption(debugOnFailureMaxFiles, uint32_t, 0, "", 0, "Max number of files to have in the target folder") AddOption(debugOnFailureMaxSize, uint32_t, 0, "", 0, "Max size of existing dumps in the target folder in GB") AddOption(debugOnFailureDirectory, std::string, ".", "", 0, "Target folder for debug / dump") -AddOption(amdMI100SerializationWorkaround, bool, false, "", 0, "Enable workaround that mitigates MI100 serialization bug") AddOption(memoryStat, bool, false, "", 0, "Print memory statistics") AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 9ce3da1092e83..6831fbd15080a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -224,8 +224,6 @@ class GPUChain inline GPUChain* GetNextChainInQueue() { return mRec->GetNextChainInQueue(); } - virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; } - template bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx index 122eb709b4356..e2d68f10819fb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx @@ -149,9 +149,6 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal() GPUTPCTracker& trk = processors()->tpcTrackers[iSector]; GPUTPCTracker& trkShadow = doGPU ? 
processorsShadow()->tpcTrackers[iSector] : trk; int32_t useStream = StreamForSector(iSector); - if (GetProcessingSettings().amdMI100SerializationWorkaround) { - SynchronizeStream(useStream); // TODO: Remove this workaround once fixed on MI100 - } if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Creating Sector Data (Sector %d)", iSector); diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index ce5607d361cbe..754349c87eecc 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -284,8 +284,6 @@ if [[ $GPUTYPE == "HIP" ]]; then if [[ ${EPN_NODE_MI100:-} == "1" && ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]]; then if [[ -n ${OPTIMIZED_PARALLEL_ASYNC:-} ]] || [[ $EPNSYNCMODE == 1 && ${FULL_MI100_SERIALIZATION:-0} == 1 ]]; then GPU_CONFIG_KEY+="GPU_proc.serializeGPU=3;" - elif [[ $EPNSYNCMODE == 1 ]]; then - GPU_CONFIG_KEY+="GPU_proc.amdMI100SerializationWorkaround=1;" fi fi #export HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2 From 64845251e1c06bb76c308f935812572bf24a049d Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 8 Dec 2025 09:32:03 +0100 Subject: [PATCH 4/6] GPU QA: Also dump text output to the output folder --- GPU/GPUTracking/qa/GPUQA.cxx | 11 ++++++++++- GPU/GPUTracking/qa/GPUQA.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 28b603f77e2ff..3c176031dec08 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -3141,7 +3141,9 @@ void GPUQA::PrintClusterCount(int32_t mode, int32_t& num, const char* name, uint createHist(mHistClusterCount[num], name2, name, 1000, 0, mConfig.histMaxNClusters, 1000, 0, 100); } else if (mode == 0) { if (normalization && mConfig.enableLocalOutput) { - printf("\t%40s: %'12" PRIu64 " (%6.2f%%)\n", name, n, 100.f * n / normalization); + for (uint32_t i = 0; i < 1 + (mTextDump != nullptr); i++) { + fprintf(i ? mTextDump : stdout, "\t%40s: %'12" PRIu64 " (%6.2f%%)\n", name, n, 100.f * n / normalization); + } } if (mConfig.clusterRejectionHistograms) { float ratio = 100.f * n / std::max(normalization, 1); @@ -3153,6 +3155,9 @@ int32_t GPUQA::DoClusterCounts(uint64_t* attachClusterCounts, int32_t mode) { + if (mConfig.enableLocalOutput && !mConfig.inputHistogramsOnly && mConfig.plotsDir != "") { + mTextDump = fopen((mConfig.plotsDir + "/clusterCounts.txt").c_str(), "w+"); + } int32_t num = 0; if (mcPresent() && (mQATasks & taskClusterAttach) && attachClusterCounts) { for (int32_t i = 0; i < N_CLS_HIST; i++) { // TODO: Check that these counts are still printed correctly!
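// Aside on the change above (illustrative, not part of the patch): the new loop
// in PrintClusterCount() tees each counter line to stdout and, when the dump file
// is open, to clusterCounts.txt as well; the bound 1 + (mTextDump != nullptr)
// yields one or two iterations. A minimal standalone sketch of the same pattern,
// with a hypothetical helper name and a plain %llu in place of the %' PRIu64 format:
#include <cstdio>

static void teePrint(FILE* dump, const char* name, unsigned long long n, double percent)
{
  // Runs once when dump == nullptr, twice otherwise; iteration 0 writes to stdout.
  for (int i = 0; i < 1 + (dump != nullptr); i++) {
    fprintf(i ? dump : stdout, "\t%40s: %12llu (%6.2f%%)\n", name, n, percent);
  }
}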
@@ -3191,6 +3196,10 @@ int32_t GPUQA::DoClusterCounts(uint64_t* attachClusterCounts, int32_t mode) PrintClusterCount(mode, num, "Correctly Attached all-trk normalized", mClusterCounts.nCorrectlyAttachedNormalized, mClusterCounts.nTotal); PrintClusterCount(mode, num, "Correctly Attached non-fake normalized", mClusterCounts.nCorrectlyAttachedNormalizedNonFake, mClusterCounts.nTotal); } + if (mTextDump) { + fclose(mTextDump); + mTextDump = nullptr; + } return num; } diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index 7303ed62a9562..b42fa804c6212 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -62,6 +62,7 @@ class GPUQA #else #include "GPUTPCDef.h" +#include #include #include #include @@ -365,6 +366,7 @@ class GPUQA int32_t mMCTrackMin = -1, mMCTrackMax = -1; const o2::tpc::ClusterNativeAccess* mClNative = nullptr; + FILE* mTextDump = nullptr; }; inline bool GPUQA::SuppressTrack(int32_t iTrack) const { return (mConfig.matchMCLabels.size() && !mGoodTracks[mNEvents][iTrack]); } From 448f6da3e29f2472e3d0360d9c9d45ad01a0aafb Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 10 Dec 2025 19:16:24 +0100 Subject: [PATCH 5/6] GPU QA: Fix some task number inconsistencies --- GPU/GPUTracking/Global/GPUChainTracking.cxx | 4 +- GPU/GPUTracking/qa/GPUQA.cxx | 223 ++++++++++---------- GPU/GPUTracking/qa/GPUQA.h | 16 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 2 +- 4 files changed, 131 insertions(+), 114 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 14d0e04eb4dd3..0e7d4bc4f436e 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -475,7 +475,7 @@ int32_t GPUChainTracking::ForceInitQA() qa.reset(new GPUQA(this)); } if (!GetQA()->IsInitialized()) { - return GetQA()->InitQA(); + return GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? -GetProcessingSettings().runQA : GPUQA::tasksAutomatic); } return 0; } @@ -690,7 +690,7 @@ int32_t GPUChainTracking::RunChain() } const bool needQA = GPUQA::QAAvailable() && (GetProcessingSettings().runQA || (GetProcessingSettings().eventDisplay && (mIOPtrs.nMCInfosTPC || GetProcessingSettings().runMC))); if (needQA && GetQA()->IsInitialized() == false) { - if (GetQA()->InitQA(GetProcessingSettings().runQA ? -GetProcessingSettings().runQA : -1)) { + if (GetQA()->InitQA(GetProcessingSettings().runQA <= 0 ? 
-GetProcessingSettings().runQA : GPUQA::tasksAutomatic)) { return 1; } } diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 3c176031dec08..852ac5c1feefb 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -544,7 +544,8 @@ int32_t GPUQA::InitQACreateHistograms() createHist(mT0[0], "tracks_t0", "tracks_t0", (maxTime + 1) / 10, 0, maxTime); createHist(mT0[1], "tracks_t0_res", "tracks_t0_res", 1000, -100, 100); createHist(mClXY, "clXY", "clXY", 1000, -250, 250, 1000, -250, 250); // TODO: Pass name only once - + } + if (mQATasks & taskClusterRejection) { const int padCount = GPUTPCGeometry::NPads(GPUCA_ROW_COUNT - 1); for (int32_t i = 0; i < 3; i++) { snprintf(name, 2048, "clrej_%d", i); @@ -577,8 +578,8 @@ int32_t GPUQA::InitQACreateHistograms() int32_t GPUQA::loadHistograms(std::vector& i1, std::vector& i2, std::vector& i3, std::vector& i4, int32_t tasks) { - if (tasks == -1) { - tasks = taskDefaultPostprocess; + if (tasks == tasksAutomatic) { + tasks = tasksDefaultPostprocess; } if (mQAInitialized && (!mHaveExternalHists || tasks != mQATasks)) { throw std::runtime_error("QA not initialized or initialized with different task array"); @@ -593,7 +594,7 @@ int32_t GPUQA::loadHistograms(std::vector& i1, std::vector& i2, std: mHistGraph_pos.clear(); mHaveExternalHists = true; if (mConfig.noMC) { - tasks &= tasksNoQC; + tasks &= tasksAllNoQC; } mQATasks = tasks; if (InitQACreateHistograms()) { @@ -806,8 +807,8 @@ int32_t GPUQA::InitQA(int32_t tasks) if (mQAInitialized) { throw std::runtime_error("QA already initialized"); } - if (tasks == -1) { - tasks = taskDefault; + if (tasks == tasksAutomatic) { + tasks = tasksDefault; } mHist1D = new std::vector; @@ -815,7 +816,7 @@ int32_t GPUQA::InitQA(int32_t tasks) mHist1Dd = new std::vector; mHistGraph = new std::vector; if (mConfig.noMC) { - tasks &= tasksNoQC; + tasks &= tasksAllNoQC; } mQATasks = tasks; @@ -1823,7 +1824,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx uint32_t nCl = clNative ? 
clNative->nClustersTotal : mTracking->GetProcessors()->tpcMerger.NMaxClusters(); mClusterCounts.nTotal += nCl; - if (mQATasks & taskClusterCounts) { + if (mQATasks & (taskClusterCounts | taskClusterRejection)) { for (uint32_t iSector = 0; iSector < GPUCA_NSECTORS; iSector++) { for (uint32_t iRow = 0; iRow < GPUCA_ROW_COUNT; iRow++) { for (uint32_t iCl = 0; iCl < clNative->nClusters[iSector][iRow]; iCl++) { @@ -1831,64 +1832,68 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx int32_t attach = mTracking->mIOPtrs.mergedTrackHitAttachment[i]; const auto& r = checkClusterState(attach, &mClusterCounts); - if (mcAvail) { - float totalWeight = 0, weight400 = 0, weight40 = 0; - for (int32_t j = 0; j < GetMCLabelNID(i); j++) { - const auto& label = GetMCLabel(i, j); - if (GetMCLabelID(label) >= 0) { - totalWeight += GetMCLabelWeight(label); - if (GetMCTrackObj(mMCParam, label).pt >= 0.4) { - weight400 += GetMCLabelWeight(label); - } - if (GetMCTrackObj(mMCParam, label).pt <= 0.04) { - weight40 += GetMCLabelWeight(label); + if (mQATasks & taskClusterRejection) { + if (mcAvail) { + float totalWeight = 0, weight400 = 0, weight40 = 0; + for (int32_t j = 0; j < GetMCLabelNID(i); j++) { + const auto& label = GetMCLabel(i, j); + if (GetMCLabelID(label) >= 0) { + totalWeight += GetMCLabelWeight(label); + if (GetMCTrackObj(mMCParam, label).pt >= 0.4) { + weight400 += GetMCLabelWeight(label); + } + if (GetMCTrackObj(mMCParam, label).pt <= 0.04) { + weight40 += GetMCLabelWeight(label); + } } } - } - if (totalWeight > 0 && 10.f * weight400 >= totalWeight) { - if (!r.unattached && !r.protect && !r.physics) { - mClusterCounts.nFakeRemove400++; - int32_t totalFake = weight400 < 0.9f * totalWeight; - if (totalFake) { - mClusterCounts.nFullFakeRemove400++; - } - /*printf("Fake removal (%d): Hit %7d, attached %d lowPt %d looper %d tube200 %d highIncl %d tube %d bad %d recPt %7.2f recLabel %6d", totalFake, i, (int32_t) (mClusterParam[i].attached || mClusterParam[i].fakeAttached), - (int32_t) lowPt, (int32_t) ((attach & gputpcgmmergertypes::attachGoodLeg) == 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) && mev200), - (int32_t) ((attach & gputpcgmmergertypes::attachHighIncl) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachGood) == 0), - fabsf(qpt) > 0 ? 1.f / qpt : 0.f, id); - for (int32_t j = 0;j < GetMCLabelNID(i);j++) - { - //if (GetMCLabelID(i, j) < 0) break; - printf(" - label%d %6d weight %5d", j, GetMCLabelID(i, j), (int32_t) GetMCLabelWeight(i, j)); - if (GetMCLabelID(i, j) >= 0) printf(" - pt %7.2f", mMCParam[GetMCLabelID(i, j)].pt); - else printf(" "); + if (totalWeight > 0 && 10.f * weight400 >= totalWeight) { + if (!r.unattached && !r.protect && !r.physics) { + mClusterCounts.nFakeRemove400++; + int32_t totalFake = weight400 < 0.9f * totalWeight; + if (totalFake) { + mClusterCounts.nFullFakeRemove400++; + } + /*printf("Fake removal (%d): Hit %7d, attached %d lowPt %d looper %d tube200 %d highIncl %d tube %d bad %d recPt %7.2f recLabel %6d", totalFake, i, (int32_t) (mClusterParam[i].attached || mClusterParam[i].fakeAttached), + (int32_t) lowPt, (int32_t) ((attach & gputpcgmmergertypes::attachGoodLeg) == 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) && mev200), + (int32_t) ((attach & gputpcgmmergertypes::attachHighIncl) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachTube) != 0), (int32_t) ((attach & gputpcgmmergertypes::attachGood) == 0), + fabsf(qpt) > 0 ? 
1.f / qpt : 0.f, id); + for (int32_t j = 0;j < GetMCLabelNID(i);j++) + { + //if (GetMCLabelID(i, j) < 0) break; + printf(" - label%d %6d weight %5d", j, GetMCLabelID(i, j), (int32_t) GetMCLabelWeight(i, j)); + if (GetMCLabelID(i, j) >= 0) printf(" - pt %7.2f", mMCParam[GetMCLabelID(i, j)].pt); + else printf(" "); + } + printf("\n");*/ } - printf("\n");*/ + mClusterCounts.nAbove400++; } - mClusterCounts.nAbove400++; - } - if (totalWeight > 0 && weight40 >= 0.9 * totalWeight) { - mClusterCounts.nBelow40++; - if (r.protect || r.physics) { - mClusterCounts.nFakeProtect40++; + if (totalWeight > 0 && weight40 >= 0.9 * totalWeight) { + mClusterCounts.nBelow40++; + if (r.protect || r.physics) { + mClusterCounts.nFakeProtect40++; + } } } - } - if (r.physics) { - mClusterCounts.nPhysics++; - } - if (r.protect) { - mClusterCounts.nProt++; - } - if (r.unattached) { - mClusterCounts.nUnattached++; + if (r.physics) { + mClusterCounts.nPhysics++; + } + if (r.protect) { + mClusterCounts.nProt++; + } + if (r.unattached) { + mClusterCounts.nUnattached++; + } } - if (mTracking && clNative) { - const auto& cl = clNative->clustersLinear[i]; - mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); - if (!r.unattached && !r.protect) { - mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + if (mQATasks & taskClusterRejection) { + if (mTracking && clNative) { + const auto& cl = clNative->clustersLinear[i]; + mClRej[0]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + if (!r.unattached && !r.protect) { + mClRej[1]->Fill(cl.getPad() - GPUTPCGeometry::NPads(iRow) / 2 + 0.5, iRow, 1.f); + } } } } @@ -2271,7 +2276,9 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mCClXY->cd(); mPClXY = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); mPClXY->Draw(); + } + if (mQATasks & taskClusterRejection) { for (int32_t i = 0; i < 3; i++) { snprintf(name, 2048, "cnclrej%d", i); mCClRej[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. / 3.); @@ -2283,7 +2290,9 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) mCClRejP->cd(); mPClRejP = createGarbageCollected("p0", "", 0.0, 0.0, 1.0, 1.0); mPClRejP->Draw(); + } + if (mQATasks & taskClusterAttach) { for (int32_t i = 0; i < 4; i++) { snprintf(name, 2048, "cpadrow%d", i); mCPadRow[i] = createGarbageCollected(name, name, 0, 0, 700, 700. * 2. 
/ 3.); @@ -3034,7 +3043,7 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) } } - mPClXY->cd(); + mPClXY->cd(); // TODO: This should become a separate task category mClXY->SetOption("colz"); mClXY->Draw(); mCClXY->cd(); @@ -3042,61 +3051,61 @@ int32_t GPUQA::DrawQAHistograms(TObjArray* qcout) if (mConfig.writeFileExt != "") { mCClXY->Print(Form("%s/clustersXY.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); } + } - if (mQATasks & taskClusterCounts) { - mClRej[2]->Divide(mClRej[1], mClRej[0]); + if (mQATasks & taskClusterRejection) { + mClRej[2]->Divide(mClRej[1], mClRej[0]); - for (int32_t i = 0; i < 3; i++) { - if (tout && !mConfig.inputHistogramsOnly) { - mClRej[i]->Write(); - } - mPClRej[i]->cd(); - mClRej[i]->SetTitle(REJECTED_NAMES[i]); - mClRej[i]->SetOption("colz"); - mClRej[i]->Draw(); - mCClRej[i]->cd(); - mCClRej[i]->Print(Form("%s/clustersRej%d%s.pdf", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i])); - if (mConfig.writeFileExt != "") { - mCClRej[i]->Print(Form("%s/clustersRej%d%s.%s", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i], mConfig.writeFileExt.c_str())); - } + for (int32_t i = 0; i < 3; i++) { + if (tout && !mConfig.inputHistogramsOnly) { + mClRej[i]->Write(); + } + mPClRej[i]->cd(); + mClRej[i]->SetTitle(REJECTED_NAMES[i]); + mClRej[i]->SetOption("colz"); + mClRej[i]->Draw(); + mCClRej[i]->cd(); + mCClRej[i]->Print(Form("%s/clustersRej%d%s.pdf", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i])); + if (mConfig.writeFileExt != "") { + mCClRej[i]->Print(Form("%s/clustersRej%d%s.%s", mConfig.plotsDir.c_str(), i, REJECTED_NAMES[i], mConfig.writeFileExt.c_str())); } + } - mPClRejP->cd(); - for (int32_t k = 0; k < ConfigNumInputs; k++) { - auto* tmp = mClRej[0]; - if (GetHist(tmp, tin, k, nNewInput) == nullptr) { - continue; - } - TH1D* proj1 = tmp->ProjectionY(Form("clrejptmp1%d", k)); // TODO: Clean up names - proj1->SetDirectory(nullptr); - tmp = mClRej[1]; - if (GetHist(tmp, tin, k, nNewInput) == nullptr) { - continue; - } - TH1D* proj2 = tmp->ProjectionY(Form("clrejptmp2%d", k)); - proj2->SetDirectory(nullptr); + mPClRejP->cd(); + for (int32_t k = 0; k < ConfigNumInputs; k++) { + auto* tmp = mClRej[0]; + if (GetHist(tmp, tin, k, nNewInput) == nullptr) { + continue; + } + TH1D* proj1 = tmp->ProjectionY(Form("clrejptmp1%d", k)); // TODO: Clean up names + proj1->SetDirectory(nullptr); + tmp = mClRej[1]; + if (GetHist(tmp, tin, k, nNewInput) == nullptr) { + continue; + } + TH1D* proj2 = tmp->ProjectionY(Form("clrejptmp2%d", k)); + proj2->SetDirectory(nullptr); - auto* e = mClRejP; - if (GetHist(e, tin, k, nNewInput) == nullptr) { - continue; - } - e->Divide(proj2, proj1); - if (tout && !mConfig.inputHistogramsOnly && k == 0) { - e->Write(); - } - delete proj1; - delete proj2; - e->SetMinimum(-0.02); - e->SetMaximum(0.22); - e->SetTitle("Rejected Clusters"); - e->GetXaxis()->SetTitle("Pad Row"); - e->GetYaxis()->SetTitle("Rejected Clusters (fraction)"); - e->Draw(k == 0 ? 
"" : "same"); + auto* e = mClRejP; + if (GetHist(e, tin, k, nNewInput) == nullptr) { + continue; } - mPClRejP->Print(Form("%s/clustersRejProjected.pdf", mConfig.plotsDir.c_str())); - if (mConfig.writeFileExt != "") { - mPClRejP->Print(Form("%s/clustersRejProjected.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); + e->Divide(proj2, proj1); + if (tout && !mConfig.inputHistogramsOnly && k == 0) { + e->Write(); } + delete proj1; + delete proj2; + e->SetMinimum(-0.02); + e->SetMaximum(0.22); + e->SetTitle("Rejected Clusters"); + e->GetXaxis()->SetTitle("Pad Row"); + e->GetYaxis()->SetTitle("Rejected Clusters (fraction)"); + e->Draw(k == 0 ? "" : "same"); + } + mPClRejP->Print(Form("%s/clustersRejProjected.pdf", mConfig.plotsDir.c_str())); + if (mConfig.writeFileExt != "") { + mPClRejP->Print(Form("%s/clustersRejProjected.%s", mConfig.plotsDir.c_str(), mConfig.writeFileExt.c_str())); } } diff --git a/GPU/GPUTracking/qa/GPUQA.h b/GPU/GPUTracking/qa/GPUQA.h index b42fa804c6212..3dd49e2ec1373 100644 --- a/GPU/GPUTracking/qa/GPUQA.h +++ b/GPU/GPUTracking/qa/GPUQA.h @@ -56,6 +56,10 @@ class GPUQA static bool QAAvailable() { return false; } static bool IsInitialized() { return false; } void UpdateChain(GPUChainTracking* chain) {} + + enum QA_TASKS { + tasksAutomatic = 0 + }; }; } // namespace o2::gpu @@ -146,16 +150,20 @@ class GPUQA static constexpr int32_t MC_LABEL_INVALID = -1e9; - enum QA_TASKS { + enum QA_TASKS { // TODO: make this in32_t typed taskTrackingEff = 1, taskTrackingRes = 2, taskTrackingResPull = 4, + tasksAllMC = 8 - 1, taskClusterAttach = 8, taskTrackStatistics = 16, taskClusterCounts = 32, - taskDefault = 63, - taskDefaultPostprocess = 31, - tasksNoQC = 56 + taskClusterRejection = 64, + tasksAll = 128 - 1, + tasksDefault = tasksAll, + tasksDefaultPostprocess = tasksDefault & ~taskClusterCounts, + tasksAllNoQC = tasksAll & ~tasksAllMC, + tasksAutomatic = -1 }; private: diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index d7ea772c31653..fb1d489a8479d 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -180,7 +180,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic) mConfig->configQA.shipToQC = true; if (!mConfig->configProcessing.runQA) { mConfig->configQA.enableLocalOutput = false; - mQATaskMask = (mSpecConfig.processMC ? 15 : 0) | (mConfig->configQA.clusterRejectionHistograms ? 32 : 0); + mQATaskMask = (mSpecConfig.processMC ? 15 : 0) | (mConfig->configQA.clusterRejectionHistograms ? 32 : 0); // TODO: Clean up using numeric flags! 
mConfig->configProcessing.runQA = -mQATaskMask; } } From 547159d37f7a3f15e3b4545c383b421f9f6144f7 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Wed, 29 Oct 2025 14:45:23 +0100 Subject: [PATCH 6/6] GPU TPC: Fix deterministic mode in combination with MC label propagation --- GPU/GPUTracking/Global/GPUChainTracking.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 92 ++++++++++++++----- 2 files changed, 72 insertions(+), 21 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 8de49cc954e35..4b07aadfad357 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -306,6 +306,7 @@ class GPUChainTracking : public GPUChain void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts); bool NeedTPCClustersOnGPU(); void WriteReducedClusters(); + void SortClusters(bool buildNativeGPU, bool propagateMCLabels, o2::tpc::ClusterNativeAccess* clusterAccess, o2::tpc::ClusterNative* clusters); template int32_t RunTRDTrackingInternal(); uint32_t StreamForSector(uint32_t sector) const; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fdce8ef5a127d..c4566ffb968a7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -57,6 +57,8 @@ #include "utils/VcShim.h" #include "utils/strtag.h" #include +#include +#include using namespace o2::gpu; using namespace o2::tpc; @@ -762,14 +764,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) ClusterNative* tmpNativeClusters = nullptr; std::unique_ptr tmpNativeClusterBuffer; - // setup MC Labels - bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; + const bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); + const bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output + const bool propagateMCLabels = buildNativeHost && GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; + const bool sortClusters = buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4); auto* digitsMC = propagateMCLabels ?
processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; - bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); - bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output - mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; if (buildNativeGPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); @@ -1281,21 +1282,20 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } ClusterNativeAccess::ConstMCLabelContainerView* mcLabelsConstView = nullptr; - if (propagateMCLabels) { - // TODO: write to buffer directly + if (propagateMCLabels) { // TODO: write to buffer directly o2::dataformats::MCTruthContainer mcLabels; std::pair buffer; - if (!GetProcessingSettings().tpcWriteClustersAfterRejection && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)] && mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->useExternal()) { - if (!mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator) { + auto& labelOutputControl = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]; + if (!GetProcessingSettings().tpcWriteClustersAfterRejection && !sortClusters && labelOutputControl && labelOutputControl->useExternal()) { + if (!labelOutputControl->allocator) { throw std::runtime_error("Cluster MC Label buffer missing"); } - ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast(mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]->allocator(0)); + ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* container = reinterpret_cast(labelOutputControl->allocator(0)); buffer = {&container->first, &container->second}; } else { mIOMem.clusterNativeMCView = std::make_unique(); mIOMem.clusterNativeMCBuffer = std::make_unique(); - buffer.first = mIOMem.clusterNativeMCBuffer.get(); - buffer.second = mIOMem.clusterNativeMCView.get(); + buffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()}; } assert(propagateMCLabels ? 
mcLinearLabels.header.size() == nClsTotal : true); @@ -1350,15 +1350,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU && synchronizeCalibUpdate) { SynchronizeStream(0); } - if (buildNativeHost && (GetProcessingSettings().deterministicGPUReconstruction || GetProcessingSettings().debugLevel >= 4)) { - for (uint32_t i = 0; i < NSECTORS; i++) { - for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { - std::sort(&tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j]], &tmpNativeClusters[tmpNativeAccess->clusterOffset[i][j] + tmpNativeAccess->nClusters[i][j]]); - } - } - if (buildNativeGPU) { - GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)tmpNativeClusters, nClsTotal * sizeof(tmpNativeClusters[0]), -1, true); - } + if (sortClusters) { + SortClusters(buildNativeGPU, propagateMCLabels, tmpNativeAccess, tmpNativeClusters); } mRec->MemoryScalers()->nTPCHits = nClsTotal; mRec->PopNonPersistentMemory(RecoStep::TPCClusterFinding, qStr2Tag("TPCCLUST")); @@ -1374,3 +1367,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif return 0; } + +void GPUChainTracking::SortClusters(bool buildNativeGPU, bool propagateMCLabels, ClusterNativeAccess* clusterAccess, ClusterNative* clusters) +{ + if (propagateMCLabels) { + std::vector clsOrder(clusterAccess->nClustersTotal); + std::iota(clsOrder.begin(), clsOrder.end(), 0); + std::vector tmpClusters; + for (uint32_t i = 0; i < NSECTORS; i++) { + for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { + const uint32_t offset = clusterAccess->clusterOffset[i][j]; + std::sort(&clsOrder[offset], &clsOrder[offset + clusterAccess->nClusters[i][j]], [&clusters](const uint32_t a, const uint32_t b) { + return clusters[a] < clusters[b]; + }); + tmpClusters.resize(clusterAccess->nClusters[i][j]); + memcpy(tmpClusters.data(), &clusters[offset], clusterAccess->nClusters[i][j] * sizeof(tmpClusters[0])); + for (uint32_t k = 0; k < tmpClusters.size(); k++) { + clusters[offset + k] = tmpClusters[clsOrder[offset + k] - offset]; + } + } + } + tmpClusters.clear(); + + std::pair labelBuffer; + GPUOutputControl* labelOutput = mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clusterLabels)]; + std::unique_ptr tmpUniqueContainerView; + std::unique_ptr tmpUniqueContainerBuffer; + if (labelOutput && labelOutput->allocator) { + ClusterNativeAccess::ConstMCLabelContainerViewWithBuffer* labelContainer = reinterpret_cast(labelOutput->allocator(0)); + labelBuffer = {&labelContainer->first, &labelContainer->second}; + } else { + tmpUniqueContainerView = std::move(mIOMem.clusterNativeMCView); + tmpUniqueContainerBuffer = std::move(mIOMem.clusterNativeMCBuffer); + mIOMem.clusterNativeMCView = std::make_unique(); + mIOMem.clusterNativeMCBuffer = std::make_unique(); + labelBuffer = {mIOMem.clusterNativeMCBuffer.get(), mIOMem.clusterNativeMCView.get()}; + } + + o2::dataformats::MCLabelContainer tmpContainer; + for (uint32_t i = 0; i < clusterAccess->nClustersTotal; i++) { + for (const auto& element : clusterAccess->clustersMCTruth->getLabels(clsOrder[i])) { + tmpContainer.addElement(i, element); + } + } + tmpContainer.flatten_to(*labelBuffer.first); + *labelBuffer.second = *labelBuffer.first; + clusterAccess->clustersMCTruth = labelBuffer.second; + } else { + for (uint32_t i = 0; i < NSECTORS; i++) { + for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { + std::sort(&clusters[clusterAccess->clusterOffset[i][j]], &clusters[clusterAccess->clusterOffset[i][j] + 
clusterAccess->nClusters[i][j]]); + } + } + } + if (buildNativeGPU) { + GPUMemCpy(RecoStep::TPCClusterFinding, (void*)mInputsShadow->mPclusterNativeBuffer, (const void*)clusters, clusterAccess->nClustersTotal * sizeof(clusters[0]), -1, true); + } +}
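The MC-label branch of the new SortClusters() above sorts an index permutation per pad row, gathers the clusters through it, and rebuilds the label container in the same order, so entry k of the cluster array and of the label container stay aligned. A minimal self-contained sketch of this gather-based reordering, under simplified assumptions (plain structs and nested vectors stand in for ClusterNative and MCTruthContainer; the comparator is a stand-in for ClusterNative's ordering):

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

struct Cluster {
  float pad, time;
  bool operator<(const Cluster& o) const { return time != o.time ? time < o.time : pad < o.pad; }
};

// Sort clusters[offset .. offset + n) and apply the same permutation to the
// per-cluster label lists, so clusters[k] and labels[k] stay aligned.
void sortRangeWithLabels(std::vector<Cluster>& clusters, std::vector<std::vector<int32_t>>& labels, uint32_t offset, uint32_t n)
{
  std::vector<uint32_t> order(n);
  std::iota(order.begin(), order.end(), 0); // identity permutation, like clsOrder above
  std::sort(order.begin(), order.end(), [&](uint32_t a, uint32_t b) { return clusters[offset + a] < clusters[offset + b]; });
  std::vector<Cluster> tmp(clusters.begin() + offset, clusters.begin() + offset + n);
  std::vector<std::vector<int32_t>> tmpLabels(labels.begin() + offset, labels.begin() + offset + n);
  for (uint32_t k = 0; k < n; k++) { // gather: position k receives old element order[k]
    clusters[offset + k] = tmp[order[k]];
    labels[offset + k] = std::move(tmpLabels[order[k]]);
  }
}

Sorting the index array instead of the payload keeps the cluster-to-label association explicit and avoids having to co-sort two containers on a shared key.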