From 339b4a0225bf693c8687a9def3653d10a3e8bf3c Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 8 May 2025 00:18:02 +0200
Subject: [PATCH 1/2] GPU: Add debug dumps for compressed / uncompressed TPC
 clusters

---
 GPU/GPUTracking/CMakeLists.txt                |  1 +
 .../DataCompression/GPUTPCCompression.cxx     | 56 +++++++++++++
 .../DataCompression/GPUTPCCompression.h       |  4 +
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  2 +-
 GPU/GPUTracking/Global/GPUChain.h             | 31 +++-----
 GPU/GPUTracking/Global/GPUChainTracking.h     |  1 +
 .../Global/GPUChainTrackingClusterizer.cxx    | 19 ++---
 .../Global/GPUChainTrackingCompression.cxx    |  3 +
 .../Global/GPUChainTrackingDebug.h            | 79 +++++++++++++++++++
 .../GPUChainTrackingDebugAndProfiling.cxx     | 15 ++++
 .../Global/GPUChainTrackingMerger.cxx         | 19 ++---
 .../Global/GPUChainTrackingSectorTracker.cxx  | 15 ++--
 .../TPCClusterFinder/GPUTPCClusterFinder.h    |  1 +
 .../GPUTPCClusterFinderDump.cxx               |  2 +-
 14 files changed, 201 insertions(+), 47 deletions(-)
 create mode 100644 GPU/GPUTracking/Global/GPUChainTrackingDebug.h

diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt
index b2852389398d0..2e26622d05291 100644
--- a/GPU/GPUTracking/CMakeLists.txt
+++ b/GPU/GPUTracking/CMakeLists.txt
@@ -138,6 +138,7 @@ set(HDRS_INSTALL
     Definitions/GPULogging.h
     Definitions/GPUSettingsList.h
     Global/GPUChainTrackingDefs.h
+    Global/GPUChainTrackingDebug.h
     Global/GPUChainTrackingGetters.inc
     Global/GPUErrorCodes.h
     Merger/GPUTPCGMBorderTrack.h
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
index 2a0c5b58d8a83..a107f749ddd77 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
@@ -130,3 +130,59 @@ void GPUTPCCompression::SetMaxData(const GPUTrackingInOutPointers& io)
     mMaxClusters += 16 - (mMaxClusters % 16);
   }
 }
+
+void GPUTPCCompression::DumpCompressedClusters(std::ostream& out) // Debug dump: writes the compressed clusters referenced by mOutputFlat to `out` as text
+{
+  const o2::tpc::CompressedClusters O = *mOutputFlat;
+  out << "\n\nCompressed Clusters:\n";
+  out << O.nTracks << " Tracks\n";
+  out << "Slice Row Clusters:\n";
+  for (uint32_t i = 0; i < NSECTORS; i++) {
+    out << "Sector " << i << ": ";
+    for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+      out << O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] << ", ";
+    }
+    out << "\n";
+  }
+  out << "\nTrack Clusters:\n";
+  for (uint32_t i = 0; i < O.nTracks; i++) {
+    if (i && i % 100 == 0) {
+      out << "\n";
+    }
+    out << O.nTrackClusters[i] << ", ";
+  }
+  out << "\n\nUnattached Clusters\n";
+  uint32_t offset = 0; // running start index of the current sector / row inside the unattached (...U) arrays
+  for (uint32_t i = 0; i < NSECTORS; i++) {
+    for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+      out << "Sector " << i << " Row " << j << ": ";
+      for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) {
+        if (k && k % 10 == 0) {
+          out << "\n    ";
+        }
+        const uint32_t l = k + offset;
+        out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] ";
+      }
+      offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
+      out << "\n";
+    }
+  }
+  out << "\n\nAttached Clusters\n";
+  offset = 0; // from here on: running start index of the current track inside the attached (...A) per-cluster arrays
+  for (uint32_t i = 0; i < O.nTracks; i++) {
+    out << "Track " << i << ": {" << (uint32_t)O.qPtA[i] << ", " << (uint32_t)O.rowA[i] << ", " << (uint32_t)O.sliceA[i] << ", " << (uint32_t)O.timeA[i] << ", " << (uint32_t)O.padA[i] << "} - ";
+    for (uint32_t k = 0; k < O.nTrackClusters[i]; k++) {
+      if (k && k % 10 == 0) {
+        out << "\n    ";
+      }
+      const uint32_t l1 = k + offset, l2 = k - 1 + offset - i; // l1: index in the per-cluster arrays; l2: index in the diff / residual arrays, which hold no entry for the first cluster of each track (l2 is only read for k > 0)
+      out << "[";
+      if (k) {
+        out << (int32_t)O.rowDiffA[l2] << ", " << (int32_t)O.sliceLegDiffA[l2] << ", " << (uint32_t)O.padResA[l2] << ", " << (uint32_t)O.timeResA[l2] << ", ";
+      }
+      out << (uint32_t)O.qTotA[l1] << ", " << (uint32_t)O.qMaxA[l1] << ", " << (uint32_t)O.flagsA[l1] << ", " << (uint32_t)O.sigmaPadA[l1] << ", " << (uint32_t)O.sigmaTimeA[l1] << "] ";
+    }
+    offset += O.nTrackClusters[i];
+    out << "\n";
+  }
+}
diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.h b/GPU/GPUTracking/DataCompression/GPUTPCCompression.h
index c1d9fe283fbea..52585b4c08b24 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.h
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.h
@@ -57,6 +57,10 @@ class GPUTPCCompression : public GPUProcessor
   GPUd() static void truncateSignificantBitsChargeMax(uint16_t& charge, const GPUParam& param) { truncateSignificantBits(charge, param.rec.tpc.sigBitsCharge, P_MAX_QMAX); }
   GPUd() static void truncateSignificantBitsWidth(uint8_t& width, const GPUParam& param) { truncateSignificantBits(width, param.rec.tpc.sigBitsWidth, P_MAX_SIGMA); }
 
+#ifndef GPUCA_GPUCODE
+  void DumpCompressedClusters(std::ostream& out);
+#endif
+
  protected:
   struct memory {
     uint32_t nStoredTracks = 0;
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 4c32c3e46e3a7..638a3ed43d2aa 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -295,7 +295,7 @@ AddOption(trdNCandidates, int32_t, 3, "", 0, "Number of branching track candidat
 AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GPU track model for TRD tracking")
 AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
 AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
-AddOption(debugMask, int32_t, 262143, "", 0, "Mask for debug output dumps to file")
+AddOption(debugMask, uint32_t, 262143, "", 0, "Mask for debug output dumps to file")
 AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
 AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
 AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h
index 5df324fcba648..1f88d3dc3aba1 100644
--- a/GPU/GPUTracking/Global/GPUChain.h
+++ b/GPU/GPUTracking/Global/GPUChain.h
@@ -20,6 +20,7 @@
 #include "GPUKernelClassesFwd.h"
 
 #include <ctime>
+#include <functional>
 
 namespace o2::gpu
 {
@@ -226,12 +227,19 @@ class GPUChain
   virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }
 
   template <class T, class S, typename... Args>
-  bool DoDebugAndDump(RecoStep step, int32_t mask, T& processor, S T::*func, Args&&... args)
+  bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::* func, Args&&... args)
   {
     return DoDebugAndDump(step, mask, true, processor, func, args...);
   }
   template <class T, class S, typename... Args>
-  bool DoDebugAndDump(RecoStep step, int32_t mask, bool transfer, T& processor, S T::*func, Args&&... args);
+  bool DoDebugAndDump(RecoStep step, uint32_t mask, bool transfer, T& processor, S T::* func, Args&&... args);
+  template <typename... Args>
+  bool DoDebugDump(uint32_t mask, std::function<void(Args&...)> func, Args&... args);
+  template <class S, typename... Args>
+  bool DoDebugDump(uint32_t mask, S* func, Args&&... args) // Overload for a pointer to a callable (e.g. a static member function): wraps *func in a std::function and defers the mask / debug-level check to the std::function overload above
+  {
+    return DoDebugDump(mask, std::function<void(Args && ...)>([&func](Args&&... args_tmp) { (*func)(args_tmp...); }), args...); // NOTE(review): args are passed on as lvalues; presumably a call with rvalue arguments will not deduce against void(Args&...) - confirm before relying on it
+  }
 
   template <class T, class S, typename... Args>
   int32_t runRecoStep(RecoStep step, S T::*func, Args... args);
@@ -278,24 +286,7 @@ inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args...
 }
 
 template <class T, class S, typename... Args>
-bool GPUChain::DoDebugAndDump(GPUChain::RecoStep step, int32_t mask, bool transfer, T& processor, S T::*func, Args&&... args)
-{
-  if (GetProcessingSettings().keepAllMemory) {
-    if (transfer) {
-      TransferMemoryResourcesToHost(step, &processor, -1, true);
-    }
-    if (GetProcessingSettings().debugLevel >= 6 && (mask == 0 || (GetProcessingSettings().debugMask & mask))) {
-      if (func) {
-        (processor.*func)(args...);
-      }
-      return true;
-    }
-  }
-  return false;
-}
-
-template <class T, class S, typename... Args>
-int32_t GPUChain::runRecoStep(RecoStep step, S T::*func, Args... args)
+inline int32_t GPUChain::runRecoStep(RecoStep step, S T::* func, Args... args)
 {
   if (GetRecoSteps().isSet(step)) {
     auto* timer = GetProcessingSettings().recoTaskTiming ? &mRec->getRecoStepTimer(step) : nullptr;
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 8664652b549e3..13773a97d4e3d 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -234,6 +234,7 @@ class GPUChainTracking : public GPUChain
   void PrepareDebugOutput();
   void PrintDebugOutput();
   void PrintOutputStat();
+  static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
 
   bool ValidateSteps();
   bool ValidateSettings();
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index f188388e76a02..2cdd1bb76bf00 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -14,6 +14,7 @@
 
 #include "GPUChainTracking.h"
 #include "GPUChainTrackingDefs.h"
+#include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
@@ -813,7 +814,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (fragment.index == 0) {
           runKernel<GPUMemClean16>({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy));
         }
-        DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges");
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererZeroedCharges, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges");
 
         if (doGPU) {
           if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) {
@@ -900,7 +901,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         if (!mIOPtrs.tpcZS) {
           runKernel<GPUTPCCFChargeMapFiller, GPUTPCCFChargeMapFiller::fillFromDigits>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
         }
-        if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
+        if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererDigits, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) {
           clusterer.DumpChargeMap(*mDebugFile, "Charges");
         }
 
@@ -919,13 +920,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
 
         runKernel<GPUTPCCFPeakFinder>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
-        if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
+        if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) {
           clusterer.DumpPeakMap(*mDebugFile, "Peaks");
         }
 
         RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane);
         TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
-        DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererPeaks, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off
       });
       mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
         uint32_t iSector = iSectorBase + lane;
@@ -939,13 +940,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::noiseSuppression>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
         runKernel<GPUTPCCFNoiseSuppression, GPUTPCCFNoiseSuppression::updatePeaks>({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSector}});
-        if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
+        if (DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererSuppressedPeaks, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) {
           clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks");
         }
 
         RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane);
         TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane);
-        DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererSuppressedPeaks, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off
       });
       mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) {
         uint32_t iSector = iSectorBase + lane;
@@ -979,7 +980,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
           if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
             runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
-            DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+            DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
           }
 
           // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
@@ -1092,7 +1093,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 #endif
         } else {
           runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
-          DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+          DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
           runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
         }
 
@@ -1111,7 +1112,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane);
         laneHasData[lane] = true;
         // Include clusters in default debug mask, exclude other debug output by default
-        DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
+        DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererClusters, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off
       });
       mRec->SetNActiveThreadsOuterLoop(1);
     }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 24c74a661f18e..534c02a4c0a84 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -13,6 +13,7 @@
 /// \author David Rohr
 
 #include "GPUChainTracking.h"
+#include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
 #include "GPUTrackingInputProvider.h"
@@ -202,6 +203,7 @@ int32_t GPUChainTracking::RunTPCCompression()
     ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
+  DoDebugAndDump(RecoStep::TPCCompression, GPUChainTrackingDebugFlags::TPCCompressedClusters, Compressor, &GPUTPCCompression::DumpCompressedClusters, *mDebugFile);
   return 0;
 }
 
@@ -425,5 +427,6 @@ int32_t GPUChainTracking::RunTPCDecompression()
     }
     mRec->PopNonPersistentMemory(RecoStep::TPCDecompression, qStr2Tag("TPCDCMPR"));
   }
+  DoDebugDump(GPUChainTrackingDebugFlags::TPCDecompressedClusters, &GPUChainTracking::DumpClusters, *mDebugFile, mIOPtrs.clustersNative);
   return 0;
 }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
new file mode 100644
index 0000000000000..952a6c088ae8b
--- /dev/null
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
@@ -0,0 +1,79 @@
+// Copyright 2019-2020 CERN and copyright holders of ALICE O2.
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders.
+// All rights not expressly granted are reserved.
+//
+// This software is distributed under the terms of the GNU General Public
+// License v3 (GPL Version 3), copied verbatim in the file "COPYING".
+//
+// In applying this license CERN does not waive the privileges and immunities
+// granted to it by virtue of its status as an Intergovernmental Organization
+// or submit itself to any jurisdiction.
+
+/// \file GPUChainTrackingDebug.h
+/// \author David Rohr
+
+#ifndef GPUCHAINTRACKINGDEBUG_H
+#define GPUCHAINTRACKINGDEBUG_H
+
+#include <cstdint>
+#include <functional>
+#include <fstream>
+
+namespace o2::gpu
+{
+// NOTE: Values below 262144 are activated by default with --debug 6 in GPUSettingsList.h::debugMask
+enum GPUChainTrackingDebugFlags : uint32_t { // one bit per debug dump, tested against the debugMask setting
+  TPCSectorTrackingData = 1u << 0,
+  TPCPreLinks = 1u << 1,
+  TPCLinks = 1u << 2,
+  TPCStartHits = 1u << 3,
+  TPCTracklets = 1u << 4,
+  TPCSectorTracks = 1u << 5,
+  TPCHitWeights = 1u << 8,
+  TPCCompressedClusters = 1u << 9,
+  TPCDecompressedClusters = 1u << 10,
+  TPCMergingRanges = 1u << 11,
+  TPCMergingSectorTracks = 1u << 12,
+  TPCMergingMergedTracks = 1u << 13,
+  TPCMergingCollectedTracks = 1u << 14,
+  TPCMergingCE = 1u << 15,
+  TPCMergingRefit = 1u << 16,
+  TPCClustererClusters = 1u << 17,
+  TPCClusterer = 1u << 18, // 262144: first bit that is not part of the default debugMask (262143)
+  TPCClustererDigits = 1u << 19,
+  TPCClustererPeaks = 1u << 20,
+  TPCClustererSuppressedPeaks = 1u << 21,
+  TPCClustererChargeMap = 1u << 22,
+  TPCClustererZeroedCharges = 1u << 23
+};
+
+template <class T, class S, typename... Args>
+inline bool GPUChain::DoDebugAndDump(GPUChain::RecoStep step, uint32_t mask, bool transfer, T& processor, S T::* func, Args&&... args)
+{
+  if (GetProcessingSettings().keepAllMemory) {
+    if (transfer) {
+      TransferMemoryResourcesToHost(step, &processor, -1, true); // done whenever keepAllMemory is set, independent of the debug level / mask check below
+    }
+    std::function<void(Args && ...)> lambda = [&processor, &func](Args&... args_tmp) { // wrap the member-function dump so that the debug level / mask decision is taken in DoDebugDump only
+      if (func) { // a null func dumps nothing; the return value is still decided by DoDebugDump
+        (processor.*func)(args_tmp...);
+      }
+    };
+    return DoDebugDump(mask, lambda, args...); // true if the dump was enabled for this mask
+  }
+  return false; // keepAllMemory is off: neither transfer nor dump
+}
+
+template <typename... Args>
+inline bool GPUChain::DoDebugDump(uint32_t mask, std::function<void(Args&...)> func, Args&... args)
+{
+  const bool enabled = GetProcessingSettings().debugLevel >= 6 && (mask == 0 || (GetProcessingSettings().debugMask & mask)); // dump only at debugLevel >= 6; mask == 0 means "not filtered by debugMask"
+  if (enabled) {
+    func(args...);
+  }
+  return enabled; // tells the caller whether the dump ran, so it can append follow-up dumps
+}
+
+} // namespace o2::gpu
+
+#endif
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 53bdfbadd4b25..903505068ad2c 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -333,3 +333,18 @@ void GPUChainTracking::RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* cluster
     }
   }
 }
+
+void GPUChainTracking::DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters) // Debug dump: writes every cluster of every sector / row in `clusters` to `out` as text
+{
+  out << "\nTPC Clusters:\n";
+  for (uint32_t sector = 0; sector < GPUCA_NSECTORS; sector++) {
+    out << "TPCClusters - Sector " << sector << "\n";
+    for (uint32_t row = 0; row < GPUCA_ROW_COUNT; row++) {
+      out << "  Row: " << row << ": " << clusters->nClusters[sector][row] << " clusters:\n";
+      for (uint32_t k = 0; k < clusters->nClusters[sector][row]; k++) {
+        const auto& c = clusters->clusters[sector][row][k]; // one line per cluster: timeFlagsPacked in hex, all other fields in decimal
+        out << "    " << std::hex << c.timeFlagsPacked << std::dec << " " << c.padPacked << " " << static_cast<int32_t>(c.sigmaTimePacked) << " " << static_cast<int32_t>(c.sigmaPadPacked) << " " << c.qMax << " " << c.qTot << "\n";
+      }
+    }
+  }
+}
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
index a38148ccb375a..6e86be03e7950 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingMerger.cxx
@@ -13,6 +13,7 @@
 /// \author David Rohr
 
 #include "GPUChainTracking.h"
+#include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUDefParametersRuntime.h"
 #include "GPUO2DataTypes.h"
@@ -72,7 +73,7 @@ void GPUChainTracking::RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSecto
       runKernel<GPUTPCGMMergerMergeBorders, 2>(GetGridAuto(0, deviceType), i, withinSector, mergeMode);
     }
   }
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSector, mergeMode);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRanges, doGPU, Merger, &GPUTPCGMMerger::DumpMergeRanges, *mDebugFile, withinSector, mergeMode);
   mRec->ReturnVolatileDeviceMemory();
 }
 
@@ -135,14 +136,14 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
   if (GetProcessingSettings().deterministicGPUReconstruction) {
     runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::sectorTracks>({{GPUCA_NSECTORS, -WarpSize(), 0, deviceType}}, 1);
   }
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpSectorTracks, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingSectorTracks, doGPU, Merger, &GPUTPCGMMerger::DumpSectorTracks, *mDebugFile);
 
   runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto(0, deviceType), false);
   runKernel<GPUMemClean16>({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), NSECTORS * sizeof(*MergerShadowAll.TmpCounter()));
   runKernel<GPUTPCGMMergerMergeWithinPrepare>(GetGridAuto(0, deviceType));
   RunTPCTrackingMerger_MergeBorderTracks(1, 0, deviceType);
   RunTPCTrackingMerger_Resolve(0, 1, deviceType);
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSectors, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMergedTracks, doGPU, Merger, &GPUTPCGMMerger::DumpMergedWithinSectors, *mDebugFile);
 
   runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto(0, deviceType), false);
   runKernel<GPUMemClean16>({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSECTORS * sizeof(*MergerShadowAll.TmpCounter()));
@@ -157,7 +158,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
   runKernel<GPUTPCGMMergerMergeSectorsPrepare>(GetGridBlk(std::max(2u, numBlocks), 0, deviceType), 0, 1, 1);
   RunTPCTrackingMerger_MergeBorderTracks(0, -1, deviceType);
   RunTPCTrackingMerger_Resolve(0, 1, deviceType);
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSectors, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingMergedTracks, doGPU, Merger, &GPUTPCGMMerger::DumpMergedBetweenSectors, *mDebugFile);
 
   runKernel<GPUMemClean16>({{1, -WarpSize(), 0, deviceType, RecoStep::TPCMerging}}, MergerShadowAll.TmpCounter(), 2 * NSECTORS * sizeof(*MergerShadowAll.TmpCounter()));
 
@@ -167,14 +168,14 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::extrapolatedTracks1>({{1, -WarpSize(), 0, deviceType}}, 1);
     runKernel<GPUTPCGlobalDebugSortKernels, GPUTPCGlobalDebugSortKernels::extrapolatedTracks2>({{1, -WarpSize(), 0, deviceType}}, 1);
   }
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingCollectedTracks, doGPU, Merger, &GPUTPCGMMerger::DumpCollected, *mDebugFile);
 
   if (param().rec.tpc.mergeCE) {
     runKernel<GPUTPCGMMergerClearLinks>(GetGridAuto(0, deviceType), true);
     RunTPCTrackingMerger_MergeBorderTracks(-1, 1, deviceType);
     RunTPCTrackingMerger_MergeBorderTracks(-1, 2, deviceType);
     runKernel<GPUTPCGMMergerMergeCE>(GetGridAuto(0, deviceType));
-    DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile);
+    DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingCE, doGPU, Merger, &GPUTPCGMMerger::DumpMergeCE, *mDebugFile);
   }
   int32_t waitForTransfer = 0;
   if (doGPU) {
@@ -201,7 +202,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
   runKernel<GPUTPCGMMergerPrepareClusters, 1>(GetGridAuto(0, deviceType));
   runKernel<GPUTPCGMMergerPrepareClusters, 2>(GetGridAuto(0, deviceType));
 
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRefit, doGPU, Merger, &GPUTPCGMMerger::DumpFitPrepare, *mDebugFile);
 
   if (doGPU) {
     CondWaitEvent(waitForTransfer, &mEvents->single);
@@ -227,7 +228,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     runKernel<GPUTPCGMMergerFollowLoopers>(GetGridAuto(0));
   }
 
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, Merger, &GPUTPCGMMerger::DumpRefit, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRefit, Merger, &GPUTPCGMMerger::DumpRefit, *mDebugFile);
   runKernel<GPUTPCGMMergerFinalize, 0>(GetGridAuto(0, deviceType));
   runKernel<GPUTPCGMMergerFinalize, 1>(GetGridAuto(0, deviceType));
   runKernel<GPUTPCGMMergerFinalize, 2>(GetGridAuto(0, deviceType));
@@ -240,7 +241,7 @@ int32_t GPUChainTracking::RunTPCTrackingMerger(bool synchronizeOutput)
     runKernel<GPUTPCGMMergerMergeLoopers, 1>(GetGridAuto(0, deviceType));
     runKernel<GPUTPCGMMergerMergeLoopers, 2>(doGPU ? GetGrid(Merger.Memory()->nLooperMatchCandidates, 0, deviceType) : GetGridAuto(0, deviceType));
   }
-  DoDebugAndDump(RecoStep::TPCMerging, 2048, doGPU, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile);
+  DoDebugAndDump(RecoStep::TPCMerging, GPUChainTrackingDebugFlags::TPCMergingRefit, doGPU, Merger, &GPUTPCGMMerger::DumpFinal, *mDebugFile);
 
   if (doGPU) {
     RecordMarker(&mEvents->single, 0);
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
index 635641c00ae14..ef38d53173c2b 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingSectorTracker.cxx
@@ -13,6 +13,7 @@
 /// \author David Rohr
 
 #include "GPUChainTracking.h"
+#include "GPUChainTrackingDebug.h"
 #include "GPULogging.h"
 #include "GPUO2DataTypes.h"
 #include "GPUMemorySizeScalers.h"
@@ -176,7 +177,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
 
     if (GetProcessingSettings().debugLevel >= 6) {
       *mDebugFile << "\n\nReconstruction: Sector " << iSector << "/" << NSECTORS << std::endl;
-      if (GetProcessingSettings().debugMask & 1) {
+      if (GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCSectorTrackingData) {
         if (doGPU) {
           TransferMemoryResourcesToHost(RecoStep::TPCSectorTracking, &trk, -1, true);
         }
@@ -191,13 +192,13 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
     if (GetProcessingSettings().keepDisplayMemory) {
       TransferMemoryResourcesToHost(RecoStep::TPCSectorTracking, &trk, -1, true);
       memcpy(trk.LinkTmpMemory(), mRec->Res(trk.MemoryResLinks()).Ptr(), mRec->Res(trk.MemoryResLinks()).Size());
-      if (GetProcessingSettings().debugMask & 2) {
+      if (GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCPreLinks) {
         trk.DumpLinks(*mDebugFile, 0);
       }
     }
 
     runKernel<GPUTPCNeighboursCleaner>({GetGridBlk(GPUCA_ROW_COUNT - 2, useStream), {iSector}});
-    DoDebugAndDump(RecoStep::TPCSectorTracking, 4, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1);
+    DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCLinks, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1);
 
     runKernel<GPUTPCStartHitsFinder>({GetGridBlk(GPUCA_ROW_COUNT - 6, useStream), {iSector}});
     if (mRec->getGPUParameters(doGPU).par_SORT_STARTHITS) {
@@ -206,7 +207,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
     if (GetProcessingSettings().deterministicGPUReconstruction) {
       runKernel<GPUTPCSectorDebugSortKernels, GPUTPCSectorDebugSortKernels::startHits>({GetGrid(1, 1, useStream), {iSector}});
     }
-    DoDebugAndDump(RecoStep::TPCSectorTracking, 32, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile);
+    DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCStartHits, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile);
 
     if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) {
       trk.UpdateMaxData();
@@ -215,8 +216,8 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
     }
 
     runKernel<GPUTPCTrackletConstructor>({GetGridAuto(useStream), {iSector}});
-    DoDebugAndDump(RecoStep::TPCSectorTracking, 128, trk, &GPUTPCTracker::DumpTrackletHits, *mDebugFile);
-    if (GetProcessingSettings().debugMask & 256 && GetProcessingSettings().deterministicGPUReconstruction < 2) {
+    DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCTracklets, trk, &GPUTPCTracker::DumpTrackletHits, *mDebugFile);
+    if (GetProcessingSettings().debugMask & GPUChainTrackingDebugFlags::TPCHitWeights && GetProcessingSettings().deterministicGPUReconstruction < 2) {
       trk.DumpHitWeights(*mDebugFile);
     }
 
@@ -230,7 +231,7 @@ int32_t GPUChainTracking::RunTPCTrackingSectors_internal()
     if (GetProcessingSettings().debugLevel >= 3) {
       GPUInfo("Sector %u, Number of tracks: %d", iSector, *trk.NTracks());
     }
-    DoDebugAndDump(RecoStep::TPCSectorTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
+    DoDebugAndDump(RecoStep::TPCSectorTracking, GPUChainTrackingDebugFlags::TPCSectorTracks, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile);
   });
   mRec->SetNActiveThreadsOuterLoop(1);
   if (error) {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
index 37399f5e4863f..35e2a7297338f 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h
@@ -36,6 +36,7 @@ class ConstMCTruthContainerView;
 namespace tpc
 {
 struct ClusterNative;
+struct ClusterNativeAccess;
 class Digit;
 } // namespace tpc
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
index da30375149b7c..d676cf9cd3887 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinderDump.cxx
@@ -157,7 +157,7 @@ void GPUTPCClusterFinder::DumpClusters(std::ostream& out)
 {
   out << "\nClusterer - Clusters - Sector " << mISector << " - Fragment " << mPmemory->fragment.index << "\n";
 
-  for (int32_t i = 0; i < GPUCA_ROW_COUNT; i++) {
+  for (uint32_t i = 0; i < GPUCA_ROW_COUNT; i++) {
     size_t N = mPclusterInRow[i];
     const tpc::ClusterNative* row = &mPclusterByRow[i * mNMaxClusterPerRow];
 

From 6127d41467765d2c26c40aff71d680fe57483d70 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Thu, 8 May 2025 13:13:24 +0000
Subject: [PATCH 2/2] Please consider the following formatting changes

---
 GPU/GPUTracking/Global/GPUChain.h              | 6 +++---
 GPU/GPUTracking/Global/GPUChainTrackingDebug.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h
index 1f88d3dc3aba1..aca1bb2420fb6 100644
--- a/GPU/GPUTracking/Global/GPUChain.h
+++ b/GPU/GPUTracking/Global/GPUChain.h
@@ -227,12 +227,12 @@ class GPUChain
   virtual int32_t DoStuckProtection(int32_t stream, deviceEvent event) { return 0; }
 
   template <class T, class S, typename... Args>
-  bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::* func, Args&&... args)
+  bool DoDebugAndDump(RecoStep step, uint32_t mask, T& processor, S T::*func, Args&&... args)
   {
     return DoDebugAndDump(step, mask, true, processor, func, args...);
   }
   template <class T, class S, typename... Args>
-  bool DoDebugAndDump(RecoStep step, uint32_t mask, bool transfer, T& processor, S T::* func, Args&&... args);
+  bool DoDebugAndDump(RecoStep step, uint32_t mask, bool transfer, T& processor, S T::*func, Args&&... args);
   template <typename... Args>
   bool DoDebugDump(uint32_t mask, std::function<void(Args&...)> func, Args&... args);
   template <class S, typename... Args>
@@ -286,7 +286,7 @@ inline void GPUChain::timeCpy(RecoStep step, int32_t toGPU, S T::*func, Args...
 }
 
 template <class T, class S, typename... Args>
-inline int32_t GPUChain::runRecoStep(RecoStep step, S T::* func, Args... args)
+inline int32_t GPUChain::runRecoStep(RecoStep step, S T::*func, Args... args)
 {
   if (GetRecoSteps().isSet(step)) {
     auto* timer = GetProcessingSettings().recoTaskTiming ? &mRec->getRecoStepTimer(step) : nullptr;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
index 952a6c088ae8b..810f40a1d8654 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebug.h
@@ -48,7 +48,7 @@ enum GPUChainTrackingDebugFlags : uint32_t {
 };
 
 template <class T, class S, typename... Args>
-inline bool GPUChain::DoDebugAndDump(GPUChain::RecoStep step, uint32_t mask, bool transfer, T& processor, S T::* func, Args&&... args)
+inline bool GPUChain::DoDebugAndDump(GPUChain::RecoStep step, uint32_t mask, bool transfer, T& processor, S T::*func, Args&&... args)
 {
   if (GetProcessingSettings().keepAllMemory) {
     if (transfer) {