From 9308c4c450bea828af5f618bd39a72a34b7cf20a Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 8 May 2025 21:16:48 +0200
Subject: [PATCH 1/4] GPU: Add debugSuffix option for debug files

---
 GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 +
 GPU/GPUTracking/Global/GPUChainTracking.cxx   | 6 +++---
 GPU/GPUTracking/Global/GPUChainTrackingIO.cxx | 2 --
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 638a3ed43d2aa..9e0aa32155f0d 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -296,6 +296,7 @@ AddOption(trdTrackModelO2, bool, false, "", 0, "Use O2 track model instead of GP
 AddOption(debugLevel, int32_t, -1, "debug", 'd', "Set debug level (-2 = silent, -1 = autoselect (-2 for O2, 0 for standalone))")
 AddOption(allocDebugLevel, int32_t, 0, "allocDebug", 0, "Some debug output for memory allocations (without messing with normal debug level)")
 AddOption(debugMask, uint32_t, 262143, "", 0, "Mask for debug output dumps to file")
+AddOption(debugLogSuffix, std::string, "", "debugSuffix", 0, "Suffix for debug log files with --debug 6")
 AddOption(serializeGPU, int8_t, 0, "", 0, "Synchronize after each kernel call (bit 1) and DMA transfer (bit 2) and identify failures")
 AddOption(recoTaskTiming, bool, 0, "", 0, "Perform summary timing after whole reconstruction tasks")
 AddOption(deterministicGPUReconstruction, int32_t, -1, "", 0, "Make CPU and GPU debug output comparable (sort / skip concurrent parts), -1 = automatic if debugLevel >= 6", def(1))
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index f8d4165477220..c1c3e368ce90c 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -378,7 +378,7 @@ int32_t GPUChainTracking::Init()
   }
 
   if (GetProcessingSettings().debugLevel >= 6) {
-    std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + ".out";
+    std::string filename = std::string(mRec->IsGPU() ? "GPU" : "CPU") + (mRec->slaveId() != -1 ? (std::string("_slave") + std::to_string(mRec->slaveId())) : std::string(mRec->slavesExist() ? "_master" : "")) + GetProcessingSettings().debugLogSuffix + ".out";
     mDebugFile->open(filename.c_str());
   }
 
@@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunChainFinalize()
 
     int32_t iKey;
     do {
-      Sleep(10);
+      usleep(10000);
       if (GetProcessingSettings().eventDisplay->EnableSendKey()) {
         iKey = kbhit() ? getch() : 0;
         if (iKey == 27) {
@@ -847,7 +847,7 @@ int32_t GPUChainTracking::RunChainFinalize()
           break;
         } else if (iKey) {
           while (GetProcessingSettings().eventDisplay->getSendKey() != 0) {
-            Sleep(1);
+            usleep(1000);
           }
           GetProcessingSettings().eventDisplay->setSendKey(iKey);
         }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
index 5e7672022b3ff..035e257ca7952 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingIO.cxx
@@ -47,8 +47,6 @@
 #include "TPCFastTransform.h"
 #include "CorrectionMapsHelper.h"
 
-#include "utils/linux_helpers.h"
-
 using namespace o2::gpu;
 
 #include "GPUO2DataTypes.h"

From 6e3244af8d8813b8c79b04f3aa138c30dd5cd9bf Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 8 May 2025 20:53:50 +0200
Subject: [PATCH 2/4] GPU: Add sorting of tracks of attached compressed
 clusters in deterministic mode

---
 .../DataCompression/GPUTPCCompression.cxx     | 26 ++++++------
 GPU/GPUTracking/Global/GPUChainTracking.h     |  1 +
 .../Global/GPUChainTrackingCompression.cxx    |  4 ++
 .../GPUChainTrackingDebugAndProfiling.cxx     | 42 +++++++++++++++++++
 4 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
index a107f749ddd77..82834a694d0ba 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompression.cxx
@@ -140,7 +140,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
   for (uint32_t i = 0; i < NSECTORS; i++) {
     out << "Sector " << i << ": ";
     for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
-      out << O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] << ", ";
+      out << (O.nSliceRowClusters ? O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j] : 0) << ", ";
     }
     out << "\n";
   }
@@ -153,18 +153,20 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
   }
   out << "\n\nUnattached Clusters\n";
   uint32_t offset = 0;
-  for (uint32_t i = 0; i < NSECTORS; i++) {
-    for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
-      out << "Sector " << i << " Row " << j << ": ";
-      for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) {
-        if (k && k % 10 == 0) {
-          out << "\n    ";
+  if (O.nSliceRowClusters) {
+    for (uint32_t i = 0; i < NSECTORS; i++) {
+      for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) {
+        out << "Sector " << i << " Row " << j << ": ";
+        for (uint32_t k = 0; k < O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j]; k++) {
+          if (k && k % 10 == 0) {
+            out << "\n    ";
+          }
+          const uint32_t l = k + offset;
+          out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] ";
         }
-        const uint32_t l = k + offset;
-        out << "[" << (uint32_t)O.qTotU[l] << ", " << (uint32_t)O.qMaxU[l] << ", " << (uint32_t)O.flagsU[l] << ", " << (int32_t)O.padDiffU[l] << ", " << (int32_t)O.timeDiffU[l] << ", " << (uint32_t)O.sigmaPadU[l] << ", " << (uint32_t)O.sigmaTimeU[l] << "] ";
+        offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
+        out << "\n";
       }
-      offset += O.nSliceRowClusters[i * GPUCA_ROW_COUNT + j];
-      out << "\n";
     }
   }
   out << "\n\nAttached Clusters\n";
@@ -175,7 +177,7 @@ void GPUTPCCompression::DumpCompressedClusters(std::ostream& out)
       if (k && k % 10 == 0) {
         out << "\n    ";
       }
-      const uint32_t l1 = k + offset, l2 = k + offset - i;
+      const uint32_t l1 = offset + k, l2 = offset - i + k - 1;
       out << "[";
       if (k) {
         out << (int32_t)O.rowDiffA[l2] << ", " << (int32_t)O.sliceLegDiffA[l2] << ", " << (uint32_t)O.padResA[l2] << ", " << (uint32_t)O.timeResA[l2] << ", ";
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 13773a97d4e3d..2a2996895dbcf 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -235,6 +235,7 @@ class GPUChainTracking : public GPUChain
   void PrintDebugOutput();
   void PrintOutputStat();
   static void DumpClusters(std::ostream& out, const o2::tpc::ClusterNativeAccess* clusters);
+  static void DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls);
 
   bool ValidateSteps();
   bool ValidateSettings();
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
index 534c02a4c0a84..3bcd2390eae52 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx
@@ -203,6 +203,10 @@ int32_t GPUChainTracking::RunTPCCompression()
     ((GPUChainTracking*)GetNextChainInQueue())->mRec->BlockStackedMemory(mRec);
   }
   mRec->PopNonPersistentMemory(RecoStep::TPCCompression, qStr2Tag("TPCCOMPR"));
+  if (GetProcessingSettings().deterministicGPUReconstruction) {
+    SynchronizeGPU();
+    DebugSortCompressedClusters(Compressor.mOutputFlat);
+  }
   DoDebugAndDump(RecoStep::TPCCompression, GPUChainTrackingDebugFlags::TPCCompressedClusters, Compressor, &GPUTPCCompression::DumpCompressedClusters, *mDebugFile);
   return 0;
 }
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
index 903505068ad2c..00cf127162b94 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingDebugAndProfiling.cxx
@@ -20,6 +20,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <numeric>
 
 #ifdef GPUCA_TRACKLET_CONSTRUCTOR_DO_PROFILE
 #include "bitmapfile.h"
@@ -348,3 +349,44 @@ void GPUChainTracking::DumpClusters(std::ostream& out, const o2::tpc::ClusterNat
     }
   }
 }
+
+void GPUChainTracking::DebugSortCompressedClusters(o2::tpc::CompressedClustersFlat* cls)
+{
+  o2::tpc::CompressedClusters c = *cls;
+  std::vector<uint32_t> sorted(c.nTracks), offsets(c.nTracks);
+  std::iota(sorted.begin(), sorted.end(), 0);
+  auto sorter = [&c](const auto a, const auto b) {
+    return std::tie(c.sliceA[a], c.rowA[a], c.timeA[a], c.padA[a], c.qPtA[a]) <
+           std::tie(c.sliceA[b], c.rowA[b], c.timeA[b], c.padA[b], c.qPtA[b]);
+  };
+  std::sort(sorted.begin(), sorted.end(), sorter);
+  uint32_t offset = 0;
+  for (uint32_t i = 0; i < c.nTracks; i++) {
+    offsets[i] = offset;
+    offset += c.nTrackClusters[i];
+  }
+
+  auto sortArray = [&c, &sorted, &offsets](auto* src, size_t totalSize, auto getOffset, auto getSize) {
+    auto buf = std::make_unique<std::remove_reference_t<decltype(src[0])>[]>(totalSize);
+    memcpy(buf.get(), src, totalSize * sizeof(*src));
+    uint32_t targetOffset = 0;
+    for (uint32_t i = 0; i < c.nTracks; i++) {
+      const uint32_t j = sorted[i];
+      memcpy(src + targetOffset, buf.get() + getOffset(offsets[j], j), getSize(j) * sizeof(*src));
+      targetOffset += getSize(j);
+    }
+  };
+  auto sortMultiple = [&sortArray](size_t totalSize, auto getOffset, auto getSize, auto&&... arrays) {
+    (..., sortArray(std::forward<decltype(arrays)>(arrays), totalSize, getOffset, getSize));
+  };
+  auto getFullOffset = [](uint32_t off, uint32_t ind) { return off; };
+  auto getReducedOffset = [](uint32_t off, uint32_t ind) { return off - ind; };
+  auto getIndex = [](uint32_t off, uint32_t ind) { return ind; };
+  auto getN = [&c](uint32_t j) { return c.nTrackClusters[j]; };
+  auto getN1 = [&c](uint32_t j) { return c.nTrackClusters[j] - 1; };
+  auto get1 = [](uint32_t j) { return 1; };
+
+  sortMultiple(c.nAttachedClusters, getFullOffset, getN, c.qTotA, c.qMaxA, c.flagsA, c.sigmaPadA, c.sigmaTimeA);
+  sortMultiple(c.nAttachedClustersReduced, getReducedOffset, getN1, c.rowDiffA, c.sliceLegDiffA, c.padResA, c.timeResA);
+  sortMultiple(c.nTracks, getIndex, get1, c.qPtA, c.rowA, c.sliceA, c.timeA, c.padA, c.nTrackClusters); // NOTE: This must be last, since nTrackClusters is used for handling the arrays above!
+}

From 8480305977bff25887ea3d8cd43e5fc4d178898d Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 8 May 2025 23:26:22 +0200
Subject: [PATCH 3/4] GPU: Make GPUCommonAlgorithm::sortInBlock deterministic
 with GPUCA_DETERMINISTIC_MODE

---
 GPU/Common/GPUCommonAlgorithm.h | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/GPU/Common/GPUCommonAlgorithm.h b/GPU/Common/GPUCommonAlgorithm.h
index 417c9e0d1f8c1..d0643391246a8 100644
--- a/GPU/Common/GPUCommonAlgorithm.h
+++ b/GPU/Common/GPUCommonAlgorithm.h
@@ -283,21 +283,29 @@ GPUdi() void GPUCommonAlgorithm::sortInBlock(T* begin, T* end, const S& comp)
 #ifndef GPUCA_GPUCODE
   GPUCommonAlgorithm::sort(begin, end, comp);
 #else
-  int32_t n = end - begin;
-  for (int32_t i = 0; i < n; i++) {
-    for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
-      int32_t offset = i % 2;
-      int32_t curPos = 2 * tIdx + offset;
-      int32_t nextPos = curPos + 1;
-
-      if (nextPos < n) {
-        if (!comp(begin[curPos], begin[nextPos])) {
-          IterSwap(&begin[curPos], &begin[nextPos]);
+  GPUCA_DETERMINISTIC_CODE( // clang-format off
+    GPUbarrier();
+    if (get_local_id(0) == 0) {
+      GPUCommonAlgorithm::sort(begin, end, comp);
+    }
+    GPUbarrier();
+  , // !GPUCA_DETERMINISTIC_CODE
+    int32_t n = end - begin;
+    for (int32_t i = 0; i < n; i++) {
+      for (int32_t tIdx = get_local_id(0); tIdx < n; tIdx += get_local_size(0)) {
+        int32_t offset = i % 2;
+        int32_t curPos = 2 * tIdx + offset;
+        int32_t nextPos = curPos + 1;
+
+        if (nextPos < n) {
+          if (!comp(begin[curPos], begin[nextPos])) {
+            IterSwap(&begin[curPos], &begin[nextPos]);
+          }
         }
       }
+      GPUbarrier();
     }
-    GPUbarrier();
-  }
+  ) // clang-format on
 #endif
 }
 

From a5475261b766e3a1e512a7b0176136b014c6464a Mon Sep 17 00:00:00 2001
From: David Rohr <drohr@jwdt.org>
Date: Thu, 8 May 2025 23:51:00 +0200
Subject: [PATCH 4/4] GPU: Use total sorting in deterministic mode for
 unattached clusters

---
 .../GPUTPCCompressionKernels.cxx              | 40 +++++++++++++------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
index 5dbbf63ca8264..3b88c8764d0fd 100644
--- a/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
+++ b/GPU/GPUTracking/DataCompression/GPUTPCCompressionKernels.cxx
@@ -148,19 +148,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
 }
 
 template <>
-GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<0>::operator()(uint32_t a, uint32_t b) const
+GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>::operator()(uint32_t a, uint32_t b) const
 {
   return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
 }
 
 template <>
-GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<1>::operator()(uint32_t a, uint32_t b) const
+GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>::operator()(uint32_t a, uint32_t b) const
 {
   return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
 }
 
 template <>
-GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::operator()(uint32_t a, uint32_t b) const
+GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>::operator()(uint32_t a, uint32_t b) const
 {
   if (mClsPtr[a].getTimePacked() >> 3 == mClsPtr[b].getTimePacked() >> 3) {
     return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
@@ -169,7 +169,7 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<2>::opera
 }
 
 template <>
-GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::operator()(uint32_t a, uint32_t b) const
+GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>::operator()(uint32_t a, uint32_t b) const
 {
   if (mClsPtr[a].padPacked >> 3 == mClsPtr[b].padPacked >> 3) {
     return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
@@ -177,6 +177,18 @@ GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<3>::opera
   return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
 }
 
+template <> // Deterministic comparison
+GPUd() bool GPUTPCCompressionKernels::GPUTPCCompressionKernels_Compare<4>::operator()(uint32_t a, uint32_t b) const
+{
+  if (mClsPtr[a].getTimePacked() != mClsPtr[b].getTimePacked()) {
+    return mClsPtr[a].getTimePacked() < mClsPtr[b].getTimePacked();
+  }
+  if (mClsPtr[a].padPacked != mClsPtr[b].padPacked) {
+    return mClsPtr[a].padPacked < mClsPtr[b].padPacked;
+  }
+  return mClsPtr[a].qTot < mClsPtr[b].qTot;
+}
+
 template <>
 GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1unattached>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() processors)
 {
@@ -261,15 +273,19 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
 #ifdef GPUCA_GPUCODE
         static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCompressionKernels_step1unattached) * 2 <= GPUCA_TPC_COMP_CHUNK_SIZE);
 #endif
-        if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
+        GPUCA_DETERMINISTIC_CODE( // clang-format off
           CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
-        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
-          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
-        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
-          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
-        } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
-          CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
-        }
+        , // !GPUCA_DETERMINISTIC_CODE
+          if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZPadTime) {
+            CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZPadTime>(clusters->clusters[iSector][iRow]));
+          } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortZTimePad) {
+            CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortZTimePad>(clusters->clusters[iSector][iRow]));
+          } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortPad) {
+            CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortPad>(clusters->clusters[iSector][iRow]));
+          } else if (param.rec.tpc.compressionSortOrder == GPUSettings::SortTime) {
+            CAAlgo::sortInBlock(sortBuffer, sortBuffer + count, GPUTPCCompressionKernels_Compare<GPUSettings::SortTime>(clusters->clusters[iSector][iRow]));
+          }
+        ) // clang-format on
         GPUbarrier();
       }