From 9ab32454718d771993286e7fe252362521e697a9 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Fri, 12 Sep 2025 00:39:37 +0200
Subject: [PATCH 1/4] Bug-fix for MC labels

---
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx           | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 15ee6b6119022..59a355780d9ee 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -46,7 +46,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
   auto& clustererNN = processors.tpcNNClusterer[sector];
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
-  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
+  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
   int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1);
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast<GPUTPCCFClusterizer::GPUSharedMemory&>(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted);
 }
@@ -464,7 +464,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
-  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
+  tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
   uint32_t full_glo_idx = glo_idx + batchStart;
 
   if (full_glo_idx >= maxClusterNum) {

From b756dcc19b4e75bb7fccbfc042318e61370c79e9 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Fri, 12 Sep 2025 00:42:17 +0200
Subject: [PATCH 2/4] Switch on timers on CPU for the first four lanes

---
 .../Global/GPUChainTrackingClusterizer.cxx           | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index fd3699ae4d125..5c2a13df1a2b7 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1052,7 +1052,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
             // NN evaluations
             if(clustererNNShadow.mNnClusterizerUseClassification) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
@@ -1066,13 +1066,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time...
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
               }
             }
             if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); }
               if (clustererNNShadow.mNnInferenceInputDType == 0) {
                 if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1086,9 +1086,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                   (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
                 }
               }
-              if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); }
+              if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); }
               if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); }
                 if (clustererNNShadow.mNnInferenceInputDType == 0) {
                   if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1102,7 +1102,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                     (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
                   }
                 }
-                if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); }
+                if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); }
               }
               if (nn_settings.nnClusterizerVerbosity > 3) {
                 LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";

From b5ace3556bd784ef76c15a21fbd6c5003023d0f0 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Wed, 17 Sep 2025 23:41:37 +0200
Subject: [PATCH 3/4] Improved boundary checking for input filling and cluster
 publishing

---
 .../Global/GPUChainTrackingClusterizer.cxx    |   6 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |   4 +
 .../GPUTPCNNClusterizerHost.cxx               |   5 +-
 .../GPUTPCNNClusterizerHost.h                 |   2 +-
 .../GPUTPCNNClusterizerKernels.cxx            | 156 +++++++++++++++---
 .../GPUTPCNNClusterizerKernels.h              |   1 +
 6 files changed, 147 insertions(+), 27 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 4a7552069c61e..619940ff6d3dd 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -709,6 +709,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId;
       }
     });
+    const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen;
+    const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max<int32_t>(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED;
     for (int32_t sector = 0; sector < NSECTORS; sector++) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector];
       GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN;
@@ -716,12 +718,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       clustererNN.mDeviceId = deviceId;
       clustererNN.mISector = sector;
       clustererNN.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
-      nnApplications[lane].initClusterizer(nn_settings, clustererNN);
+      nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin);
       if (doGPU) {
         clustererNNShadow.mDeviceId = deviceId;
         clustererNNShadow.mISector = sector;
         clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters;
-        nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
+        nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin);
       }
       if (nn_settings.nnClusterizerVerbosity > 2) {
         LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")";
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index a6b0b081fc3dd..0b9553437765c 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -60,6 +60,10 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int32_t mISector = -1;
   int32_t mDeviceId = -1;
 
+  // charge array boundaries
+  int32_t maxFragmentLen = -1;
+  int32_t maxAllowedTimebin = -1; // == tpcMaxTimeBin
+
   // GPU optimizations
   uint32_t mNnClusterizerFullRowSize = 0;
   uint32_t mNnClusterizerFullPadSize = 0;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index ad635c15b9256..ae833ace2f648 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -21,6 +21,7 @@
 #include "GPUReconstruction.h"
 #include "GPUTPCGeometry.h"
 #include "DataFormatsTPC/Constants.h"
+#include "clusterFinderDefs.h"
 
 #ifdef GPUCA_HAS_ONNX
 #include <onnxruntime_cxx_api.h>
@@ -84,7 +85,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set
   }
 }
 
-void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN)
+void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN, int32_t maxFragmentLen, int32_t maxAllowedTimebin)
 {
   clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
   clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
@@ -109,6 +110,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
   clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
   clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
+  clustererNN.maxFragmentLen = maxFragmentLen == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxFragmentLen;
+  clustererNN.maxAllowedTimebin = maxAllowedTimebin == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxAllowedTimebin;
   if (clustererNN.mNnSigmoidTrafoClassThreshold) {
     clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
   } else {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index 414c4539a33c1..8f8465d5dca34 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -48,7 +48,7 @@ class GPUTPCNNClusterizerHost
   GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); }
 
   void init(const GPUSettingsProcessingNNclusterizer&, bool = false);
-  void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
+  void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&, int32_t = -1, int32_t = -1);
   void createBoundary(GPUTPCNNClusterizer&);
   void createIndexLookup(GPUTPCNNClusterizer&);
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 59a355780d9ee..cd8a73542f0e1 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -85,7 +85,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
         int32_t target_time = time + t;
 
-        if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) {
+        if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) {
           // Fill boundary value
           float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
           if (dtype == 0) {
@@ -229,7 +229,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       int32_t target_time = time + t_local;
 
       // Optimized boundary check
-      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
+      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin);
 
       float output_value;
       if (is_boundary) {
@@ -340,7 +340,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
+  uint32_t peakIndex = CAMath::Min(full_glo_idx, maxClusterNum - 1);
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[peakIndex];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
@@ -365,11 +366,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
   tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow;
 
-  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
-
   ClusterAccumulator pc;
 
-  // Publishing logic is taken from default clusterizer
   if (withMC) {
     ClusterAccumulator dummy_pc;
     CPU_ONLY(labelAcc->collect(peak, central_charge));
@@ -390,37 +388,113 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
     return;
   }
 
-  // For flag influence on cluster error setting: O2/GPU/GPUTracking/Base/GPUParam.inc -> UpdateClusterError2ByState
   bool notSinglePad = false, notSingleTime = false;
   for (uint16_t i = 0; i < 8; i++) {
     Delta2 d = cfconsts::InnerNeighbors[i];
     CfChargePos tmp_pos = peak.delta(d);
-    notSinglePad |= (d.x != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
-    notSingleTime |= (d.y != 0) && (static_cast<float>(chargeMap[tmp_pos].unpack()) > 0);
+    float v = static_cast<float>(chargeMap[tmp_pos].unpack());
+    notSinglePad |= (d.x != 0) && (v > 0.f);
+    notSingleTime |= (d.y != 0) && (v > 0.f);
   }
 
+  float publishPadPosition = 0.f, publishTimePosition = 0.f;
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat();
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat();
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(),
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(),
+               publishPadPosition,
                notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f,
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(),
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
-  } else if (dtype == 1) {
+  } else {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4],
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index],
+               publishPadPosition,
                notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f,
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1],
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f,
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   }
 
+  // if (boundaryFlag != 0) { // Prints the entire NN input for the given index
+  //   // Build a simple buffer manually (float with 3 decimals)
+  //   const int MAX_CHARS = 4096;
+  //   char buffer[MAX_CHARS];
+  //   int pos = 0;
+//
+  //   auto appendChar = [&](char c) {
+  //     if (pos < MAX_CHARS - 1) buffer[pos++] = c;
+  //   };
+  //   auto appendStr = [&](const char* s) {
+  //     while (*s && pos < MAX_CHARS - 1) buffer[pos++] = *s++;
+  //   };
+  //   auto appendUInt = [&](uint32_t v) {
+  //     char tmp[16]; int tp = 0;
+  //     if (v == 0) { appendChar('0'); return; }
+  //     while (v && tp < 16) { tmp[tp++] = char('0' + (v % 10)); v /= 10; }
+  //     while (tp--) appendChar(tmp[tp]);
+  //   };
+  //   auto appendInt = [&](int v) {
+  //     if (v < 0) { appendChar('-'); v = -v; }
+  //     appendUInt((uint32_t)v);
+  //   };
+  //   auto appendFloat = [&](float f) {
+  //     if (f < 0) { appendChar('-'); f = -f; }
+  //     int ip = (int)f;
+  //     float frac = f - (float)ip;
+  //     appendInt(ip);
+  //     appendChar('.');
+  //     for (int i = 0; i < 3; i++) {
+  //       frac *= 10.f;
+  //       int d = (int)frac;
+  //       appendChar((char)('0' + (d < 0 ? 0 : (d > 9 ? 9 : d))));
+  //       frac -= d;
+  //       if (frac < 0) frac = 0;
+  //     }
+  //   };
+//
+  //   appendStr("(NN CLUS) DEBUG: Boundary cluster detected (sector ");
+  //   appendUInt(sector);
+  //   appendStr(", row ");
+  //   appendUInt(peak.row());
+  //   appendStr(", pad ");
+  //   appendFloat(publishPadPosition);
+  //   appendStr(", time ");
+  //   appendFloat(publishTimePosition);
+  //   appendStr(") [glo_idx=");
+  //   appendUInt(glo_idx);
+  //   appendStr(" elemSize=");
+  //   appendInt(clustererNN.mNnClusterizerElementSize);
+  //   appendStr(" dtype=");
+  //   appendInt(dtype);
+  //   appendStr("] INPUT:");
+//
+  //   int elemSize = clustererNN.mNnClusterizerElementSize;
+  //   int baseIdx = glo_idx * elemSize;
+//
+  //   int maxPrint = elemSize;
+  //   for (int i = 0; i < maxPrint; ++i) {
+  //     appendChar(' ');
+  //     float v = (dtype == 0) ? clustererNN.mInputData_16[baseIdx + i].ToFloat()
+  //                            : clustererNN.mInputData_32[baseIdx + i];
+  //     appendFloat(v);
+  //     if (pos > (MAX_CHARS - 32)) { appendStr(" ..."); break; }
+  //   }
+//
+  //   buffer[pos] = 0;
+  //   printf("%s\n", buffer);
+  // }
+
   tpc::ClusterNative myCluster;
   bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);
   if (clustererNN.mNnClusterizerUseClassification) {
-    rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0);
+    rejectCluster |= (clustererNN.mOutputDataClass[peakIndex] <= 0);
   }
   if (rejectCluster) {
     if (clusterer.mPclusterPosInRow) {
@@ -509,19 +583,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   }
 
   // Cluster 1
+  float publishPadPosition = 0.f, publishTimePosition = 0.f;
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat();
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(),
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(),
+               publishPadPosition,
                clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(),
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(),
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   } else if (dtype == 1) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8],
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index],
+               publishPadPosition,
                clustererNN.mOutputDataReg2_32[model_output_index + 4],
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2],
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                clustererNN.mOutputDataReg2_32[model_output_index + 6],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
@@ -558,18 +639,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
   // Cluster 2
   if (dtype == 0) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat();
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat();
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(),
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(),
+               publishPadPosition,
                clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(),
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(),
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(),
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
   } else if (dtype == 1) {
+    publishPadPosition = static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1];
+    publishTimePosition = static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3];
+    isBoundaryPublish(full_glo_idx, static_cast<int32_t>(peak.row()), publishPadPosition, publishTimePosition);
     pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9],
-               static_cast<float>(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1],
+               publishPadPosition,
                clustererNN.mOutputDataReg2_32[model_output_index + 5],
-               (clusterer.mPmemory->fragment).start + static_cast<float>(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3],
+               (clusterer.mPmemory->fragment).start + publishTimePosition,
                clustererNN.mOutputDataReg2_32[model_output_index + 7],
                clustererNN.mClusterFlags[2 * glo_idx],
                clustererNN.mClusterFlags[2 * glo_idx + 1]);
@@ -664,3 +751,26 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int
     return true;
   }
 }
+
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time)
+{
+  if(pad < 0) {
+    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
+    pad = 0.f;
+    return true;
+  } else if (pad >= static_cast<int>(GPUTPCGeometry::NPads(row))) {
+    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad >= static_cast<int>(GPUTPCGeometry::NPads(row): row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
+    pad = static_cast<float>(GPUTPCGeometry::NPads(row) - 1);
+    return true;
+  } else if (time < 0) {
+    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
+    time = 0.f;
+    return true;
+  } else if (time >= TPC_MAX_FRAGMENT_LEN_GPU) {
+    // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time >= TPC_MAX_FRAGMENT_LEN_GPU: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
+    time = static_cast<float>(TPC_MAX_FRAGMENT_LEN_GPU - 1);
+    return true;
+  } else {
+    return false;
+  }
+}
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index 9c93726a097b7..cd3d7783771fe 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -67,6 +67,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
   static GPUd() int32_t padOffset(int32_t, int32_t);
   static GPUd() int32_t rowOffset(int32_t, int32_t);
   static GPUd() bool isBoundary(int32_t, int32_t, int32_t);
+  static GPUd() bool isBoundaryPublish(int32_t, int32_t, float&, float&);
 };
 
 } // namespace o2::gpu

From 82d76e57f871866476a630f36bcc4ace6e18260d Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Wed, 17 Sep 2025 21:42:23 +0000
Subject: [PATCH 4/4] Please consider the following formatting changes

---
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx  | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index cd8a73542f0e1..55fefa7dcf149 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -427,7 +427,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   //   const int MAX_CHARS = 4096;
   //   char buffer[MAX_CHARS];
   //   int pos = 0;
-//
+  //
   //   auto appendChar = [&](char c) {
   //     if (pos < MAX_CHARS - 1) buffer[pos++] = c;
   //   };
@@ -458,7 +458,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   //       if (frac < 0) frac = 0;
   //     }
   //   };
-//
+  //
   //   appendStr("(NN CLUS) DEBUG: Boundary cluster detected (sector ");
   //   appendUInt(sector);
   //   appendStr(", row ");
@@ -474,10 +474,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   //   appendStr(" dtype=");
   //   appendInt(dtype);
   //   appendStr("] INPUT:");
-//
+  //
   //   int elemSize = clustererNN.mNnClusterizerElementSize;
   //   int baseIdx = glo_idx * elemSize;
-//
+  //
   //   int maxPrint = elemSize;
   //   for (int i = 0; i < maxPrint; ++i) {
   //     appendChar(' ');
@@ -486,7 +486,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   //     appendFloat(v);
   //     if (pos > (MAX_CHARS - 32)) { appendStr(" ..."); break; }
   //   }
-//
+  //
   //   buffer[pos] = 0;
   //   printf("%s\n", buffer);
   // }
@@ -754,7 +754,7 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int
 
 GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time)
 {
-  if(pad < 0) {
+  if (pad < 0) {
     // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast<int>(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU);
     pad = 0.f;
     return true;