From 290ba0e44c75d6f58ed030fd4f3ed70e96eed753 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 12 Jul 2025 03:22:17 +0200 Subject: [PATCH 01/14] First version of lookup tables --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 + .../TPCClusterFinder/GPUTPCNNClusterizer.h | 53 +++++++++++-------- .../GPUTPCNNClusterizerHost.cxx | 42 ++++++++++++++- .../GPUTPCNNClusterizerHost.h | 2 + .../GPUTPCNNClusterizerKernels.cxx | 43 ++++++--------- prodtests/full-system-test/dpl-workflow.sh | 2 +- 6 files changed, 94 insertions(+), 50 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index da37c0771fe84..282abe98a80bb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -61,6 +61,8 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } if (mNnClusterizerTotalClusters > 0) { computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters); + computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize); + computePointerWithAlignment(mem, mIndexLookup, mIndexLookupSize); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 980c0977aca65..2564515f58551 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -37,31 +37,42 @@ class GPUTPCNNClusterizer : public GPUProcessor // Neural network clusterization - int mNnClusterizerSizeInputRow = 3; - int mNnClusterizerSizeInputPad = 3; - int mNnClusterizerSizeInputTime = 3; - int mNnClusterizerElementSize = -1; - bool mNnClusterizerAddIndexData = true; + int32_t mNnClusterizerSizeInputRow = 3; + int32_t mNnClusterizerSizeInputPad = 3; + int32_t mNnClusterizerSizeInputTime = 3; + int32_t mNnClusterizerChargeArraySize = -1; + int32_t mNnClusterizerElementSize = -1; + 
int8_t mNnClusterizerAddIndexData = 1; float mNnClassThreshold = 0.01; - bool mNnSigmoidTrafoClassThreshold = 1; - bool mNnClusterizerSetDeconvolutionFlags = true; - int mNnClusterizerUseCfRegression = 0; - int mNnClusterizerBatchedMode = 1; - int mNnClusterizerTotalClusters = 1; - int mNnClusterizerVerbosity = 0; - int mNnClusterizerBoundaryFillValue = -1; - int mNnClusterizerModelClassNumOutputNodes = -1; - int mNnClusterizerModelReg1NumOutputNodes = -1; - int mNnClusterizerModelReg2NumOutputNodes = -1; - int mNnInferenceInputDType = 0; // 0: float16, 1: float32 - int mNnInferenceOutputDType = 0; // 0: float16, 1: float32 - int mISector = -1; - int mDeviceId = -1; + int8_t mNnSigmoidTrafoClassThreshold = 1; + int8_t mNnClusterizerSetDeconvolutionFlags = 1; + int32_t mNnClusterizerUseCfRegression = 0; + int32_t mNnClusterizerBatchedMode = 1; + int32_t mNnClusterizerTotalClusters = 1; + int32_t mNnClusterizerVerbosity = 0; + int32_t mNnClusterizerBoundaryFillValue = -1; + int32_t mNnClusterizerModelClassNumOutputNodes = -1; + int32_t mNnClusterizerModelReg1NumOutputNodes = -1; + int32_t mNnClusterizerModelReg2NumOutputNodes = -1; + int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32 + int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32 + int32_t mISector = -1; + int32_t mDeviceId = -1; + + // Boundary lookup table + int32_t mBoundaryMapSizeRow = 0; + int32_t mBoundaryMapSizePerRow = 0; + int32_t mBoundaryMapSize = 0; + int8_t* mIsBoundary = nullptr; + + // Index lookup table + int32_t mIndexLookupSize = 0; + int32_t* mIndexLookup = nullptr; // Memory allocation for neural network - bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr - int* mOutputDataClass = nullptr; + int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int32_t* mOutputDataClass = nullptr; // FP32 float* mInputData_32 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 124320396d0d4..c394d9b0766fc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -19,6 +19,8 @@ #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" #include "GPUReconstruction.h" +#include "GPUTPCGeometry.h" +#include "DataFormatsTPC/Constants.h" #ifdef GPUCA_HAS_ONNX #include @@ -87,8 +89,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); + clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad); + clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; - clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; @@ -114,6 +119,41 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1]; } } + createBoundary(clustererNN); + createIndexLookup(clustererNN); +} + +void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { + // Call after init of the clustererNN elements + clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mNnClusterizerSizeInputPad; + for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { + for (int p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) { + int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; + clustererNN.mIsBoundary[i] = 1; + if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { + if ((r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) || + (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow))) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + } + if (clustererNN.mIsBoundary[i] == 1) { + break; // No need to check further pads in this row + } + } + } + } +} + +void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) { + for(int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++){ + int32_t r = CAMath::Floor(i / ((2 * 
clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; + int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); + int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; + int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; + clustererNN.mIndexLookup[3*i] = r; + clustererNN.mIndexLookup[3*i + 1] = p; + clustererNN.mIndexLookup[3*i + 2] = t; + } } // MockedOrtAllocator implementation to be able to use volatile assignment diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a4449165261be..ed3c80320b632 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void createBoundary(GPUTPCNNClusterizer&); + void createIndexLookup(GPUTPCNNClusterizer&); // ONNX void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8cdc0684ad588..8a7453f7be3fc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,37 +155,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); - int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; 
- bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - if (is_row_boundary) { + int32_t idxLookup = 3*transient_index; + int32_t r = clustererNN.mIndexLookup[idxLookup], p = clustererNN.mIndexLookup[idxLookup + 1], t = clustererNN.mIndexLookup[idxLookup + 2] + time; + int32_t current_row = row + r, current_pad = pad + p; + int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, current_row); + int32_t isBoundaryIndex = (current_row + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + current_pad + clustererNN.mNnClusterizerSizeInputPad; + + if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { + float central_charge = static_cast(chargeMap[peak].unpack()); + CfChargePos tmp_pos(current_row, current_pad + pad_offset, t); if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); - } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); - int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); - int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset; - int32_t time_pos = (rest_1 % (2 * 
clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time; - - bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU); - - if (!is_boundary) { - float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(row + r, pad + p, time_pos); - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; - } + if (dtype == 0) { + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); - } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } } diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index 202352730ddc7..df87018b1013a 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -74,7 +74,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -GPU_CONFIG_KEY= +#GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From ed459ea8ea7d011059a2f84aff4dfaf44b5ce025 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 12 Jul 2025 03:48:49 +0200 Subject: [PATCH 02/14] Simplifying computations + bug-fixes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 5 +++-- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 ++++----- prodtests/full-system-test/dpl-workflow.sh | 2 +- 3 files changed, 
8 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index c394d9b0766fc..50b3fb15f6294 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -132,9 +132,10 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; clustererNN.mIsBoundary[i] = 1; if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { - if ((r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) || - (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow))) { + if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); } if (clustererNN.mIsBoundary[i] == 1) { break; // No need to check further pads in this row diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8a7453f7be3fc..962d651bad97e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -156,15 +156,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); 
int32_t idxLookup = 3*transient_index; - int32_t r = clustererNN.mIndexLookup[idxLookup], p = clustererNN.mIndexLookup[idxLookup + 1], t = clustererNN.mIndexLookup[idxLookup + 2] + time; - int32_t current_row = row + r, current_pad = pad + p; + int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, current_row); - int32_t isBoundaryIndex = (current_row + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + current_pad + clustererNN.mNnClusterizerSizeInputPad; + int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + p + clustererNN.mNnClusterizerSizeInputPad; if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(current_row, current_pad + pad_offset, t); + CfChargePos tmp_pos(r, p + pad_offset, t); if (dtype == 0) { clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index df87018b1013a..202352730ddc7 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -74,7 +74,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -#GPU_CONFIG_KEY= +GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From 1ac8c2cd240a2063e3e729904b7297db09402b3f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 13 Jul 2025 10:54:18 +0200 Subject: [PATCH 
03/14] Fixes for indexing and offsets --- .../Global/GPUChainTrackingClusterizer.cxx | 2 ++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 26 +++++++------- .../GPUTPCNNClusterizerKernels.cxx | 34 ++++++++++++------- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 25bfe37f0db30..beac893e0072c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); + nnApplications[lane].createBoundary(clustererNNShadow); + nnApplications[lane].createIndexLookup(clustererNNShadow); }); if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 2564515f58551..086b3c2211b5b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -61,8 +61,9 @@ class GPUTPCNNClusterizer : public GPUProcessor // Boundary lookup table int32_t mBoundaryMapSizeRow = 0; - int32_t mBoundaryMapSizePerRow = 0; + int32_t mBoundaryMapSizePadsPerRow = 0; int32_t mBoundaryMapSize = 0; + int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset int8_t* mIsBoundary = nullptr; // Index lookup table diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 
50b3fb15f6294..d9bf511b2c724 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -91,8 +91,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad); - clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index + clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow*clustererNN.mBoundaryMapSizePadsPerRow; + clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -119,27 +122,22 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1]; } } - createBoundary(clustererNN); - createIndexLookup(clustererNN); } void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { // Call after init of the clustererNN elements - clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mNnClusterizerSizeInputPad; for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { - for (int p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) { - int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; + int8_t skipCheckInRow = 0; + for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { + int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; clustererNN.mIsBoundary[i] = 1; - if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { + if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= 
clustererNN.mNnClusterizerSizeInputRow)) { if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); - } - if (clustererNN.mIsBoundary[i] == 1) { - break; // No need to check further pads in this row + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); } + skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row } } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 962d651bad97e..ef8c649fd977e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -121,9 +121,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + + uint32_t glo_idx = get_global_id(0); uint32_t 
base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize); uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize); @@ -153,17 +154,22 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); } - } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) { + } else if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) { int32_t time = static_cast(peak.time()); int32_t idxLookup = 3*transient_index; int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + p + clustererNN.mNnClusterizerSizeInputPad; + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); + p += pad_offset; + int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + p + clustererNN.mBoundaryPadding; if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(r, p + pad_offset, t); + CfChargePos tmp_pos(r, p, t); + // if ((glo_idx % (clustererNN.mNnClusterizerElementSize*1000)) == (int)((clustererNN.mNnClusterizerChargeArraySize-1)/2.f)){ + // printf("glo_idx: %d, r: %d, p: %d, t: %d, tmp_pos: (%d, %d, %d), charge: %f, central_charge: %f\n", + // glo_idx, clustererNN.mIndexLookup[idxLookup], clustererNN.mIndexLookup[idxLookup + 1], clustererNN.mIndexLookup[idxLookup + 2], tmp_pos.row(), tmp_pos.pad(), tmp_pos.time(), chargeMap[tmp_pos].unpack(), central_charge); + // } if (dtype == 0) { 
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { @@ -489,24 +495,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + return 0; // Short-circuit for negative rows + } else { + return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); + } } -GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t global_shift) +GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset) { - return (row > 62 ? global_shift : 0); + return (row > 62 ? offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. 
Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - global_shift))); + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { + return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); } else { return true; } From fdf6ef88ed36887d4dbf57cd030cd1930b895a7e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 14 Jul 2025 01:16:31 +0200 Subject: [PATCH 04/14] Adjusting CPU kernel --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 27 ---------------- .../GPUTPCNNClusterizerKernels.cxx | 31 ++++++++++++------- 2 files changed, 19 insertions(+), 39 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 282abe98a80bb..35cecb458c7a3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -67,33 +67,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } -// std::vector GPUTPCNNClusterizer::pointerSizes() { -// std::vector sizes(7, -1); -// if (mNnClusterizerBatchedMode > 0) { -// if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { -// sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16 -// } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { -// sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32 -// } -// sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags -// if (mNnClusterizerModelClassNumOutputNodes > 0) { -// sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities -// } -// if (!mNnClusterizerUseCfRegression) { -// if (mNnClusterizerModelReg1NumOutputNodes > 0) { -// sizes[4] = mNnClusterizerBatchedMode * 
mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1 -// } -// if (mNnClusterizerModelReg2NumOutputNodes > 0) { -// sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2 -// } -// } -// } -// if (mNnClusterizerTotalClusters > 0) { -// sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass -// } -// return sizes; -// } - void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ef8c649fd977e..af194b9809e44 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -61,25 +61,31 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - int32_t pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int32_t row_pos = row + r; + for (int32_t p = (-clustererNN.mNnClusterizerSizeInputPad + pad_offset); p <= (clustererNN.mNnClusterizerSizeInputPad + pad_offset); p++) { + int32_t pad_pos = pad + p; for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { int32_t time_pos = time + t; - if (!is_boundary && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { - CfChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; + int32_t isBoundaryIndex = (row_pos + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + pad_pos + clustererNN.mBoundaryPadding; + if (!clustererNN.mIsBoundary[isBoundaryIndex] && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { + CfChargePos tmp_pos(row_pos, pad_pos, time_pos); + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set + if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = 
clustererNN.mClusterFlags[2 * glo_idx]; + } } if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + // if(CAMath::Abs(static_cast(clustererNN.mInputData_16[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-6) { + // printf("Warning: (Charge) Charge difference at idx %d, batchStart %d, maxClusters %d, sector %d, row %d (%d), pad %d (%d), time %d (%d): %f / %f\n", glo_idx, batchStart, clusterer.mPmemory->counters.nClusters - 1, sector, row_pos, r, pad_pos, p, time_pos, t, static_cast(clustererNN.mInputData_16[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); + // } } else if (dtype == 1) { clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } @@ -507,16 +513,17 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? offset : 0); } +// Legacy. Deprecated. GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); + return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row)))); } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. 
Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row - offset)))); } else { return true; } From 7604fab521a503ddf2fc337108a5d8c23f06800c Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sun, 13 Jul 2025 23:18:51 +0000 Subject: [PATCH 05/14] Please consider the following formatting changes --- .../GPUTPCNNClusterizerHost.cxx | 26 ++++++++++--------- .../GPUTPCNNClusterizerKernels.cxx | 6 ++--- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index d9bf511b2c724..b8ddcb296a414 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,9 +93,9 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mBoundaryPadding; - clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow*clustererNN.mBoundaryMapSizePadsPerRow; - clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; + clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -124,9 +124,10 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { +void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) +{ // Call after init of the clustererNN elements - for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { + for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { int8_t skipCheckInRow = 0; for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; @@ -134,8 +135,8 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { if (r < (GPUTPCGeometry::EndIROC() + 
clustererNN.mNnClusterizerSizeInputRow)) { clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); - } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); + } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); } skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row } @@ -143,15 +144,16 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { } } -void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) { - for(int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++){ +void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) +{ + for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; - clustererNN.mIndexLookup[3*i] = 
r; - clustererNN.mIndexLookup[3*i + 1] = p; - clustererNN.mIndexLookup[3*i + 2] = t; + clustererNN.mIndexLookup[3 * i] = r; + clustererNN.mIndexLookup[3 * i + 1] = p; + clustererNN.mIndexLookup[3 * i + 2] = t; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index af194b9809e44..ee4f10160e5ea 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -75,7 +75,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { CfChargePos tmp_pos(row_pos, pad_pos, time_pos); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; @@ -162,7 +162,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); - int32_t idxLookup = 3*transient_index; + int32_t idxLookup = 3 * transient_index; int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); @@ -501,7 +501,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current > o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return 
(int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 0c2ac6a5444e31e5735f86ca159f4400bf320b84 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 14 Jul 2025 09:16:54 +0200 Subject: [PATCH 06/14] Fix for row-number access --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b8ddcb296a414..b9b4619687ed4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,7 +93,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + 2 * clustererNN.mBoundaryPadding; clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ee4f10160e5ea..7f8e2528b6561 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -501,7 +501,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 33c8278acf49d1bd462cfe7f2c43a72385866680 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 14 Jul 2025 07:17:37 +0000 Subject: [PATCH 07/14] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b9b4619687ed4..669e4214b2756 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,7 +93,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; From eecb8a8e3f944b663b419c7a5dae39cd7c0486c3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 00:57:13 +0200 Subject: [PATCH 08/14] Improve kernel speed by ~15%. 
Next test: for-loop in pad direction for coallesced access --- .../Global/GPUChainTrackingClusterizer.cxx | 4 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 - .../TPCClusterFinder/GPUTPCNNClusterizer.h | 21 +- .../GPUTPCNNClusterizerHost.cxx | 80 ++++---- .../GPUTPCNNClusterizerKernels.cxx | 184 +++++++++++------- .../GPUTPCNNClusterizerKernels.h | 2 +- 6 files changed, 173 insertions(+), 120 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index beac893e0072c..13455efe6cb47 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -706,8 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); - nnApplications[lane].createBoundary(clustererNNShadow); - nnApplications[lane].createIndexLookup(clustererNNShadow); + // nnApplications[lane].createBoundary(clustererNNShadow); + // nnApplications[lane].createIndexLookup(clustererNNShadow); }); if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 35cecb458c7a3..3dd8b0d621a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -61,8 +61,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } if (mNnClusterizerTotalClusters > 0) { computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters); - computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize); - computePointerWithAlignment(mem, 
mIndexLookup, mIndexLookupSize); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 086b3c2211b5b..0bffc525ac8cf 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -59,16 +59,22 @@ class GPUTPCNNClusterizer : public GPUProcessor int32_t mISector = -1; int32_t mDeviceId = -1; + // GPU optimizations + uint32_t mNnClusterizerFullRowSize = 0; + uint32_t mNnClusterizerFullPadSize = 0; + uint32_t mNnClusterizerFullTimeSize = 0; + uint32_t mNnClusterizerPadTimeSize = 0; + // Boundary lookup table - int32_t mBoundaryMapSizeRow = 0; - int32_t mBoundaryMapSizePadsPerRow = 0; - int32_t mBoundaryMapSize = 0; - int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset - int8_t* mIsBoundary = nullptr; + // int32_t mBoundaryMapSizeRow = 0; + // int32_t mBoundaryMapSizePadsPerRow = 0; + // int32_t mBoundaryMapSize = 0; + // int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset + // int8_t* mIsBoundary = nullptr; // Index lookup table - int32_t mIndexLookupSize = 0; - int32_t* mIndexLookup = nullptr; + // int32_t mIndexLookupSize = 0; + // int32_t* mIndexLookup = nullptr; // Memory allocation for neural network @@ -83,6 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor // FP16 OrtDataType::Float16_t* mInputData_16 = nullptr; + OrtDataType::Float16_t* mInputData_16_Test = nullptr; OrtDataType::Float16_t* mModelProbabilities_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 669e4214b2756..17a55ed0df1e3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -89,13 +89,17 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; - clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); + clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1; + clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1; + clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1; + clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; - clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; - clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index + // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work + // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; + // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; + // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -124,38 +128,38 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) -{ - // Call after init of the clustererNN elements - for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { - int8_t skipCheckInRow = 0; - for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { - int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + 
p; - clustererNN.mIsBoundary[i] = 1; - if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { - if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); - } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); - } - skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row - } - } - } -} - -void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) -{ - for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { - int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; - int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); - int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; - int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; - clustererNN.mIndexLookup[3 * i] = r; - clustererNN.mIndexLookup[3 * i + 1] = p; - clustererNN.mIndexLookup[3 * i + 2] = t; - } -} +// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) +// { +// // Call after init of the clustererNN elements +// for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { +// int8_t skipCheckInRow = 0; +// for (int p = 0; p < 
clustererNN.mBoundaryMapSizePadsPerRow; p++) { +// int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; +// clustererNN.mIsBoundary[i] = 1; +// if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { +// if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { +// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); +// } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { +// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); +// } +// skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row +// } +// } +// } +// } + +// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) +// { +// for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { +// int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; +// int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); +// int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; +// int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; +// clustererNN.mIndexLookup[3 * i] = r; +// clustererNN.mIndexLookup[3 * i + 1] = p; +// clustererNN.mIndexLookup[3 * i + 2] = t; +// } +// } // MockedOrtAllocator implementation to be able to use volatile assignment struct MockedOrtAllocator : OrtAllocator { diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7f8e2528b6561..8f1c8269339f6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -50,53 +50,57 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadfragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } + template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + + if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, 
clustererNN.mNnClusterizerSizeInputRow); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); - int32_t row_pos = row + r; - for (int32_t p = (-clustererNN.mNnClusterizerSizeInputPad + pad_offset); p <= (clustererNN.mNnClusterizerSizeInputPad + pad_offset); p++) { - int32_t pad_pos = pad + p; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); + for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { int32_t time_pos = time + t; - int32_t isBoundaryIndex = (row_pos + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + pad_pos + clustererNN.mBoundaryPadding; - if (!clustererNN.mIsBoundary[isBoundaryIndex] && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { - CfChargePos tmp_pos(row_pos, pad_pos, time_pos); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set - if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; - } - } - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - // 
if(CAMath::Abs(static_cast(clustererNN.mInputData_16[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-6) { - // printf("Warning: (Charge) Charge difference at idx %d, batchStart %d, maxClusters %d, sector %d, row %d (%d), pad %d (%d), time %d (%d): %f / %f\n", glo_idx, batchStart, clusterer.mPmemory->counters.nClusters - 1, sector, row_pos, r, pad_pos, p, time_pos, t, static_cast(clustererNN.mInputData_16[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); - // } - } else if (dtype == 1) { - clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; - } - } else { + if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { clustererNN.mInputData_32[write_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } + } else { + CfChargePos tmp_pos(row + r, pad + p, time + t); + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } } + // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { 
+ // printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t, + // static_cast(clustererNN.mInputData_16_Test[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); + // } write_idx++; } } @@ -127,66 +131,107 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { + uint32_t glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; - uint32_t glo_idx = get_global_id(0); - uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize); + // Optimized division using bit operations + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerElementSize; uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize); + // Early exit for out-of-bounds threads + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); + + // Use dedicated neural network shared memory arrays for warp-level caching + // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()); - - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) { - uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize; - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set - clustererNN.mClusterFlags[2 * base_idx] = 0; - 
clustererNN.mClusterFlags[2 * base_idx + 1] = 0; - for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx]; - } + float central_charge = static_cast(chargeMap[peak].unpack()); + int32_t row = static_cast(peak.row()); + int32_t pad = static_cast(peak.pad()); + int32_t time = static_cast(peak.time()); + + // Handle index data with fewer branches + if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerChargeArraySize) { + uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index; + int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize; + + float index_values[3] = { + sector / 36.f, + row / 152.f, + static_cast(pad) / GPUTPCGeometry::NPads(row) + }; + if (dtype == 0) { - clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.mInputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.mInputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[output_idx] = (OrtDataType::Float16_t)index_values[data_idx]; } else { - clustererNN.mInputData_32[top_idx - 3] = sector / 36.f; - clustererNN.mInputData_32[top_idx - 2] = row / 152.f; - clustererNN.mInputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[output_idx] = index_values[data_idx]; } - } else if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) { - int32_t time = static_cast(peak.time()); - int32_t idxLookup = 3 * transient_index; - int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = 
clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); - p += pad_offset; - int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + p + clustererNN.mBoundaryPadding; - - if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { - float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(r, p, t); - // if ((glo_idx % (clustererNN.mNnClusterizerElementSize*1000)) == (int)((clustererNN.mNnClusterizerChargeArraySize-1)/2.f)){ - // printf("glo_idx: %d, r: %d, p: %d, t: %d, tmp_pos: (%d, %d, %d), charge: %f, central_charge: %f\n", - // glo_idx, clustererNN.mIndexLookup[idxLookup], clustererNN.mIndexLookup[idxLookup + 1], clustererNN.mIndexLookup[idxLookup + 2], tmp_pos.row(), tmp_pos.pad(), tmp_pos.time(), chargeMap[tmp_pos].unpack(), central_charge); - // } - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + + // Handle deconvolution flags only once per cluster (last thread in element) + if (data_idx == 2 && !clustererNN.mNnClusterizerSetDeconvolutionFlags) { + uint8_t cluster_flags = 0; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); } - } else { + clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + } + return; + } + + // Main data processing - optimize index calculations + if ((int32_t)transient_index < 
clustererNN.mNnClusterizerChargeArraySize) { + // Optimize 3D index calculation + int32_t r_local = (transient_index / clustererNN.mNnClusterizerPadTimeSize) - clustererNN.mNnClusterizerSizeInputRow; + int32_t pad_time_slice = (transient_index % clustererNN.mNnClusterizerPadTimeSize); + int32_t p_local = (pad_time_slice / clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputPad; + int32_t t_local = (pad_time_slice % clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputTime; + + // Early boundary check for row + int32_t target_row = row + r_local; + int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + + if (is_row_boundary) { + // Use boundary fill value + float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)boundary_val; } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = boundary_val; } + return; + } + + // Calculate offsets + int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t target_pad = pad + p_local + pad_offset; + int32_t target_time = time + t_local; + + // Optimized boundary check + int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); + + float output_value; + if (is_boundary) { + output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + } else { + // Coalesced memory access - create position and read charge + CfChargePos 
tmp_pos(target_row, target_pad, target_time); + output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge + } + + // Write output with reduced branching + if (dtype == 0) { + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[glo_idx] = output_value; } } } @@ -501,7 +546,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= o2::tpc::constants::MAXGLOBALPADROW) { + if(row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); @@ -513,7 +558,6 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? offset : 0); } -// Legacy. Deprecated. GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index a3858d47eb99b..5659c61894c85 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -65,7 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7 + publishDeconvolutionFlags = 7, }; template From 3447927ba0ff0aafb1b790cd08d85a005a0b64c2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 09:58:15 +0200 Subject: [PATCH 09/14] IMproving kernel speed by 30% compared to original version. 
Next try: for-loop over row dimension as access is somewhat coalesced too --- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 2 + .../GPUTPCNNClusterizerKernels.cxx | 86 ++++++++++--------- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0bffc525ac8cf..7c22d8123fdec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -64,6 +64,8 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerFullPadSize = 0; uint32_t mNnClusterizerFullTimeSize = 0; uint32_t mNnClusterizerPadTimeSize = 0; + uint32_t mNnClusterizerRowTimeSize = 0; + uint32_t mNnClusterizerRowTimeSizeFull = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; @@ -89,7 +91,6 @@ class GPUTPCNNClusterizer : public GPUProcessor // FP16 OrtDataType::Float16_t* mInputData_16 = nullptr; - OrtDataType::Float16_t* mInputData_16_Test = nullptr; OrtDataType::Float16_t* mModelProbabilities_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 17a55ed0df1e3..fd56d49de7921 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -94,6 +94,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1; clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * 
clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8f1c8269339f6..7765315c8f692 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -137,8 +137,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -156,9 +156,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); // Handle index data with fewer branches - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerChargeArraySize) { - uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index; - int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize; + if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerRowTimeSize) { + int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; float index_values[3] = 
{ sector / 36.f, @@ -167,9 +167,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)boundary_val; - } else { - clustererNN.mInputData_32[glo_idx] = boundary_val; - } - return; - } - // Calculate offsets int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t target_pad = pad + p_local + pad_offset; - int32_t target_time = time + t_local; + for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + if (is_row_boundary) { + // Use boundary fill value + float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + } else { + clustererNN.mInputData_32[write_idx] = boundary_val; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + continue; + } - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); + // Calculate target pad and time + int32_t target_pad = pad + p_local; + int32_t target_time = time + t_local; - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by 
central charge - } + // Optimized boundary check + int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); - // Write output with reduced branching - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[glo_idx] = output_value; + float output_value; + if (is_boundary) { + output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + } else { + // Coalesced memory access - create position and read charge + CfChargePos tmp_pos(target_row, target_pad, target_time); + output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge + } + + // Write output with reduced branching + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position } } } From 6ef38a2fab6dc4bb9b2a9bd67b5596cd3992be15 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 16 Jul 2025 07:58:53 +0000 Subject: [PATCH 10/14] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7765315c8f692..1431367812e8c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -50,7 +50,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadfragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, 
clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } - template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { @@ -163,8 +162,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row) - }; + static_cast(pad) / GPUTPCGeometry::NPads(row)}; if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; @@ -191,7 +189,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 069a7e9c8cec0d7271c538c6e9eca49bb7a12451 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 17:50:58 +0200 Subject: [PATCH 11/14] Minor improvements for MC handling --- .../GPUTPCNNClusterizerKernels.cxx | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1431367812e8c..f8cf0dc6a0127 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -291,9 +291,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters; uint32_t full_glo_idx = glo_idx + batchStart; - if (full_glo_idx >= maxClusterNum) { - return; - } int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -302,6 +299,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= maxClusterNum) 
{ + if (withMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak, central_charge)); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + peak, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + return; + } + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); @@ -389,6 +404,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; float central_charge = static_cast(chargeMap[peak].unpack()); @@ -397,6 +413,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= maxClusterNum) { + if (withMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak, central_charge)); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + peak, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + return; + } + uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes; if (clustererNN.mOutputDataClass[full_glo_idx] > 0) { From 4949b55f8030e9f12bcef34a88e99bf8de69ea63 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 17 Jul 2025 23:46:02 +0200 Subject: [PATCH 12/14] Beautifications to trigger the CI --- .../GPUTPCNNClusterizerKernels.cxx | 74 ++++++++++++------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index f8cf0dc6a0127..8940d40bbb574 100644 --- 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -53,47 +53,58 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); - auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + uint32_t glo_idx = get_global_id(0); if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { return; } - uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)]; + int32_t row = static_cast(peak.row()); + int32_t pad = static_cast(peak.pad()); + int32_t time = static_cast(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || 
((row + r) < 0); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { - int32_t time_pos = time + t; - if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) { - // Filling boundary just to make sure that no values are left unintentionally + for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { + int32_t target_row = row + r; + bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + + for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { + int32_t target_pad = pad + p; + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + + for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { + int32_t target_time = time + t; + + if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) { + // Fill boundary value + float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; } else { - clustererNN.mInputData_32[write_idx] = 
static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[write_idx] = boundary_value; } } else { - CfChargePos tmp_pos(row + r, pad + p, time + t); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + CfChargePos tmp_pos(target_row, target_pad, target_time); + float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; } + if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + } else { + clustererNN.mInputData_32[write_idx] = normalized_charge; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -104,21 +115,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); + if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm; + 
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm; + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm; } else { - clustererNN.mInputData_32[write_idx] = sector / 36.f; - clustererNN.mInputData_32[write_idx + 1] = row / 152.f; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx] = sector_norm; + clustererNN.mInputData_32[write_idx + 1] = row_norm; + clustererNN.mInputData_32[write_idx + 2] = pad_norm; } } + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { clustererNN.mClusterFlags[2 * glo_idx] = 0; clustererNN.mClusterFlags[2 * glo_idx + 1] = 0; - for (uint16_t i = 0; i < 8; i++) { + + for (uint16_t i = 0; i < 8; ++i) { Delta2 d = cfconsts::InnerNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); From f4729fd69b1d767964aa32c36e1a25fb83643753 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Jul 2025 08:31:47 +0200 Subject: [PATCH 13/14] Compile-fix --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8940d40bbb574..395230dab6bf1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -65,7 +65,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)]; + CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; int32_t row = 
static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); From 2af6060fd2b24a166b290b332b637a5fa5041139 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Jul 2025 10:43:19 +0200 Subject: [PATCH 14/14] Fix int32_t error in fullCI build --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 395230dab6bf1..4cd0c094398df 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -173,7 +173,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); // Handle index data with fewer branches - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerRowTimeSize) { + if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; @@ -203,7 +203,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread