From 9c8984c53d01d851f4294d5a68d6a8bbefd39295 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 00:55:54 +0200 Subject: [PATCH 1/6] Improve GPU filling kernel speed --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 181 +++++++++--------- .../GPUTPCNNClusterizerKernels.h | 4 +- 5 files changed, 104 insertions(+), 90 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0d466f13e5ef..c4b7b2e47cbd9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,6 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; + int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1035,7 +1036,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + countLoops++; + } } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1138,6 +1142,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } + LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9553437765c..b7bc1575d349a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -71,6 +71,7 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerPadTimeSize = 0; uint32_t mNnClusterizerRowTimeSize = 0; uint32_t mNnClusterizerRowTimeSizeFull = 0; + uint32_t mNnClusterizerRowTimeSizeThreads = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ae833ace2f648..582a0c6d7435a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -98,6 +98,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNN.mNnClusterizerRowTimeSizeThreads = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 1 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 55fefa7dcf149..1c76fc3f1fa57 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,15 +72,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { int32_t target_row = row + r; bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(target_row + row_offset); for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { int32_t target_time = time + t; @@ -143,125 +147,119 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) { + if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) { return; } - uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull; - uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull); + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads; + uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads); // Early exit for out-of-bounds threads - if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { - return; - } + // if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + // return; + // } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); // Use dedicated neural network shared memory arrays for warp-level caching // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - float central_charge = static_cast(chargeMap[peak].unpack()); + float central_charge = chargeMap[peak].unpack(); int32_t row = static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); + const int32_t npads_row = GPUTPCGeometry::NPads(row); + // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; - - float index_values[3] = { - static_cast(sector) / o2::tpc::constants::MAXSECTOR, - static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - static_cast(pad) / GPUTPCGeometry::NPads(row)}; - + // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; +// + // float index_values[3] = { + // static_cast(sector) / o2::tpc::constants::MAXSECTOR, + // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, + // static_cast(pad) / GPUTPCGeometry::NPads(row)}; +// + // if (dtype == 0) { + // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + // } else { + // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; + // } +// + // // Handle deconvolution flags only once per cluster (last thread in element) + // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { + // 
uint8_t cluster_flags = 0; + // for (uint16_t i = 0; i < 8; i++) { + // Delta2 d = cfconsts::InnerNeighbors[i]; + // CfChargePos tmp_pos = peak.delta(d); + // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); + // } + // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + // } + // return; + uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { - clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - } - - // Handle deconvolution flags only once per cluster (last thread in element) - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - uint8_t cluster_flags = 0; - for (uint16_t i = 0; i < 8; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; + clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } - return; } // Main data processing - optimize index calculations if (transient_index < clustererNN.mNnClusterizerRowTimeSize) { // Optimize 3D index calculation - int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; - int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow; - int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; - int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime; + const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; + const int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx; // Early boundary check for row - int32_t target_row = row + r_local; - int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow; + const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide // Calculate offsets - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= 
clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t boundary_row = target_row + row_offset; + const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); + const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; + + float output_value = boundary_val; + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + output_value = boundary_val; + } else { + const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); + if (!is_boundary) { + CfChargePos pos(target_row, target_pad, target_time); + // one load + one multiply + output_value = chargeMap[pos].unpack() * inverse_central_charge; } else { - clustererNN.mInputData_32[write_idx] = boundary_val; + output_value = boundary_val; } - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position - continue; - } - - // Calculate target pad and time - int32_t target_pad = pad + p_local; - int32_t target_time = time + t_local; - - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge } - - // Write output with reduced branching if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { clustererNN.mInputData_32[write_idx] = output_value; } - // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) { - // printf("Error: Write index out of bounds (central array)! 
%d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) { - // if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) { - // printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) { - // printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // } - - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + write_idx += clustererNN.mNnClusterizerFullTimeSize; } } } @@ -275,6 +273,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } + if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { + printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", + glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); + } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -364,6 +366,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { + printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", + full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); + } + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; @@ -737,16 +744,16 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? 
offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference) { - if (pad < 0 || row < 0) { // Faster short-circuit + if (pad < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network + return (pad >= npads_row); + } else if (row < iroc_row) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + } else if (row < maxrow) { + return (pad >= npads_reference); } else { return true; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index cd3d7783771fe..9353722568b1f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -57,7 +57,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7, + publishDeconvolutionFlags = 7 }; template @@ -66,7 +66,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate private: static GPUd() int32_t padOffset(int32_t, int32_t); static GPUd() int32_t rowOffset(int32_t, int32_t); - static GPUd() bool isBoundary(int32_t, int32_t, int32_t); + static GPUd() bool isBoundary(int32_t, int32_t, int32_t, int32_t, int32_t, int32_t); static GPUd() bool isBoundaryPublish(int32_t, int32_t, float&, float&); }; From a075c43cd3da71e56da1cd96769100c68c6c6c11 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 18:33:48 +0200 Subject: [PATCH 2/6] Adjusting parameter bounds and additional GPU kernel optimizations --- .../Definitions/GPUDefParametersDefaults.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../GPUTPCNNClusterizerKernels.cxx | 104 +++++++----------- 3 files changed, 40 insertions(+), 73 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 29aa3808506dc..648482304aca2 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -482,7 +482,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels - #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU 1024 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c4b7b2e47cbd9..96ee665bda4eb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,7 +642,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1036,10 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); - countLoops++; - } + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1142,7 +1138,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } - LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1c76fc3f1fa57..3a27b8740e46d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { @@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()); int32_t time = static_cast(peak.time()); - const int32_t npads_row = GPUTPCGeometry::NPads(row); - // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; -// - // float index_values[3] = { - // static_cast(sector) / o2::tpc::constants::MAXSECTOR, - // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - // static_cast(pad) / GPUTPCGeometry::NPads(row)}; -// - // if (dtype == 0) { - // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; - // } else { - // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - // } -// - // // Handle deconvolution flags only once per cluster (last thread in element) - // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - // uint8_t cluster_flags = 0; - // for (uint16_t i = 0; i < 8; i++) { - // Delta2 d = cfconsts::InnerNeighbors[i]; - // CfChargePos tmp_pos = peak.delta(d); - // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - // } - // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; - // } - // return; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; + const int32_t npads = GPUTPCGeometry::NPads(row); if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; } } @@ -220,46 +193,45 @@ GPUdii() 
void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; - const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide - - // Calculate offsets - // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; - const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t boundary_row = target_row + row_offset; - const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); - const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; - - float output_value = boundary_val; - - const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; - const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; - - for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { - if (is_row_boundary) { - output_value = boundary_val; - } else { - const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); - if (!is_boundary) { + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; + + if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { + for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; + } + return; + } else { + // Calculate offsets + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t npads = GPUTPCGeometry::NPads(target_row); + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { + if (target_pad >= npads || target_pad < 0 || is_time_boundary) { + output_value = clustererNN.mNnClusterizerBoundaryFillValue; + } else { CfChargePos pos(target_row, target_pad, target_time); // one load + one multiply output_value = chargeMap[pos].unpack() * inverse_central_charge; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - output_value = boundary_val; + clustererNN.mInputData_32[write_idx] = output_value; } + write_idx += clustererNN.mNnClusterizerFullTimeSize; } - if (dtype == 0) { - 
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[write_idx] = output_value; - } - write_idx += clustererNN.mNnClusterizerFullTimeSize; + return; } } } From 587c3e6fadd812b0489891c28759dc3d6865e036 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 21:12:47 +0200 Subject: [PATCH 3/6] Adding back if statement for early exit --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 3a27b8740e46d..0f000451794ec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,9 +155,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { - // return; - // } + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); From 6e432576d34b871974fdb236bc9c6bc75619fcb8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 22:19:01 +0200 Subject: [PATCH 4/6] const'ing + fixing CPU kernel --- .../Global/GPUChainTrackingClusterizer.cxx | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 63 +++++++++---------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 96ee665bda4eb..1740e525937f3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1035,7 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 0f000451794ec..7927dcf572acc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } @@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, 
(uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); - float central_charge = static_cast(chargeMap[peak].unpack()); - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); + const float central_charge = static_cast(chargeMap[peak].unpack()); + const float inverse_charge = 1.f / central_charge; + + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { - int32_t target_row = row + r; - bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset); + for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) { + uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row)); + const int32_t npads_reference = is_boundary ? 
0 : GPUTPCGeometry::NPads(target_row - row_offset); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { - int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); - - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { - int32_t target_time = time + t; + for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) { + is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); + for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) { if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) { // Fill boundary value - float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + output_value = clustererNN.mNnClusterizerBoundaryFillValue; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = boundary_value; + clustererNN.mInputData_32[write_idx] = output_value; } } else { CfChargePos tmp_pos(target_row, target_pad, target_time); - float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + output_value = chargeMap[tmp_pos].unpack() * inverse_charge; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = normalized_charge; + clustererNN.mInputData_32[write_idx] = output_value; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } } @@ -143,7 +142,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); + const uint32_t glo_idx = get_global_id(0); auto& clusterer = 
processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -151,8 +150,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1))]; - float central_charge = chargeMap[peak].unpack(); - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); + const float central_charge = chargeMap[peak].unpack(); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { From bb795c4170289bae93b19223264dc09aacaebea0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 19 Oct 2025 09:43:08 +0200 Subject: [PATCH 5/6] Removing print statements --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7927dcf572acc..b214a8f549a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -244,10 +244,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } - if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { - printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", - glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); - } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -337,11 +333,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { - printf("Error: Global index out of bounds! 
%d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", - full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); - } - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; From f7cdc0be08798e9b10a490a8b08697d0826450bc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 Oct 2025 13:08:55 +0100 Subject: [PATCH 6/6] Fixing CI build issue --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index b214a8f549a56..ee0fa217b8095 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -195,7 +195,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { - for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else {
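
Note on the resulting fill-kernel layout (an illustrative standalone sketch, not code from the patches: FillGeom, fillCell, and charge() are names invented here, and the real kernel additionally handles float16 output, index data, and deconvolution flags). After this series, fillInputNNGPU assigns one thread per (row, time) cell of a cluster's input window, plus one extra thread per cluster for the three index values (hence mNnClusterizerRowTimeSizeThreads = mNnClusterizerRowTimeSize + 1 when index data is enabled). Each thread loops over the pad dimension, stepping its write index by mNnClusterizerFullTimeSize, and normalizes charges with one precomputed reciprocal multiply per element instead of a divide:

    #include <cstdint>

    // Geometry of one cluster's input window; mirrors the mNnClusterizer*
    // fields of the patches with simplified names.
    struct FillGeom {
      int32_t fullTimeSize;       // 2 * sizeInputTime + 1
      int32_t fullPadSize;        // 2 * sizeInputPad + 1
      int32_t padTimeSize;        // fullPadSize * fullTimeSize
      int32_t rowTimeSize;        // fullRowSize * fullTimeSize
      int32_t rowTimeSizeThreads; // rowTimeSize + 1 when index data is appended
      int32_t elementSize;        // charge window + 3 optional index values
    };

    // One logical GPU thread: fills every pad of one (row, time) cell of one
    // cluster. charge(p) stands in for the chargeMap lookup / boundary fill.
    void fillCell(uint32_t glo_idx, const FillGeom& g, float centralCharge,
                  float (*charge)(int32_t), float* input)
    {
      const uint32_t base_idx = glo_idx / g.rowTimeSizeThreads;             // cluster within batch
      const uint32_t transient = glo_idx - base_idx * g.rowTimeSizeThreads; // cell within cluster
      if (transient >= (uint32_t)g.rowTimeSize) {
        return; // the one extra thread per cluster writes the index values instead
      }
      const int32_t row_idx = transient / g.fullTimeSize;
      const int32_t time_idx = transient - row_idx * g.fullTimeSize;
      const float invCentral = 1.f / centralCharge; // one divide per cell, multiplies thereafter
      int32_t write_idx = base_idx * g.elementSize + row_idx * g.padTimeSize + time_idx;
      for (int32_t p = 0; p < g.fullPadSize; ++p) {
        input[write_idx] = charge(p) * invCentral; // boundary handling omitted here
        write_idx += g.fullTimeSize;               // same time bin, next pad
      }
    }

With this mapping, threads adjacent in glo_idx write adjacent time bins, so the global-memory stores stay coalesced, while the per-row boundary quantities are computed once per cell rather than once per element.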