diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 25bfe37f0db30..13455efe6cb47 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow);
       }
       AllocateRegisteredMemory(clustererNN.mMemoryId);
+      // nnApplications[lane].createBoundary(clustererNNShadow);
+      // nnApplications[lane].createIndexLookup(clustererNNShadow);
     });
     if (doGPU) {
       WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index da37c0771fe84..3dd8b0d621a56 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -65,33 +65,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem)
   return mem;
 }
 
-// std::vector GPUTPCNNClusterizer::pointerSizes() {
-//   std::vector sizes(7, -1);
-//   if (mNnClusterizerBatchedMode > 0) {
-//     if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) {
-//       sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16
-//     } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) {
-//       sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32
-//     }
-//     sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags
-//     if (mNnClusterizerModelClassNumOutputNodes > 0) {
-//       sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities
-//     }
-//     if (!mNnClusterizerUseCfRegression) {
-//       if (mNnClusterizerModelReg1NumOutputNodes > 0) {
-//         sizes[4] = mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1
-//       }
-//       if (mNnClusterizerModelReg2NumOutputNodes > 0) {
-//         sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2
-//       }
-//     }
-//   }
-//   if (mNnClusterizerTotalClusters > 0) {
-//     sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass
-//   }
-//   return sizes;
-// }
-
 void GPUTPCNNClusterizer::RegisterMemoryAllocation()
 {
   AllocateAndInitializeLate();
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 980c0977aca65..7c22d8123fdec 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -37,31 +37,51 @@ class GPUTPCNNClusterizer : public GPUProcessor
 
   // Neural network clusterization
-  int mNnClusterizerSizeInputRow = 3;
-  int mNnClusterizerSizeInputPad = 3;
-  int mNnClusterizerSizeInputTime = 3;
-  int mNnClusterizerElementSize = -1;
-  bool mNnClusterizerAddIndexData = true;
+  int32_t mNnClusterizerSizeInputRow = 3;
+  int32_t mNnClusterizerSizeInputPad = 3;
+  int32_t mNnClusterizerSizeInputTime = 3;
+  int32_t mNnClusterizerChargeArraySize = -1;
+  int32_t mNnClusterizerElementSize = -1;
+  int8_t mNnClusterizerAddIndexData = 1;
   float mNnClassThreshold = 0.01;
-  bool mNnSigmoidTrafoClassThreshold = 1;
-  bool mNnClusterizerSetDeconvolutionFlags = true;
-  int mNnClusterizerUseCfRegression = 0;
-  int mNnClusterizerBatchedMode = 1;
-  int mNnClusterizerTotalClusters = 1;
-  int mNnClusterizerVerbosity = 0;
-  int mNnClusterizerBoundaryFillValue = -1;
-  int mNnClusterizerModelClassNumOutputNodes = -1;
-  int mNnClusterizerModelReg1NumOutputNodes = -1;
-  int mNnClusterizerModelReg2NumOutputNodes = -1;
-  int mNnInferenceInputDType = 0; // 0: float16, 1: float32
-  int mNnInferenceOutputDType = 0; // 0: float16, 1: float32
-  int mISector = -1;
-  int mDeviceId = -1;
+  int8_t mNnSigmoidTrafoClassThreshold = 1;
+  int8_t mNnClusterizerSetDeconvolutionFlags = 1;
+  int32_t mNnClusterizerUseCfRegression = 0;
+  int32_t mNnClusterizerBatchedMode = 1;
+  int32_t mNnClusterizerTotalClusters = 1;
+  int32_t mNnClusterizerVerbosity = 0;
+  int32_t mNnClusterizerBoundaryFillValue = -1;
+  int32_t mNnClusterizerModelClassNumOutputNodes = -1;
+  int32_t mNnClusterizerModelReg1NumOutputNodes = -1;
+  int32_t mNnClusterizerModelReg2NumOutputNodes = -1;
+  int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32
+  int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32
+  int32_t mISector = -1;
+  int32_t mDeviceId = -1;
+
+  // GPU optimizations
+  uint32_t mNnClusterizerFullRowSize = 0;
+  uint32_t mNnClusterizerFullPadSize = 0;
+  uint32_t mNnClusterizerFullTimeSize = 0;
+  uint32_t mNnClusterizerPadTimeSize = 0;
+  uint32_t mNnClusterizerRowTimeSize = 0;
+  uint32_t mNnClusterizerRowTimeSizeFull = 0;
+
+  // Boundary lookup table
+  // int32_t mBoundaryMapSizeRow = 0;
+  // int32_t mBoundaryMapSizePadsPerRow = 0;
+  // int32_t mBoundaryMapSize = 0;
+  // int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset
+  // int8_t* mIsBoundary = nullptr;
+
+  // Index lookup table
+  // int32_t mIndexLookupSize = 0;
+  // int32_t* mIndexLookup = nullptr;
 
   // Memory allocation for neural network
-  bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cxx
-  int* mOutputDataClass = nullptr;
+  int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx
+  int32_t* mOutputDataClass = nullptr;
 
   // FP32
   float* mInputData_32 = nullptr;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index 124320396d0d4..fd56d49de7921 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -19,6 +19,8 @@
 #include "GPUSettings.h"
 #include "ML/3rdparty/GPUORTFloat16.h"
 #include "GPUReconstruction.h"
+#include "GPUTPCGeometry.h"
+#include "DataFormatsTPC/Constants.h"
 
 #ifdef GPUCA_HAS_ONNX
 #include <onnxruntime_cxx_api.h>
@@ -87,8 +89,20 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
   clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
   clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
+  clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1;
+  clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1;
+  clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1;
+  clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
+  clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize;
+  clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize;
+  clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0);
+  clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0);
+  // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW;
+  // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work
+  // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding;
+  // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow;
+  // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index
   clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData;
-  clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0);
   clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
   clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
   clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
@@ -116,6 +130,39 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   }
 }
 
+// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN)
+// {
+//   // Call after init of the clustererNN elements
+//   for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) {
+//     int8_t skipCheckInRow = 0;
+//     for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) {
+//       int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p;
+//       clustererNN.mIsBoundary[i] = 1;
+//       if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) {
+//         if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) {
+//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int32_t>(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow)));
+//         } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) {
+//           clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast<int32_t>(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow)));
+//         }
+//         skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row
+//       }
+//     }
+//   }
+// }
+
+// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN)
+// {
+//   for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) {
+//     int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
+//     int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
+//     int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad;
+//     int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
+//     clustererNN.mIndexLookup[3 * i] = r;
+//     clustererNN.mIndexLookup[3 * i + 1] = p;
+//     clustererNN.mIndexLookup[3 * i + 2] = t;
+//   }
+// }
+
 // MockedOrtAllocator implementation to be able to use volatile assignment
 struct MockedOrtAllocator : OrtAllocator {
   MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index a4449165261be..ed3c80320b632 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost
   void init(const GPUSettingsProcessingNNclusterizer&);
   void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
+  void createBoundary(GPUTPCNNClusterizer&);
+  void createIndexLookup(GPUTPCNNClusterizer&);
 
   // ONNX
   void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 8cdc0684ad588..4cd0c094398df 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -53,63 +53,90 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
+
+  uint32_t glo_idx = get_global_id(0);
+  if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
+    return;
+  }
+
+  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize;
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad()), time = static_cast<int32_t>(peak.time()); // Explicit casting to avoid conversion errors
+  int32_t row = static_cast<int32_t>(peak.row());
+  int32_t pad = static_cast<int32_t>(peak.pad());
+  int32_t time = static_cast<int32_t>(peak.time());
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
-    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
-    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
-      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
-      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
-        int32_t time_pos = time + t;
-        if (!is_boundary && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) {
-          CfChargePos tmp_pos(row + r, pad + p, time + t);
-          if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
-            clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
-            clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
-          }
+  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) {
+    int32_t target_row = row + r;
+    bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW);
+    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+
+    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) {
+      int32_t target_pad = pad + p;
+      bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow);
+
+      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) {
+        int32_t target_time = time + t;
+
+        if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) {
+          // Fill boundary value
+          float boundary_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
-          } else if (dtype == 1) {
-            clustererNN.mInputData_32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value;
+          } else {
+            clustererNN.mInputData_32[write_idx] = boundary_value;
           }
         } else {
-          // Filling boundary just to make sure that no values are left unintentionally
+          CfChargePos tmp_pos(target_row, target_pad, target_time);
+          float normalized_charge = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+
+          if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) {
+            clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
+            clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
+          }
+
           if (dtype == 0) {
-            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
+            clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge;
           } else {
-            clustererNN.mInputData_32[write_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+            clustererNN.mInputData_32[write_idx] = normalized_charge;
           }
         }
+        // if ((CAMath::Abs(static_cast<float>(clustererNN.mInputData_16_Test[write_idx]) - static_cast<float>(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) {
+        //   printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t,
+        //          static_cast<float>(clustererNN.mInputData_16_Test[write_idx]), static_cast<float>(clustererNN.mInputData_16[write_idx]));
+        // }
         write_idx++;
       }
     }
   }
+
   if (clustererNN.mNnClusterizerAddIndexData) {
+    float sector_norm = sector / 36.f;
+    float row_norm = row / 152.f;
+    float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+
     if (dtype == 0) {
-      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f);
-      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f);
-      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
+      clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
+      clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
     } else {
-      clustererNN.mInputData_32[write_idx] = sector / 36.f;
-      clustererNN.mInputData_32[write_idx + 1] = row / 152.f;
-      clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+      clustererNN.mInputData_32[write_idx] = sector_norm;
+      clustererNN.mInputData_32[write_idx + 1] = row_norm;
+      clustererNN.mInputData_32[write_idx + 2] = pad_norm;
     }
   }
+
   if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
     clustererNN.mClusterFlags[2 * glo_idx] = 0;
     clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
-    for (uint16_t i = 0; i < 8; i++) {
+
+    for (uint16_t i = 0; i < 8; ++i) {
       Delta2 d = cfconsts::InnerNeighbors[i];
       CfChargePos tmp_pos = peak.delta(d);
       clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
@@ -122,71 +149,111 @@ template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
+
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
-  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);
+  // Decompose the flat thread index into cluster index and intra-element index
+  uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull;
+  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull);
+
+  // Early exit for out-of-bounds threads
+  if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) {
+    return;
+  }
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
+
+  // Use dedicated neural network shared memory arrays for warp-level caching
+  // First thread in each warp loads shared data
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
-  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad());
-
-  if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
-    uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
-    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set
-      clustererNN.mClusterFlags[2 * base_idx] = 0;
-      clustererNN.mClusterFlags[2 * base_idx + 1] = 0;
-      for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
-        Delta2 d = cfconsts::InnerNeighbors[i];
-        CfChargePos tmp_pos = peak.delta(d);
-        clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
-      }
-      clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx];
-    }
+  float central_charge = static_cast<float>(chargeMap[peak].unpack());
+  int32_t row = static_cast<int32_t>(peak.row());
+  int32_t pad = static_cast<int32_t>(peak.pad());
+  int32_t time = static_cast<int32_t>(peak.time());
+
+  // Handle index data with fewer branches
+  if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) {
+    int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize;
+    uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx;
+
+    float index_values[3] = {
+      sector / 36.f,
+      row / 152.f,
+      static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
+
     if (dtype == 0) {
-      clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f);
-      clustererNN.mInputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f);
-      clustererNN.mInputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
+      clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx];
     } else {
-      clustererNN.mInputData_32[top_idx - 3] = sector / 36.f;
-      clustererNN.mInputData_32[top_idx - 2] = row / 152.f;
-      clustererNN.mInputData_32[top_idx - 1] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
+      clustererNN.mInputData_32[write_idx] = index_values[data_idx];
     }
-  } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
-    int32_t time = static_cast<int32_t>(peak.time());
-    int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
-    bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
-    if (is_row_boundary) {
-      if (dtype == 0) {
-        clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
-      } else {
-        clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
-      }
-    } else {
-      int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-      int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-      int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
-      int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
-      int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time;
-      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);
+    // Handle deconvolution flags only once per cluster (last thread in element)
+    if (data_idx == 2 && !clustererNN.mNnClusterizerSetDeconvolutionFlags) {
+      uint8_t cluster_flags = 0;
+      for (uint16_t i = 0; i < 8; i++) {
+        Delta2 d = cfconsts::InnerNeighbors[i];
+        CfChargePos tmp_pos = peak.delta(d);
+        cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]);
+      }
+      clustererNN.mClusterFlags[2 * base_idx] = cluster_flags;
+      clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags;
+    }
+    return;
+  }
-      if (!is_boundary) {
-        float central_charge = static_cast<float>(chargeMap[peak].unpack());
-        CfChargePos tmp_pos(row + r, pad + p, time_pos);
-        if (dtype == 0) {
-          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
-        } else if (dtype == 1) {
-          clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
-        }
-      } else {
+  // Main data processing - optimize index calculations
+  if (transient_index < clustererNN.mNnClusterizerRowTimeSize) {
+    // Optimize 3D index calculation
+    int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize;
+    int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow;
+    int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize;
+    int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime;
+    int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx;
+
+    // Early boundary check for row
+    int32_t target_row = row + r_local;
+    int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1));
+
+    // Calculate offsets
+    int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+    int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row);
+    for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) {
+      if (is_row_boundary) {
+        // Use boundary fill value
+        float boundary_val = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
         if (dtype == 0) {
-          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
+          clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val;
         } else {
-          clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+          clustererNN.mInputData_32[write_idx] = boundary_val;
        }
+        write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
+        continue;
      }
+
+      // Calculate target pad and time
+      int32_t target_pad = pad + p_local;
+      int32_t target_time = time + t_local;
+
+      // Optimized boundary check
+      int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU);
+
+      float output_value;
+      if (is_boundary) {
+        output_value = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+      } else {
+        // Coalesced memory access - create position and read charge
+        CfChargePos tmp_pos(target_row, target_pad, target_time);
+        output_value = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge
+      }
+
+      // Write output with reduced branching
+      if (dtype == 0) {
+        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value;
+      } else {
+        clustererNN.mInputData_32[write_idx] = output_value;
+      }
+      write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position
     }
   }
 }
@@ -242,9 +309,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
   uint32_t full_glo_idx = glo_idx + batchStart;
-  if (full_glo_idx >= maxClusterNum) {
-    return;
-  }
   int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
@@ -253,6 +317,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
+  if (full_glo_idx >= maxClusterNum) {
+    if (withMC) {
+      ClusterAccumulator dummy_pc;
+      CPU_ONLY(labelAcc->collect(peak, central_charge));
+      GPUTPCCFClusterizer::buildCluster(
+        clusterer.Param().rec,
+        chargeMap,
+        peak,
+        smem.posBcast,
+        smem.buf,
+        smem.innerAboveThreshold,
+        &dummy_pc,
+        labelAcc);
+    }
+    return;
+  }
+
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
 
   // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
@@ -340,6 +422,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
+  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
@@ -348,6 +431,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
+  if (full_glo_idx >= maxClusterNum) {
+    if (withMC) {
+      ClusterAccumulator dummy_pc;
+      CPU_ONLY(labelAcc->collect(peak, central_charge));
+      GPUTPCCFClusterizer::buildCluster(
+        clusterer.Param().rec,
+        chargeMap,
+        peak,
+        smem.posBcast,
+        smem.buf,
+        smem.innerAboveThreshold,
+        &dummy_pc,
+        labelAcc);
+    }
+    return;
+  }
+
   uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
 
   if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
@@ -501,24 +602,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
 {
-  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+  if (row_ref < 0 || row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) {
+    return 0; // Short-circuit for negative rows
+  } else {
+    return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+  }
 }
 
-GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t global_shift)
+GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset)
 {
-  return (row > 62 ? global_shift : 0);
+  return (row > 62 ? offset : 0);
 }
 
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift)
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset)
 {
   if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
   } else if (row < 63) {
-    return (pad >= static_cast<int32_t>(GPUTPCGeometry::NPads(row)));
-  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
+    return ((pad < 0) || (pad >= static_cast<int32_t>(GPUTPCGeometry::NPads(row))));
+  } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network
     return true;
-  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
-    return (pad >= static_cast<int32_t>(GPUTPCGeometry::NPads(row - global_shift)));
+  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) {
+    return ((pad < 0) || (pad >= static_cast<int32_t>(GPUTPCGeometry::NPads(row - offset))));
   } else {
     return true;
   }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index a3858d47eb99b..5659c61894c85 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -65,7 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
     determineClass2Labels = 4,
     publishClass1Regression = 5,
     publishClass2Regression = 6,
-    publishDeconvolutionFlags = 7
+    publishDeconvolutionFlags = 7,
   };
 
   template