diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index b4f40f3f5c694..7224645425856 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -45,14 +45,10 @@ class OrtModel
  public:
   // Constructors & destructors
-  OrtModel() = default;
-  OrtModel(std::unordered_map<std::string, std::string> optionsMap) { init(optionsMap); }
-  void init(std::unordered_map<std::string, std::string> optionsMap)
-  {
-    initOptions(optionsMap);
-    initEnvironment();
-  }
-  virtual ~OrtModel() = default;
+  OrtModel();
+  OrtModel(std::unordered_map<std::string, std::string> optionsMap);
+  void init(std::unordered_map<std::string, std::string> optionsMap);
+  virtual ~OrtModel();
 
   // General purpose
   void initOptions(std::unordered_map<std::string, std::string> optionsMap);
@@ -113,7 +109,7 @@ class OrtModel
  private:
   // ORT variables -> need to be hidden as pImpl
   struct OrtVariables;
-  OrtVariables* mPImplOrt;
+  std::unique_ptr<OrtVariables> mPImplOrt;
 
   // Input & Output specifications of the loaded network
   std::vector<char*> mInputNamesChar, mOutputNamesChar;
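Note on the header change above: with `std::unique_ptr<OrtVariables>` as the pImpl handle, the constructors and the destructor can no longer be defaulted in the header, because the `std::unique_ptr` deleter needs the complete `OrtVariables` type, which only the .cxx sees. A minimal sketch of that pattern (hypothetical `Widget`/`Impl` names, not taken from the patch):

```cpp
#include <memory>

// --- header: the Impl type is only forward-declared ---
class Widget
{
 public:
  Widget();   // defined in the source file
  ~Widget();  // must not be defaulted here: Impl is still incomplete
  void doWork();

 private:
  struct Impl;                 // forward declaration, like OrtModel::OrtVariables
  std::unique_ptr<Impl> mImpl; // same idea as OrtModel::mPImplOrt
};

// --- source file: Impl is complete, so the special members can be generated ---
struct Widget::Impl {
  int state = 0;
};

Widget::Widget() : mImpl(std::make_unique<Impl>()) {}
Widget::~Widget() = default; // the unique_ptr deleter is instantiated here
void Widget::doWork() { ++mImpl->state; }
```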
"ORT" : mEnvName.c_str()), // Integrate ORT logging into Fairlogger @@ -129,7 +138,7 @@ void OrtModel::initSession() if (mAllocateDeviceMemory) { memoryOnDevice(mDeviceId); } - mPImplOrt->session = std::make_shared(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions); + mPImplOrt->session = std::make_unique(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions); mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); setIO(); @@ -147,12 +156,12 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time (mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost - /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; - // (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + // For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 std::string dev_mem_str = ""; if (mDeviceType == "ROCM") { - dev_mem_str = "Hip"; + dev_mem_str = "HipPinned"; } if (mDeviceType == "CUDA") { dev_mem_str = "Cuda"; @@ -166,7 +175,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) void OrtModel::resetSession() { - mPImplOrt->session = std::make_shared(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions); + mPImplOrt->session = std::make_unique(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions); } // Getters @@ -252,7 +261,7 @@ void OrtModel::setIO() void OrtModel::setEnv(Ort::Env* env) { - mPImplOrt->env = std::shared_ptr(env); + mPImplOrt->env.reset(env); } // Inference @@ -308,6 +317,14 @@ void OrtModel::inference(I* input, int64_t input_size, O* output) (mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); (mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding); + // mPImplOrt->session->Run( + // mPImplOrt->runOptions, + // mInputNamesChar.data(), + // &inputTensor, + // mInputNamesChar.size(), + // mOutputNamesChar.data(), + // &outputTensor, + // mOutputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); @@ -427,10 +444,7 @@ template std::vector OrtModel::inferencesession->EndProfiling(); - // } - LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) 
<< " bytes"; + mPImplOrt.reset(); } // private diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index f188388e76a02..f904ced60fcfa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // But environment must be valid, so we init the model environment first and use it here afterwards. // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator. // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // if (lane == 0) { + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // } // recreateMemoryAllocator = true; (nnApplications[lane].mModelClass).initSession(); } @@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); (nnApplications[lane].mModelReg1).initSession(); } if (nnApplications[lane].mModelsUsed[2]) { @@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); (nnApplications[lane].mModelReg2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { @@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } - LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; - LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif @@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index f188388e76a02..f904ced60fcfa 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -658,7 +658,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           // But environment must be valid, so we init the model environment first and use it here afterwards.
           // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
           // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // if (lane == 0) {
+          //   nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // }
           // recreateMemoryAllocator = true;
           (nnApplications[lane].mModelClass).initSession();
         }
@@ -670,7 +672,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv());
           (nnApplications[lane].mModelReg1).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].mModelReg1).initSession();
         }
         if (nnApplications[lane].mModelsUsed[2]) {
@@ -679,8 +681,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) {
            nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads);
           }
+          // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv());
           (nnApplications[lane].mModelReg2).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].mModelReg2).initSession();
         }
         if (nn_settings.nnClusterizerVerbosity < 3) {
@@ -706,8 +709,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       if (doGPU) {
         WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init);
       }
-      LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes";
-      LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes";
     }
 #endif
 
@@ -975,6 +976,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
      GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN;
      GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane];
 
+      // // bool recreateMemoryAllocator = false;
+      // if (lane == 0) {
+      //   (nnApplications[lane].mModelClass).initEnvironment();
+      //   nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, 0);
+      // }
+      // // recreateMemoryAllocator = true;
+      // (nnApplications[lane].mModelClass).initSession();
+      // (nnApplications[lane].mModelReg1).initSession();
+
       int withMC = (doGPU && propagateMCLabels);
 
       if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
@@ -1187,12 +1197,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       }
     }
     for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) {
-      // if (GetProcessingSettings().nn.applyNNclusterizer) {
-      //   GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
-      //   nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      //   nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      //   nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling);
-      // }
+      if (GetProcessingSettings().nn.applyNNclusterizer) {
+        LOG(info) << "(ORT) Environment releasing...";
+        GPUTPCNNClusterizerHost& nnApplication = nnApplications[i];
+        nnApplication.mModelClass.release(true);
+        nnApplication.mModelReg1.release(true);
+        nnApplication.mModelReg2.release(true);
+      }
       if (transferRunning[i]) {
         ReleaseEvent(mEvents->stream[i], doGPU);
       }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index ca2deec60601c..90f1d6e27246f 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -136,8 +136,8 @@ struct MockedOrtAllocator : OrtAllocator {
   std::atomic<size_t> memory_inuse{0};
   std::atomic<size_t> num_allocations{0};
   std::atomic<size_t> num_reserve_allocations{0};
-  OrtMemoryInfo* memory_info;
-  GPUReconstruction* rec;
+  OrtMemoryInfo* mMemoryInfoInternal;
+  GPUReconstruction* mRecInternal;
 };
 
 MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info)
@@ -147,37 +147,36 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info
   OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<MockedOrtAllocator*>(this_)->Free(p); };
   OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const MockedOrtAllocator*>(this_)->Info(); };
   OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<MockedOrtAllocator*>(this_)->Reserve(size); };
-  rec = r;
-  memory_info = info;
+  mRecInternal = r;
+  mMemoryInfoInternal = info;
 }
 
 MockedOrtAllocator::~MockedOrtAllocator()
 {
-  // Ort::GetApi().ReleaseMemoryInfo(memory_info);
+  // Ort::GetApi().ReleaseMemoryInfo(mMemoryInfoInternal);
   (void)0; // Suppress warning for empty destructor
 }
 
 void* MockedOrtAllocator::Alloc(size_t size)
 {
-  // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes";
-  return rec->AllocateVolatileDeviceMemory(size);
+  LOG(info) << "(ORT) Allocating direct memory of size " << size << " bytes";
+  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
 }
 
 void* MockedOrtAllocator::Reserve(size_t size)
 {
-  // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes";
-  return rec->AllocateVolatileDeviceMemory(size);
+  LOG(info) << "(ORT) Reserving direct memory of size " << size << " bytes";
+  return mRecInternal->AllocateDirectMemory(size, GPUMemoryResource::MEMORY_GPU | GPUMemoryResource::MEMORY_STACK);
 }
 
 void MockedOrtAllocator::Free(void* p)
 {
   // LOG(info) << "(ORT) Freeing volatile memory " << p;
-  rec->ReturnVolatileDeviceMemory();
 }
 
 const OrtMemoryInfo* MockedOrtAllocator::Info() const
 {
-  return memory_info;
+  return mMemoryInfoInternal;
 }
 
 size_t MockedOrtAllocator::NumAllocations() const
@@ -197,7 +196,7 @@ void MockedOrtAllocator::LeakCheck()
   }
 }
 
-void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
+void GPUTPCNNClusterizerHost::directOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate)
 {
   mMockedAlloc = std::make_shared<MockedOrtAllocator>(rec, (OrtMemoryInfo*)(*memInfo));
   if (recreate) {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index e659753f21d7d..4334c3418eb09 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -53,7 +53,7 @@ class GPUTPCNNClusterizerHost
   void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
 
   // ONNX
-  void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
+  void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false);
   MockedOrtAllocator* getMockedAllocator();
   const OrtMemoryInfo* getMockedMemoryInfo();
 
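The renamed `directOrtAllocator` keeps the trampoline pattern visible in the hunks above: a struct derived from the C-style `OrtAllocator` whose function pointers forward to member functions, so that ONNX Runtime calls back into O2's own memory management. A stripped-down, host-memory-only sketch of that pattern (hypothetical `HostOrtAllocator`, backed by `std::malloc` instead of `AllocateDirectMemory`; not the O2 implementation):

```cpp
#include <onnxruntime_cxx_api.h>
#include <cstdlib>

// Hypothetical illustration of the OrtAllocator trampoline used by MockedOrtAllocator.
struct HostOrtAllocator : OrtAllocator {
  explicit HostOrtAllocator(OrtMemoryInfo* info) : mInfo(info)
  {
    OrtAllocator::version = ORT_API_VERSION;
    OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast<HostOrtAllocator*>(this_)->Alloc(size); };
    OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast<HostOrtAllocator*>(this_)->Free(p); };
    OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast<const HostOrtAllocator*>(this_)->Info(); };
    OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast<HostOrtAllocator*>(this_)->Reserve(size); };
  }

  void* Alloc(size_t size) { return std::malloc(size); }   // O2 calls AllocateDirectMemory() here instead
  void* Reserve(size_t size) { return std::malloc(size); } // separate hook ORT uses for up-front reservations
  void Free(void* p) { std::free(p); }                     // the O2 version leaves this empty: the stack allocator owns the memory
  const OrtMemoryInfo* Info() const { return mInfo; }

  OrtMemoryInfo* mInfo;
};
```

How the allocator is actually handed to the session or environment (the `recreate` branch of `directOrtAllocator`) is not shown in the hunk above, so it is deliberately left out of this sketch.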
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 47bc5e8da80ca..8ca61602ab4e9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -35,9 +35,9 @@ using namespace o2::gpu::tpccf;
 
 // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
   if (clustererNN.mOutputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices
@@ -51,29 +51,29 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
+  uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
-  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad()), time = static_cast<int>(peak.time()); // Explicit casting to avoid conversion errors
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
+  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad()), time = static_cast<int32_t>(peak.time()); // Explicit casting to avoid conversion errors
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
-  int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+  int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
 #ifndef GPUCA_GPUCODE
   GPUCA_UNROLL(U(), U());
 #endif
-  for (int r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
+  for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
-    int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-    for (int p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
+    int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
+    for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
       bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
-      for (int t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
+      for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
         if (!is_boundary) {
           CfChargePos tmp_pos(row + r, pad + p, time + t);
           if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
@@ -111,21 +111,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
-  uint transient_index = glo_idx % clustererNN.mNnClusterizerElementSize;
+  uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize);
+  uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize);
 
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart];
-  int row = static_cast<int>(peak.row()), pad = static_cast<int>(peak.pad());
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
+  int32_t row = static_cast<int32_t>(peak.row()), pad = static_cast<int32_t>(peak.pad());
 
   if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
-    uint top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
+    uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
     for (uint16_t i = 0; i < 8; i++) {
       Delta2 d = cfconsts::InnerNeighbors[i];
       CfChargePos tmp_pos = peak.delta(d);
@@ -142,8 +142,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
       (pad) / GPUTPCGeometry::NPads(row);
     }
   } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) {
-    int time = static_cast<int>(peak.time());
-    int r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
+    int32_t time = static_cast<int32_t>(peak.time());
+    int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow;
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
     if (is_row_boundary) {
       if (dtype == 0) {
@@ -152,15 +152,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
        (clustererNN.mNnClusterizerBoundaryFillValue);
       }
     } else {
-      int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-      int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
-      int rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
-      int p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
-      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
+      int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
+      int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
+      int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
+      int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
+      int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
+
+      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU);
       if (!is_boundary) {
         float central_charge = static_cast<float>(chargeMap[peak].unpack());
-        int t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
         CfChargePos tmp_pos(row + r, pad + p, time + t);
         if (dtype == 0) {
           clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
@@ -179,9 +180,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   if (dtype == 0) {
     processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold);
   } else if (dtype == 1) {
@@ -190,14 +191,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
 {
   auto& clustererNN = processors.tpcNNClusterer[sector];
-  uint glo_idx = get_global_id(0);
-  uint elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
+  uint32_t glo_idx = get_global_id(0);
+  uint32_t elem_iterator = glo_idx * clustererNN.mNnClusterizerModelClassNumOutputNodes;
   float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
-  uint class_label = 0;
-  for (uint pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
+  uint32_t class_label = 0;
+  for (uint32_t pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes; pIdx++) {
     if (pIdx == elem_iterator) {
       if (dtype == 0) {
         current_max_prob = static_cast<float>(clustererNN.mModelProbabilities_16[pIdx]);
@@ -212,7 +213,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   if (class_label > 1) {
     clustererNN.mClusterFlags[2 * glo_idx] = 1;
@@ -221,25 +222,30 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+  uint32_t maxClusterNum = clusterer.mPmemory->counters.nClusters;
+  uint32_t full_glo_idx = glo_idx + batchStart;
+  if (full_glo_idx >= maxClusterNum) {
+    return;
+  }
+  int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
+
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  uint full_glo_idx = glo_idx + batchStart;
-  int model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes;
 
   // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();
 
-  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
+  if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes != -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) {
 
     ClusterAccumulator pc;
@@ -291,7 +297,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
-  uint glo_idx = get_global_id(0);
+  uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
-  CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart];
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))];
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
 
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
   tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;
-  uint full_glo_idx = glo_idx + batchStart;
-  int model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
+  uint32_t full_glo_idx = glo_idx + batchStart;
+  uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes;
 
   if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
@@ -384,7 +390,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   return (row > 62 ? global_shift : 0);
 }
 
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift)
 {
   if (pad < 0 || row < 0) { // Faster short-circuit
     return true;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index dc7f537c6c1e8..dac2bf9554849 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -73,11 +73,12 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 
  private:
   static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint);
   static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
+  static GPUd() uint32_t sortIntoBuckets(GPUTPCClusterFinder&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t);
   static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint);
-  static GPUd() int padOffset(int, int);
-  static GPUd() int rowOffset(int, int);
-  static GPUd() bool isBoundary(int, int, int);
+  static GPUd() int32_t padOffset(int32_t, int32_t);
+  static GPUd() int32_t rowOffset(int32_t, int32_t);
+  static GPUd() bool isBoundary(int32_t, int32_t, int32_t);
 };
 
 } // namespace o2::gpu
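One pattern worth noting across the kernel hunks above: indices derived from `glo_idx + batchStart` are now clamped with `CAMath::Min(..., nClusters - 1)`, or the kernel returns early once `full_glo_idx >= maxClusterNum`, so threads of the last, partially filled batch never read past `mPfilteredPeakPositions`. A tiny host-side analogue of that guard, with hypothetical names and plain `std::min` instead of `CAMath::Min`:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical illustration of the batched-index guard used in the NN clusterizer kernels.
float readBatchedPeak(const std::vector<float>& peaks, uint32_t glo_idx, uint32_t batchStart)
{
  const uint32_t nClusters = static_cast<uint32_t>(peaks.size());
  const uint32_t full_idx = glo_idx + batchStart;
  if (full_idx >= nClusters) {
    return 0.f; // analogue of the early return in the regression-publishing kernel
  }
  // Clamping keeps the access in bounds even for callers that skip the early-out,
  // mirroring CAMath::Min(glo_idx + batchStart, nClusters - 1) in the kernels.
  const uint32_t safe_idx = std::min<uint32_t>(full_idx, nClusters - 1);
  return peaks[safe_idx];
}
```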