diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 540fe8ebf271c..2db91fc4f4320 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -9,21 +9,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. -# Pass ORT variables as a preprocessor definition -if(ORT_ROCM_BUILD) - add_compile_definitions(ORT_ROCM_BUILD=1) -endif() -if(ORT_CUDA_BUILD) - add_compile_definitions(ORT_CUDA_BUILD=1) -endif() -if(ORT_MIGRAPHX_BUILD) - add_compile_definitions(ORT_MIGRAPHX_BUILD=1) -endif() -if(ORT_TENSORRT_BUILD) - add_compile_definitions(ORT_TENSORRT_BUILD=1) -endif() - o2_add_library(ML SOURCES src/OrtInterface.cxx TARGETVARNAME targetName PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) + +# Pass ORT variables as a preprocessor definition +target_compile_definitions(${targetName} PRIVATE + $<$:ORT_ROCM_BUILD> + $<$:ORT_CUDA_BUILD> + $<$:ORT_MIGRAPHX_BUILD> + $<$:ORT_TENSORRT_BUILD>) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 76fd6734cf9db..9516ba5dad573 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType } // namespace o2 -#endif \ No newline at end of file +#endif diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..e37b6a69b6036 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,6 +26,13 @@ // O2 includes #include "Framework/Logger.h" +namespace Ort +{ +struct SessionOptions; +struct MemoryInfo; +struct Env; +} // namespace Ort + namespace o2 { @@ -36,14 +43,52 @@ class OrtModel { public: - // Constructor + // Constructors & destructors OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } - void init(std::unordered_map optionsMap) { reset(optionsMap); } - void reset(std::unordered_map); + OrtModel(std::unordered_map optionsMap) { init(optionsMap); } + void init(std::unordered_map optionsMap) + { + initOptions(optionsMap); + initEnvironment(); + } + virtual ~OrtModel() = default; + + // General purpose + void initOptions(std::unordered_map optionsMap); + void initEnvironment(); + void initSession(); + void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } + void resetSession(); - virtual ~OrtModel() = default; + // Getters + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + Ort::SessionOptions* getSessionOptions(); + Ort::MemoryInfo* getMemoryInfo(); + Ort::Env* getEnv(); + int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } + int32_t getInterOpNumThreads() const { return interOpNumThreads; } + + // Setters + void setDeviceId(int32_t id) { deviceId = id; } + void setIO(); + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setIntraOpNumThreads(int threads) + { + if (deviceType == "CPU") { + intraOpNumThreads = threads; + } + } + void setInterOpNumThreads(int threads) + { + if (deviceType == "CPU") { + interOpNumThreads = threads; + } + } + void setEnv(Ort::Env*); // Conversion template @@ -53,41 +98,36 @@ class OrtModel template // class I is 
the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + template std::vector inference(std::vector>&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - void inference(I*, size_t, O*); - - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); - - // Reset session - void resetSession(); + template + void inference(I*, int64_t, O*); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + template + void inference(I**, int64_t, O*); - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void release(bool = false); private: - // ORT variables -> need to be hidden as Pimpl + // ORT variables -> need to be hidden as pImpl struct OrtVariables; OrtVariables* pImplOrt; // Input & Output specifications of the loaded network std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; + std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); + std::string printShape(const std::vector>&, std::vector&); }; } // namespace ml diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 88f548bd4fe7b..24a2fbffb252c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -33,11 +33,12 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::SessionOptions sessionOptions; Ort::AllocatorWithDefaultOptions allocator; Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + std::unique_ptr ioBinding = nullptr; }; -void OrtModel::reset(std::unordered_map optionsMap) +// General purpose +void OrtModel::initOptions(std::unordered_map optionsMap) { - pImplOrt = new OrtVariables(); // Load from options map @@ -47,77 +48,60 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? 
optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + deviceType = (optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; -#if defined(ORT_ROCM_BUILD) - if (device == "ROCM") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) ROCM execution provider set"; - } -#endif -#if defined(ORT_MIGRAPHX_BUILD) - if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) MIGraphX execution provider set"; - } -#endif -#if defined(ORT_CUDA_BUILD) - if (device == "CUDA") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) CUDA execution provider set"; - dev_mem_str = "Cuda"; + envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); + + if (deviceType == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } -#endif - if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; - } + // OrtROCMProviderOptions rocm_options{}; + // (pImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); - if (intraOpNumThreads > 1 || interOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; - } - } - - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } - } else { - (pImplOrt->sessionOptions).DisableProfiling(); - } - mInitialized = true; + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + mInitialized = true; + } else { + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } +} +void OrtModel::initEnvironment() +{ pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + (envName.empty() ? 
"ORT" : envName.c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { if (severity == ORT_LOGGING_LEVEL_VERBOSE) { @@ -136,31 +120,48 @@ void OrtModel::reset(std::unordered_map optionsMap) }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); +} - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); +void OrtModel::initSession() +{ + if (allocateDeviceMemory) { + memoryOnDevice(deviceId); } + pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); + + setIO(); - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - } if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + LOG(info) << "(ORT) Model loaded successfully! 
(inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; + } +} + +void OrtModel::memoryOnDevice(int32_t deviceIndex) +{ +#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD)) + if (deviceIndex >= 0) { + (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); + (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + // Arena memory shrinkage comes at performance cost + /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + + std::string dev_mem_str = ""; + if (deviceType == "ROCM") { + dev_mem_str = "Hip"; + } + if (deviceType == "CUDA") { + dev_mem_str = "Cuda"; + } + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; + } } +#endif } void OrtModel::resetSession() @@ -168,6 +169,22 @@ void OrtModel::resetSession() pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } +// Getters +Ort::SessionOptions* OrtModel::getSessionOptions() +{ + return &pImplOrt->sessionOptions; +} + +Ort::MemoryInfo* OrtModel::getMemoryInfo() +{ + return &pImplOrt->memoryInfo; +} + +Ort::Env* OrtModel::getEnv() +{ + return (pImplOrt->env).get(); +} + template std::vector OrtModel::v2v(std::vector& input, bool clearInput) { @@ -183,20 +200,70 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -std::string OrtModel::printShape(const std::vector& v) +void OrtModel::setIO() { - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) { - ss << v[i] << "x"; + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } - ss << v[v.size() - 1]; - return ss.str(); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < 
(pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + inputShapesCopy = mInputShapes; + outputShapesCopy = mOutputShapes; + inputSizePerNode.resize(mInputShapes.size(), 1); + outputSizePerNode.resize(mOutputShapes.size(), 1); + mInputsTotal = 1; + for (size_t i = 0; i < mInputShapes.size(); ++i) { + if (mInputShapes[i].size() > 0) { + for (size_t j = 1; j < mInputShapes[i].size(); ++j) { + if (mInputShapes[i][j] > 0) { + mInputsTotal *= mInputShapes[i][j]; + inputSizePerNode[i] *= mInputShapes[i][j]; + } + } + } + } + mOutputsTotal = 1; + for (size_t i = 0; i < mOutputShapes.size(); ++i) { + if (mOutputShapes[i].size() > 0) { + for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { + if (mOutputShapes[i][j] > 0) { + mOutputsTotal *= mOutputShapes[i][j]; + outputSizePerNode[i] *= mOutputShapes[i][j]; + } + } + } + } +} + +void OrtModel::setEnv(Ort::Env* env) +{ + pImplOrt->env = std::shared_ptr(env); } +// Inference template std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape = mInputShapes[0]; + inputShape[0] = input.size(); + for (size_t i = 1; i < mInputShapes[0].size(); ++i) { + inputShape[0] /= mInputShapes[0][i]; + } std::vector inputTensor; if constexpr (std::is_same_v) { inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -212,51 +279,182 @@ std::vector OrtModel::inference(std::vector& input) } template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output) +void OrtModel::inference(I* input, int64_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + // std::vector providers = Ort::GetAvailableProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Available Execution Provider: " << provider; + // } + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), 
inputShape.size()); } + (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } else { + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } + (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); +template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float*, int64_t, float*); + +template +void OrtModel::inference(I** input, int64_t input_size, O* output) +{ + std::vector inputTensors(inputShapesCopy.size()); + + for (size_t i = 0; i < inputShapesCopy.size(); ++i) { + + inputShapesCopy[i][0] = input_size; // batch-size + outputShapesCopy[i][0] = input_size; // batch-size + + if constexpr (std::is_same_v) { + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); + } else { + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + input[i], + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); + } + } + + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(output), + outputSizePerNode[0] * input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } else { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + output, + outputSizePerNode[0] * input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } + + // === Run inference === + pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + inputTensors.data(), + inputNamesChar.size(), + outputNamesChar.data(), + &outputTensor, + outputNamesChar.size()); +} -template void OrtModel::inference(float*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); +template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, int64_t, float*); template -std::vector OrtModel::inference(std::vector>& input) +std::vector 
OrtModel::inference(std::vector>& inputs) { - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector input_tensors; + + for (size_t i = 0; i < inputs.size(); ++i) { + + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size + if constexpr (std::is_same_v) { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } else { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + + int32_t totalOutputSize = mOutputsTotal * inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); + + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; +} + +template std::vector OrtModel::inference(std::vector>&); +template std::vector OrtModel::inference(std::vector>&); + +// Release session +void OrtModel::release(bool profilingEnabled) +{ + // if (profilingEnabled) { + // pImplOrt->session->EndProfiling(); + // } + LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; +} + +// private +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) { + ss << v[i] << "x"; + } + ss << v[v.size() - 1]; + return ss.str(); +} + +std::string OrtModel::printShape(const std::vector>& v, std::vector& n) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size(); i++) { + ss << n[i] << " -> ("; + for (size_t j = 0; j < v[i].size() - 1; j++) { + ss << v[i][j] << "x"; + } + ss << v[i][v[i].size() - 1] << "); "; + } + return ss.str(); } } // namespace ml diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 0ec62e5f323b3..7722fc4e2884f 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,6 +25,7 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx + src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -82,6 +83,7 @@ o2_target_root_dictionary(TPCCalibration 
include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h + include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h new file mode 100644 index 0000000000000..196bba644714c --- /dev/null +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -0,0 +1,38 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.h +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h +#define AliceO2_TPC_NeuralNetworkClusterizer_h + +#include "CCDB/CcdbApi.h" + +namespace o2::tpc +{ + +class NeuralNetworkClusterizer +{ + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; +}; + +} // namespace o2::tpc +#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx new file mode 100644 index 0000000000000..bfbb7afc946f8 --- /dev/null +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -0,0 +1,48 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
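A hedged usage sketch of the new NeuralNetworkClusterizer class declared above (not part of the patch): the settings keys mirror the ones read in loadIndividualFromCCDB() below, the CCDB URL, path and beam/rate values reuse the defaults added to GPUSettingsList.h, the output file name is a placeholder, and the string-to-string map type is assumed from how the settings are used.

```cpp
#include "TPCCalibration/NeuralNetworkClusterizer.h"
#include <map>
#include <string>

// Fetch a classification network blob from CCDB into the local working directory.
void fetchClassificationNetwork()
{
  o2::tpc::NeuralNetworkClusterizer nnFetcher;
  nnFetcher.initCcdbApi("http://ccdb-test.cern.ch:8080"); // test CCDB instance (default nnCCDBURL)

  std::map<std::string, std::string> settings{
    {"nnCCDBPath", "Users/c/csonnabe/TPC/Clusterization"}, // CCDB folder holding the networks
    {"inputDType", "FP16"},
    {"outputDType", "FP32"},
    {"nnCCDBEvalType", "classification_1C"},               // see the comments in loadIndividualFromCCDB()
    {"nnCCDBWithMomentum", "0"},
    {"nnCCDBLayerType", "FC"},
    {"nnCCDBBeamType", "PbPb"},
    {"nnCCDBInteractionRate", "50"},
    {"outputFolder", "."},
    {"outputFile", "network_class.onnx"}};                 // placeholder local file name

  nnFetcher.loadIndividualFromCCDB(settings);              // retrieves the blob and logs success/failure
}
```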
+ +/// \file NeuralNetworkClusterizer.cxx +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#include +#include "TPCCalibration/NeuralNetworkClusterizer.h" + +using namespace o2::tpc; + +void NeuralNetworkClusterizer::initCcdbApi(std::string url) +{ + ccdbApi.init(url); +} + +void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) +{ + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 163b00c804d7f..1174fcd8a38d7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,6 +24,11 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { @@ -108,6 +113,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels #include +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 3655eaf66055e..f595fb051db54 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -114,6 +114,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES O2::GPUTracking O2::ITStrackingCUDA + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src @@ -121,6 +122,11 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + $<$:ORT_CUDA_BUILD> + $<$:ORT_TENSORRT_BUILD>) + install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 3bea91994ba86..741f160158b43 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -37,6 +37,10 @@ #undef GPUCA_KRNL #endif +#ifdef GPUCA_HAS_ONNX +#include +#endif + static constexpr size_t REQUIRE_MIN_MEMORY = 1024L * 1024 * 1024; static constexpr size_t REQUIRE_MEMORY_RESERVED = 512L * 1024 * 1024; static constexpr size_t REQUIRE_FREE_MEMORY_RESERVED_PER_SM = 40L * 1024 * 1024; 
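A hedged sketch (not part of the patch) of the call order around the SetONNXGPUStream() hook implemented in the next hunk: the backend appends its CUDA/ROCm execution provider with a user compute stream to the model's session options before the session is created. In the patch this sequence lives inside GPUChainTracking::RunTPCClusterizer(), where SetONNXGPUStream() is a chain member; the free helper and its template parameter are illustrative only, and access specifiers and error handling are omitted.

```cpp
#include "ML/OrtInterface.h" // o2::ml::OrtModel (also forward-declares Ort::SessionOptions)
#include <cstdint>

// Illustrative helper: attach the reconstruction stream of one clusterer lane to an
// OrtModel before its session is created, so ONNX Runtime reuses that stream instead
// of spawning its own.
template <typename Chain>
void initModelOnGpuStream(Chain& chain, o2::ml::OrtModel& model, int32_t lane)
{
  int32_t deviceId = -1;
  chain.SetONNXGPUStream(*model.getSessionOptions(), lane, &deviceId); // appends CUDA/ROCm EP with user_compute_stream
  model.setDeviceId(deviceId); // remembered for on-device Ort::MemoryInfo allocation
  model.initEnvironment();     // ORT environment with Fairlogger-backed logging
  model.initSession();         // creates the session and the Ort::IoBinding
}
```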
@@ -648,6 +652,28 @@ void GPUReconstructionCUDA::endGPUProfiling() { GPUChkErr(cudaProfilerStop()); } + +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) +{ +#ifdef ORT_CUDA_BUILD + cudaGetDevice(deviceId); + OrtCUDAProviderOptionsV2* cuda_options = nullptr; + CreateCUDAProviderOptions(&cuda_options); + + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; + // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; + // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); + + // this implicitly sets "has_user_compute_stream" + cuda_options.has_user_compute_stream = 1; + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); + + // Finally, don't forget to release the provider options + ReleaseCUDAProviderOptions(cuda_options); +#endif // ORT_CUDA_BUILD +} + #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) { @@ -655,6 +681,22 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) GPUChkErr(hipHostGetDevicePointer(&retVal, ptr, 0)); return retVal; } + +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) +{ +#ifdef ORT_ROCM_BUILD + // Create ROCm provider options + cudaGetDevice(deviceId); + // const auto& api = Ort::GetApi(); + // api.GetCurrentGpuDeviceId(deviceId); + OrtROCMProviderOptions rocm_options; + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit + rocm_options.user_compute_stream = mInternals->Streams[stream]; + session_options.AppendExecutionProvider_ROCM(rocm_options); +#endif // ORT_ROCM_BUILD +} #endif // __HIPCC__ namespace o2::gpu diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index b1a3a53a6a62f..6bac1fefb2346 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,6 +25,11 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { struct GPUReconstructionCUDAInternals; @@ -74,6 +79,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 3a03a054d4a7e..d7adb222d547b 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -162,6 +162,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES O2::GPUTracking O2::ITStrackingHIP + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src @@ -169,6 +170,11 @@ if(ALIGPU_BUILD_TYPE 
STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + $<$:ORT_ROCM_BUILD> + $<$:ORT_MIGRAPHX_BUILD>) + install(FILES ${HDRS} DESTINATION include/GPU) # o2_add_test(GPUsortHIP NAME test_GPUsortHIP diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index eaeec508ff27a..e82799b9e59c3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -341,6 +341,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::DetectorsRaw O2::Steer O2::ML + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS} SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H}) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index b212abbcd2707..a84fce453a34a 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -493,6 +493,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 34fac6514851c..64e41e94b3335 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -247,7 +247,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. 
Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -268,6 +269,17 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 290ae32cafca8..59712c30a62dd 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,6 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } + inline void SetONNXGPUStream(Ort::SessionOptions& opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 4047dcae0a6b3..7db0ba66305e9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -611,49 +611,89 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } + if (doGPU && mIOPtrs.tpcZS) { + processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + } + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } + #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; + if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); - } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + int32_t deviceId = -1; + int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + int32_t maxThreads = mRec->getNKernelHostThreads(true); + // bool recreateMemoryAllocator = false; + mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { + nnApplications[lane].init(nn_settings); + if (nnApplications[lane].modelsUsed[0]) { + SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_class).setDeviceId(deviceId); + if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); + } + (nnApplications[lane].model_class).initEnvironment(); + // Registering this once seems to be enough, even with different environmnents / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation. + // But environment must be valid, so we init the model environment first and use it here afterwards. + // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator. + // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); + // recreateMemoryAllocator = true; + (nnApplications[lane].model_class).initSession(); } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + if (nnApplications[lane].modelsUsed[1]) { + SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); + } + // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv()); + (nnApplications[lane].model_reg_1).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].model_reg_1).initSession(); + } + if (nnApplications[lane].modelsUsed[2]) { + SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); + } + (nnApplications[lane].model_reg_2).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), 
(nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].model_reg_2).initSession(); + } + if (nn_settings.nnClusterizerVerbosity < 3) { + LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; + } + }); + mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; + int32_t lane = sector % numLanes; + clustererNN.deviceId = deviceId; + clustererNN.mISector = sector; + clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNN); + if (doGPU) { + clustererNNShadow.deviceId = deviceId; + clustererNNShadow.mISector = sector; + clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); + }); + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } + LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; + LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif - if (doGPU && mIOPtrs.tpcZS) { - processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); - } - if (doGPU) { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); - } - size_t nClsTotal = 0; ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); ClusterNative* tmpNativeClusters = nullptr; @@ -914,58 +954,122 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; + + int withMC = (doGPU && propagateMCLabels); - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - float time_clusterizer = 0, time_fill = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + float time_clusterizer = 0, time_fill = 0, time_networks = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + // auto stop0 = std::chrono::high_resolution_clock::now(); + // auto start1 = std::chrono::high_resolution_clock::now(); + + // NN evaluations + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); + } + } + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32); + } + } else if 
(clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32); + } + } + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); + } + } + } + } + + // auto stopNNs = std::chrono::high_resolution_clock::now(); + + // Publishing kernels if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } - - if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - time_fill += std::chrono::duration_cast(stop0 - 
start0).count() / 1e9; + // for(int i = 0; i < iSize; ++i) { + // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) { + // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; + // // std::string input = "["; + // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){ + // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", "; + // // } + // // input += "]"; + // // LOG(info) << "Input is: " << input; + // } + // } + + // auto stop1 = std::chrono::high_resolution_clock::now(); + + // time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clustererNN.nnClusterizerVerbosity < 3) { - int acceptedClusters = 0; - for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNN.outputDataClass[i]; - } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } + // if (clustererNNShadow.nnClusterizerVerbosity < 3) { + // int acceptedClusters = 0; + // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) { + // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters; + // } + // acceptedClusters += clustererNNShadow.outputDataClass[i]; + // } + // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // } #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif @@ -1066,6 +1170,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { + // if (GetProcessingSettings().nn.applyNNclusterizer) { + // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..fe3202fe7b439 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,29 +24,73 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + + if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_16, 
nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } } } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); + } return mem; } +// std::vector GPUTPCNNClusterizer::pointerSizes() { +// std::vector sizes(7, -1); +// if (nnClusterizerBatchedMode > 0) { +// if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { +// sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 +// } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { +// sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 +// } +// sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags +// if (nnClusterizerModelClassNumOutputNodes > 0) { +// sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities +// } +// if (!nnClusterizerUseCfRegression) { +// if (nnClusterizerModelReg1NumOutputNodes > 0) { +// sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 +// } +// if (nnClusterizerModelReg2NumOutputNodes > 0) { +// sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 +// } +// } +// } +// if (nnClusterizerTotalClusters > 0) { +// sizes[6] = nnClusterizerTotalClusters; // outputDataClass +// } +// return sizes; +// } + void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..da490b0f94d58 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,33 +42,38 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; int nnClusterizerTotalClusters = 1; int nnClusterizerVerbosity = 0; int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceOutputDType = 0; // 0: float16, 1: float32 int mISector = -1; + int deviceId = -1; // Memory allocation for neural network - uint class2_elements = 0; - float* inputData32 = nullptr; - OrtDataType::Float16_t* inputData16 = nullptr; - float* outputDataClass = nullptr; - float* modelProbabilities = nullptr; - float* outputDataReg1 = 
nullptr; - float* outputDataReg2 = nullptr; - ChargePos* peakPositions = nullptr; - bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int* outputDataClass = nullptr; + + // FP32 + float* inputData_32 = nullptr; + float* modelProbabilities_32 = nullptr; + float* outputDataReg1_32 = nullptr; + float* outputDataReg2_32 = nullptr; + + // FP16 + OrtDataType::Float16_t* inputData_16 = nullptr; + OrtDataType::Float16_t* modelProbabilities_16 = nullptr; + OrtDataType::Float16_t* outputDataReg1_16 = nullptr; + OrtDataType::Float16_t* outputDataReg2_16 = nullptr; + int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..db2f05711f537 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,54 +12,205 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUReconstruction.h" + +#ifdef GPUCA_HAS_ONNX +#include +#endif using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); + + if (settings.nnLoadFromCCDB) { + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + if (evalMode[0] == "c1") { + class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; + } else if (evalMode[0] == "c2") { + class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx"; + } + + if (evalMode[1] == "r2") { + reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx"; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, - {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"model-path", class_model_path}, + {"device-type", settings.nnInferenceDevice}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, + {"onnx-environment-name", "c1"}}; - model_class.init(OrtOptions); - 
clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + model_class.initOptions(OrtOptions); + modelsUsed[0] = true; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r1"; + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r1"; + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; - model_reg_2.init(OrtOptions); - clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r2"; + model_reg_2.initOptions(OrtOptions); + modelsUsed[2] = true; } } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) { - if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + clustererNN.nnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; + clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); + } else { + clustererNN.nnClassThreshold = settings.nnClassThreshold; + } + if (settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; + } + clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; + clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +// MockedOrtAllocator implementation to be able to use volatile assignment +struct MockedOrtAllocator : OrtAllocator { + MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr); + ~MockedOrtAllocator(); + + void* Alloc(size_t size); + void Free(void* p); + const OrtMemoryInfo* Info() const; + void* Reserve(size_t size); + size_t NumAllocations() const; + size_t NumReserveAllocations() const; + + void LeakCheck(); + + private: + MockedOrtAllocator(const MockedOrtAllocator&) = delete; + MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; + + std::atomic memory_inuse{0}; + std::atomic num_allocations{0}; + std::atomic num_reserve_allocations{0}; + OrtMemoryInfo* memory_info; + GPUReconstruction* rec; +}; + +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) +{ + OrtAllocator::version = ORT_API_VERSION; + OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; + OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; + OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; + OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Reserve(size); }; + rec = r; + memory_info = info; +} + +MockedOrtAllocator::~MockedOrtAllocator() +{ + // Ort::GetApi().ReleaseMemoryInfo(memory_info); +} + +void* MockedOrtAllocator::Alloc(size_t size) +{ + // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes"; + return rec->AllocateVolatileDeviceMemory(size); +} + +void* MockedOrtAllocator::Reserve(size_t size) +{ + // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes"; + return rec->AllocateVolatileDeviceMemory(size); +} + +void MockedOrtAllocator::Free(void* p) +{ + // LOG(info) << "(ORT) Freeing volatile memory " << p; + rec->ReturnVolatileDeviceMemory(); +} + +const OrtMemoryInfo* MockedOrtAllocator::Info() const +{ + return memory_info; +} + +size_t 
MockedOrtAllocator::NumAllocations() const +{ + return num_allocations.load(); +} + +size_t MockedOrtAllocator::NumReserveAllocations() const +{ + return num_reserve_allocations.load(); +} + +void MockedOrtAllocator::LeakCheck() +{ + if (memory_inuse.load()) + LOG(warning) << "memory leak!!!"; +} + +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) +{ + mockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); + if (recreate) { + Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo))); } + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get())); + memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); +} + +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() +{ + return mockedAlloc->Info(); +} + +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() +{ + return mockedAlloc.get(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..0379b83d0ae02 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,6 +22,15 @@ using namespace o2::ml; +class OrtMemoryInfo; +class OrtAllocator; +struct MockedOrtAllocator; +namespace Ort +{ +struct Env; +struct MemoryInfo; +} // namespace Ort + namespace o2::OrtDataType { struct Float16_t; @@ -30,6 +39,7 @@ struct Float16_t; namespace o2::gpu { +class GPUReconstruction; class GPUTPCNNClusterizer; struct GPUSettingsProcessingNNclusterizer; @@ -37,30 +47,23 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } + + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + // ONNX + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); + MockedOrtAllocator* getMockedAllocator(); + const OrtMemoryInfo* getMockedMemoryInfo(); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + int32_t deviceId = -1; std::vector reg_model_paths; - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } + std::shared_ptr mockedAlloc = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 379ea27443fea..2cf9ab2037007 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -35,7 +35,7 @@ using 
namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -45,109 +45,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current) -{ - return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - - clustererNN.peakPositions[glo_idx] = peak; - clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + #ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); #endif @@ -160,20 +77,20 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } if (dtype == 0) { - clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else { - clustererNN.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } write_idx++; @@ -182,66 +99,191 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } if (clustererNN.nnClusterizerAddIndexData) { if (dtype == 0) { - clustererNN.inputData16[write_idx] = 
(OrtDataType::Float16_t)(clusterer.mISector / 36.f); - clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + } else { + clustererNN.inputData_32[write_idx] = sector / 36.f; + clustererNN.inputData_32[write_idx + 1] = row / 152.f; + clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint base_idx = CAMath::Floor(glo_idx / clustererNN.nnClusterizerElementSize); + uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize; + + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()); + + if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { + uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + ChargePos tmp_pos = peak.delta(d); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; - clustererNN.inputData32[write_idx + 1] = row / 152.f; - clustererNN.inputData32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.inputData_32[top_idx - 3] = sector / 36.f; + clustererNN.inputData_32[top_idx - 2] = row / 152.f; + clustererNN.inputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) { + int time = static_cast(peak.time()); + int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + if (is_row_boundary) { + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } else 
{ + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1)); + int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset; + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); + + if (!is_boundary) { + float central_charge = static_cast(chargeMap[peak].unpack()); + int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; + ChargePos tmp_pos(row + r, pad + p, time + t); + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } } } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (dtype == 0) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].modelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].nnClassThreshold); + } else if (dtype == 1) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + if (dtype == 0) { + current_max_prob = static_cast(clustererNN.modelProbabilities_16[pIdx]); + } else if (dtype == 1) { + current_max_prob = clustererNN.modelProbabilities_32[pIdx]; + } + } else { + if (dtype == 0) { + current_max_prob = CAMath::Max(current_max_prob, 
clustererNN.modelProbabilities_16[pIdx].ToFloat()); + } else if (dtype == 1) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_32[pIdx]); + } + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] == 1) { + if (clustererNN.outputDataClass[full_glo_idx] == 1 || (clustererNN.nnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.outputDataClass[full_glo_idx] >= 1)) { ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], - clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], - clustererNN.outputDataReg1[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg1_16[model_output_index + 4].ToFloat(), + 
static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -250,11 +292,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) { rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -264,7 +306,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -273,38 +315,41 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -312,16 +357,26 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } // Cluster 1 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], - clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], - clustererNN.outputDataReg2[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 8].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -330,11 +385,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) 
{ rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -344,18 +399,28 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], - clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], - clustererNN.outputDataReg2[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); - - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 9].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } + + rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -363,11 +428,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) { rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -377,7 +442,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? 
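
For reference, the flat layout of the regression outputs consumed by the two publishing kernels above can be sketched on the host. The snippet below is an illustrative aside, not part of the patch: it uses plain float buffers and hypothetical helpers decodeReg1/decodeReg2, whereas the kernels read the same indices from outputDataReg1_16/_32 and outputDataReg2_16/_32 (with .ToFloat() in the FP16 case) and hand them to ClusterAccumulator::setFull(); the interpretation of the sigma entries is assumed from the setFull argument order and the time offset is simplified (the kernels additionally add fragment.start).

// Standalone sketch (not part of the patch): how the flat regression outputs are indexed.
// reg_1 emits 5 values per peak:  [dPad, dTime, sigmaPad, sigmaTime, qRatio]
// reg_2 emits 10 values per peak: [dPad1, dPad2, dTime1, dTime2, sigmaPad1, sigmaPad2,
//                                  sigmaTime1, sigmaTime2, qRatio1, qRatio2]
#include <cstdio>
#include <vector>

struct DecodedCluster {
  float pad, time, sigmaPad, sigmaTime, qtot;
};

// Hypothetical helper mirroring the index arithmetic of publishClass1Regression.
DecodedCluster decodeReg1(const std::vector<float>& out, int idx, float peakPad, float peakTime, float centralCharge)
{
  const int o = idx * 5; // nnClusterizerModelReg1NumOutputNodes == 5 assumed here
  return {peakPad + out[o + 0], peakTime + out[o + 1], out[o + 2], out[o + 3], centralCharge * out[o + 4]};
}

// Hypothetical helper mirroring publishClass2Regression; cluster = 0 or 1 selects the split cluster.
DecodedCluster decodeReg2(const std::vector<float>& out, int idx, int cluster, float peakPad, float peakTime, float centralCharge)
{
  const int o = idx * 10; // nnClusterizerModelReg2NumOutputNodes == 10 assumed here
  return {peakPad + out[o + 0 + cluster], peakTime + out[o + 2 + cluster],
          out[o + 4 + cluster], out[o + 6 + cluster], centralCharge * out[o + 8 + cluster]};
}

int main()
{
  std::vector<float> reg1 = {0.05f, -0.1f, 0.7f, 0.9f, 0.95f};
  std::vector<float> reg2 = {0.1f, -0.2f, 0.0f, 0.3f, 0.8f, 0.9f, 1.1f, 1.0f, 0.6f, 0.4f};
  DecodedCluster c0 = decodeReg1(reg1, 0, 40.f, 100.f, 50.f);
  DecodedCluster c1 = decodeReg2(reg2, 0, 0, 40.f, 100.f, 50.f);
  DecodedCluster c2 = decodeReg2(reg2, 0, 1, 40.f, 100.f, 50.f);
  std::printf("single cluster: pad %.2f time %.2f qtot %.1f\n", c0.pad, c0.time, c0.qtot);
  std::printf("split cluster 1: pad %.2f time %.2f qtot %.1f\n", c1.pad, c1.time, c1.qtot);
  std::printf("split cluster 2: pad %.2f time %.2f qtot %.1f\n", c2.pad, c2.time, c2.qtot);
  return 0;
}
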
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -385,3 +450,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha
    return;
  }
 }
+
+// The following arithmetic is done because the network is trained with a split at the IROC-OROC boundary
+GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current)
+{
+  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+}
+
+GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift)
+{
+  return (row > 62 ? global_shift : 0);
+}
+
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
+{
+  if (pad < 0 || row < 0) { // Faster short-circuit
+    return true;
+  } else if (row < 63) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
+  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
+    return true;
+  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
+  } else {
+    return true;
+  }
+}
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index a1d641fdb0b93..27cfba2487aed 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -39,6 +39,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 public:
  // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+ static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
@@ -59,10 +60,11 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
  enum K : int32_t {
    runCfClusterizer = 0,
    fillInputNN = 1,
-   determineClass1Labels = 2,
-   determineClass2Labels = 3,
-   publishClass1Regression = 4,
-   publishClass2Regression = 5,
+   fillInputNNSingleElement = 2,
+   determineClass1Labels = 3,
+   determineClass2Labels = 4,
+   publishClass1Regression = 5,
+   publishClass2Regression = 6,
  };

 template
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index ee3af2b87d925..23ab0f0da5bac 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -116,6 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUS
 if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index 0038233f1c376..73f1f208e8889 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -83,6 +83,7 @@ class GPUO2Interface; struct TPCPadGainCalib; struct TPCZSLinkMapping; struct GPUSettingsO2; +struct GPUSettingsProcessingNNclusterizer; class GPUO2InterfaceQA; struct GPUTrackingInOutPointers; struct GPUTrackingInOutZS; @@ -225,6 +226,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; + std::unique_ptr mNNClusterizerSettings; + Config mSpecConfig; std::shared_ptr mGGR; bool mGRPGeomUpdated = false; diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 7ad03ec58ae80..8a755a703705f 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,6 +78,7 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" +#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -132,6 +133,50 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; + + if (mNNClusterizerSettings.nnLoadFromCCDB) { + LOG(info) << "Loading neural networks from CCDB"; + o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); + std::map ccdbSettings = { + {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings.nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); + + if (evalMode[0] == "c1") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c1"; + ccdbSettings["outputFile"] = "net_classification_c1.onnx"; + 
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    } else if (evalMode[0] == "c2") {
+      ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType;
+      ccdbSettings["nnCCDBEvalType"] = "classification_c2";
+      ccdbSettings["outputFile"] = "net_classification_c2.onnx";
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    }
+
+    ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType;
+    ccdbSettings["nnCCDBEvalType"] = "regression_c1";
+    ccdbSettings["outputFile"] = "net_regression_c1.onnx";
+    nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    if (evalMode[1] == "r2") {
+      ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType;
+      ccdbSettings["nnCCDBEvalType"] = "regression_c2";
+      ccdbSettings["outputFile"] = "net_regression_c2.onnx";
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    }
+    LOG(info) << "Neural network loading done!";
+  }

   // Create configuration object and fill settings
   mConfig->configGRP.solenoidBzNominalGPU = 0;
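For reference, the CCDB fetching above is keyed off the `nnEvalMode` option, a colon-separated pair such as `c1:r1` or `c2:r2`: the first token selects which classification network is downloaded, the single-cluster regression network is always fetched, and a second token of `r2` additionally pulls the two-cluster regression network. The sketch below mirrors that selection logic in plain, self-contained C++ so it can be checked in isolation; the local `tokenize` helper and the size guard on the parsed option are illustrative stand-ins (the workflow code uses `o2::utils::Str::tokenize` and indexes `evalMode[1]` directly), while the returned file names match the ones requested from CCDB above.

```cpp
// Illustrative sketch of the nnEvalMode -> model-file selection used above.
// tokenize() is a stand-in for o2::utils::Str::tokenize; the size check is an
// extra safety guard that the workflow code does not perform.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> tokenize(const std::string& s, char delim)
{
  std::vector<std::string> tokens;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) {
    tokens.push_back(item);
  }
  return tokens;
}

// Returns the ONNX files that would be fetched for a given eval-mode string.
static std::vector<std::string> modelsForEvalMode(const std::string& evalModeStr)
{
  std::vector<std::string> files;
  const std::vector<std::string> evalMode = tokenize(evalModeStr, ':');
  if (evalMode.size() != 2) {
    return files; // malformed option: nothing is fetched
  }
  if (evalMode[0] == "c1") {
    files.push_back("net_classification_c1.onnx");
  } else if (evalMode[0] == "c2") {
    files.push_back("net_classification_c2.onnx");
  }
  files.push_back("net_regression_c1.onnx"); // single-cluster regression, always fetched
  if (evalMode[1] == "r2") {
    files.push_back("net_regression_c2.onnx"); // two-cluster regression on request
  }
  return files;
}

int main()
{
  for (const auto& mode : {"c1:r1", "c2:r2"}) {
    std::cout << mode << " ->";
    for (const auto& f : modelsForEvalMode(mode)) {
      std::cout << ' ' << f;
    }
    std::cout << '\n';
  }
  return 0;
}
```

For the two example modes this prints `c1:r1 -> net_classification_c1.onnx net_regression_c1.onnx` and `c2:r2 -> net_classification_c2.onnx net_regression_c1.onnx net_regression_c2.onnx`.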