diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 89631d59a3846..93549178848ca 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -41,6 +41,7 @@ class OrtModel OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } void init(std::unordered_map optionsMap) { reset(optionsMap); } void reset(std::unordered_map); + bool isInitialized() { return mInitialized; } virtual ~OrtModel() = default; @@ -55,6 +56,9 @@ class OrtModel template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + void inference(I*, size_t, O*); + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type // std::vector inference(std::vector&); @@ -79,8 +83,9 @@ class OrtModel std::vector> mInputShapes, mOutputShapes; // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + bool mInitialized = false; + std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index eb124ff6f12c9..fc784dd14d2dc 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -44,17 +44,20 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } - modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); - allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 2); - enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); - enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; + + if (!optionsMap["model-path"].empty()) { + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? 
std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; #if defined(ORT_ROCM_BUILD) #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { @@ -88,12 +91,15 @@ void OrtModel::reset(std::unordered_map optionsMap) if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } (pImplOrt->sessionOptions).DisableMemPattern(); @@ -109,6 +115,9 @@ void OrtModel::reset(std::unordered_map optionsMap) } else { (pImplOrt->sessionOptions).DisableProfiling(); } + + mInitialized = true; + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); @@ -154,16 +163,9 @@ void OrtModel::reset(std::unordered_map optionsMap) outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); - - // Print names - LOG(info) << "\tInput Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); } - - LOG(info) << "\tOutput Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; } } @@ -187,36 +189,6 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); - } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); @@ -227,74 +199,68 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> -std::vector OrtModel::inference(std::vector& input) +template +std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + O* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - 
std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <> -std::vector OrtModel::inference(std::vector& input) +template std::vector OrtModel::inference(std::vector&); + +template +void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + Ort::Value inputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + } else { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + } + + std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; + size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here } -template <> -std::vector OrtModel::inference(std::vector>& input) +template void 
OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); + +template void OrtModel::inference(float*, size_t, float*); + +template +std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + } } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index 4f83fa48a64e0..8f1cc90f5ae93 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -34,12 +34,15 @@ #include "GPUKernelDebugOutput.h" #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizer.h" +#endif + namespace o2::gpu { struct GPUConstantMem { GPUParam param; - GPUTPCTracker - tpcTrackers[GPUCA_NSECTORS]; + GPUTPCTracker tpcTrackers[GPUCA_NSECTORS]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; GPUTPCDecompression tpcDecompressor; @@ -55,6 +58,9 @@ struct GPUConstantMem { #ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT GPUKernelDebugOutput debugOutput; #endif +#ifdef GPUCA_HAS_ONNX + GPUTPCNNClusterizer tpcNNClusterer[GPUCA_NSECTORS]; +#endif template GPUd() auto& getTRDTracker(); diff --git a/GPU/GPUTracking/Base/GPUMemoryResource.h b/GPU/GPUTracking/Base/GPUMemoryResource.h index 3bb2c363db2a9..06e350db0bfc7 100644 --- a/GPU/GPUTracking/Base/GPUMemoryResource.h +++ b/GPU/GPUTracking/Base/GPUMemoryResource.h @@ -28,6 +28,7 @@ struct GPUMemoryReuse { }; enum Group : uint16_t { ClustererScratch, + NNClusterer, ClustererZS, TrackerScratch, TrackerDataLinks, diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 656fa37fb6a4c..df9a7380834ce 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -93,6 +93,9 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos for (uint32_t i = 0; i < NSECTORS; i++) { processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place processors()->tpcClusterer[i].mISector = i; +#ifdef GPUCA_HAS_ONNX + processors()->tpcNNClusterer[i].mISector = i; +#endif } #ifndef GPUCA_NO_ROOT mROOTDump = GPUROOTDumpCore::getAndCreate(); diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c97742ac1d47f..d5a90dbd65ea3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ 
b/GPU/GPUTracking/CMakeLists.txt @@ -159,8 +159,8 @@ set(HDRS_INSTALL ) set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) -set(SRCS_NO_CINT - ${SRCS_NO_CINT} + +set(SRCS_NO_CINT ${SRCS_NO_CINT} Global/GPUChainITS.cxx ITS/GPUITSFitter.cxx ITS/GPUITSFitterKernels.cxx @@ -191,6 +191,10 @@ set(SRCS_NO_CINT Refit/GPUTrackingRefitKernel.cxx Merger/GPUTPCGMO2Output.cxx) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx) +endif() + set(SRCS_DATATYPES ${SRCS_DATATYPES} DataTypes/TPCPadGainCalib.cxx @@ -273,6 +277,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} LINKDEF GPUTrackingLinkDef_O2_DataTypes.h) @@ -292,6 +297,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::TPCFastTransformation O2::DetectorsRaw O2::Steer + O2::ML PUBLIC_INCLUDE_DIRECTORIES . Definitions DataTypes @@ -317,7 +323,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 3ed6c25762405..55f2e76344bd5 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -477,6 +477,9 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels 512 + #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 #endif @@ -495,6 +498,16 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression + +#ifdef GPUCA_HAS_ONNX +#define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels +#endif + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_THREAD_COUNT_SCAN diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 9b6be7743e485..63fcf51004eae 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -222,6 +222,35 @@ AddOption(tpcTriggerHandling, bool, true, "", 0, "Enable TPC trigger 
handling") AddHelp("help", 'h') EndConfig() +BeginSubConfig(GPUSettingsProcessingNNclusterizer, nn, configStandalone.proc, "NN", 0, "Processing settings for neural network clusterizer", proc_nn) +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") +AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") +AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") +AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") +AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") +AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") +AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") +AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") +AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") +AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input") +AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN") +AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled") +AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") +AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") +AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") +AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") +AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") +AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") +AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The 
regression network path") +AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).") +AddHelp("help", 'h') +EndConfig() + BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc) AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)") AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)") @@ -299,6 +328,7 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) +AddSubConfig(GPUSettingsProcessingNNclusterizer, nn) AddHelp("help", 'h') EndConfig() #endif // __OPENCL__ diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h index ab60827655a43..35ebbabe41672 100644 --- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h +++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h @@ -30,6 +30,7 @@ #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessing + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingParam + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTC + ; +#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingNNclusterizer + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplay + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayLight + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayHeavy + ; diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 6dcb6f1d7e514..37ad164d20a60 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -103,6 +103,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUProcessor(&processors()->tpcNNClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#endif } } if (GetRecoSteps() & RecoStep::Refit) { @@ -148,6 +151,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcNNClusterer[i], &processors()->tpcNNClusterer[i]); +#endif } } if (GetRecoStepsGPU() & RecoStep::Refit) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 62a4a524d67df..63d56da37595b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -39,6 +39,11 @@ #include #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCNNClusterizerHost.h" +#endif + using namespace o2::gpu; using namespace 
o2::tpc; using namespace o2::tpc::constants; @@ -149,7 +154,8 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS]; memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples)); bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0; + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) + : 0; for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { #ifndef GPUCA_NO_VC @@ -606,6 +612,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } +#ifdef GPUCA_HAS_ONNX + if (GetProcessingSettings().nn.applyNNclusterizer) { + uint32_t maxClusters = -1; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + } + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + AllocateRegisteredMemory(clustererNN.mMemoryId); + } + } +#endif + if (doGPU && mIOPtrs.tpcZS) { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); @@ -854,6 +895,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) uint32_t iSector = iSectorBase + lane; GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + if (doGPU) { SynchronizeStream(lane); } @@ -871,17 +913,77 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) return; } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + if (GetProcessingSettings().nn.applyNNclusterizer) { +#ifdef GPUCA_HAS_ONNX + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } + + float time_clusterizer = 0, time_fill = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + + auto start0 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + + auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, 
clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } + + if (!clustererNN.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + } + } + auto stop1 = std::chrono::high_resolution_clock::now(); + + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; + } + auto start1 = std::chrono::high_resolution_clock::now(); + if (clustererNN.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + } + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + if (clustererNN.nnClusterizerVerbosity < 3) { + int acceptedClusters = 0; + for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + acceptedClusters += clustererNN.outputDataClass[i]; + } + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + } +#else + GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); +#endif + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); + } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { SynchronizeStream(lane); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels } + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h index b4a4752b0f932..cdd489e0ef938 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h @@ -45,6 +45,7 @@ struct ChargePos { GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; } GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; } GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; } + GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } private: // Maps the position of a pad given as row and index in that row to a unique diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index f0c6ac47f3c8a..90d977372b201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -41,6 +41,17 @@ class ClusterAccumulator GPUd() tpccf::Charge updateInner(PackedCharge, tpccf::Delta2); GPUd() tpccf::Charge updateOuter(PackedCharge, tpccf::Delta2); + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInPad, uint8_t splitInTime) + { + mQtot = qtot; + mPadMean = padMean; + mPadSigma = padSigma; + mTimeMean = timeMean; + mTimeSigma = timeSigma; + mSplitInPad = splitInPad; + mSplitInTime = splitInTime; + } + GPUd() void finalize(const ChargePos&, const tpccf::Charge, tpccf::TPCTime); GPUd() bool toNative(const ChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D&); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx new file mode 100644 index 0000000000000..6a9b6f546ae07 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -0,0 +1,55 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizer.cxx +/// \author Christian Sonnabend + +#include "GPUReconstruction.h" +#include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUTPCNNClusterizer.h" + +using namespace o2::gpu; + +void GPUTPCNNClusterizer::InitializeProcessor() {} + +void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} + +void* GPUTPCNNClusterizer::setIOPointers(void* mem) +{ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); + } + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + return mem; +} + +void GPUTPCNNClusterizer::RegisterMemoryAllocation() +{ + AllocateAndInitializeLate(); + int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; + mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizer::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(mISector % mRec->GetProcessingSettings().nTPCClustererLanes)}); +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h new file mode 100644 index 0000000000000..ea6340dfd48bc --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -0,0 +1,77 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPUTPCNNCLUSTERIZER_H +#define O2_GPUTPCNNCLUSTERIZER_H + +#include "ChargePos.h" +#include "GPUProcessor.h" + +namespace o2::OrtDataType +{ +struct Float16_t; +} + +namespace o2::gpu +{ + +class GPUTPCNNClusterizer : public GPUProcessor +{ + public: + GPUTPCNNClusterizer() = default; + void* setIOPointers(void*); + void RegisterMemoryAllocation(); + void InitializeProcessor(); + void SetMaxData(const GPUTrackingInOutPointers&); + + // Neural network clusterization + + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCfRegression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerTotalClusters = 1; + int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; + int nnClusterizerDumpDigits = 0; + int nnClusterizerApplyCfDeconvolution = 0; + int nnClusterizerModelClassNumOutputNodes = -1; + int nnClusterizerModelReg1NumOutputNodes = -1; + int nnClusterizerModelReg2NumOutputNodes = -1; + int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int mISector = -1; + + // Memory allocation for neural network + uint class2_elements = 0; + float* inputData32 = nullptr; + OrtDataType::Float16_t* inputData16 = nullptr; + float* outputDataClass = nullptr; + float* modelProbabilities = nullptr; + float* outputDataReg1 = nullptr; + float* outputDataReg2 = nullptr; + + ChargePos* peakPositions = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx + float* centralCharges = nullptr; + int16_t mMemoryId = -1; +}; // class GPUTPCNNClusterizer + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx new file mode 100644 index 0000000000000..5002c63524020 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -0,0 +1,65 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerHost.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerHost.h" +#include "GPUTPCNNClusterizer.h" +#include "GPUSettings.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +using namespace o2::gpu; + +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + OrtOptions = { + {"model-path", settings.nnClassificationPath}, + {"device", settings.nnInferenceDevice}, + {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, + {"dtype", settings.nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, + {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, + {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, + {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + + model_class.init(OrtOptions); + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + + reg_model_paths = splitString(settings.nnRegressionPath, ":"); + + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["model-path"] = reg_model_paths[1]; + model_reg_2.init(OrtOptions); + clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +{ + if (dtype == 0) { + model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + } else { + model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h new file mode 100644 index 0000000000000..7efa0edecb893 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -0,0 +1,68 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerHost.h +/// \author Christian Sonnabend + +#ifndef O2_GPUTPCNNCLUSTERIZERHOST_H +#define O2_GPUTPCNNCLUSTERIZERHOST_H + +#include +#include +#include +#include "ML/OrtInterface.h" + +using namespace o2::ml; + +namespace o2::OrtDataType +{ +struct Float16_t; +} + +namespace o2::gpu +{ + +class GPUTPCNNClusterizer; +struct GPUSettingsProcessingNNclusterizer; + +class GPUTPCNNClusterizerHost +{ + public: + GPUTPCNNClusterizerHost() = default; + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + + std::unordered_map OrtOptions; + o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector reg_model_paths; + + private: + // Avoid including CommonUtils/StringUtils.h + std::vector splitString(const std::string& input, const std::string& delimiter) + { + std::vector tokens; + std::size_t pos = 0; + std::size_t found; + + while ((found = input.find(delimiter, pos)) != std::string::npos) { + tokens.push_back(input.substr(pos, found - pos)); + pos = found + delimiter.length(); + } + tokens.push_back(input.substr(pos)); + + return tokens; + } +}; // class GPUTPCNNClusterizerHost + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx new file mode 100644 index 0000000000000..25cd2497fbf62 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -0,0 +1,386 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizerKernels.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCCFClusterizer.h" + +using namespace o2::gpu; +using namespace o2::gpu::tpccf; + +#include "CfConsts.h" +#include "CfUtils.h" +#include "ClusterAccumulator.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +#if !defined(GPUCA_GPUCODE) +#include "GPUHostDataTypes.h" +#include "MCLabelAccumulator.h" +#endif + +#ifdef GPUCA_GPUCODE +#include "GPUTPCCFClusterizer.inc" +#endif + +// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + if (clustererNN.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + return; + } + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +// The following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} + +// Filling the input data for the neural network where there is no boundary +GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + + uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + float central_charge = static_cast(chargeMap[peak].unpack()); + + clustererNN.peakPositions[glo_idx] = peak; + clustererNN.centralCharges[glo_idx] = central_charge; + clustererNN.outputDataClass[glo_idx + batchStart] = -1; + + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); +#ifndef GPUCA_GPUCODE + GPUCA_UNROLL(U(), U()); +#endif + for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) { + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r, clusterer.Param().tpcGeometry); + for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p, time + t); + if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering chosen for short-circuit optimization + clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge); + } else { + clustererNN.inputData32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + // Fill boundary positions with the fill value so that no input entries are left uninitialized + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData32[write_idx] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue); + } + } + write_idx++; + } + } + } + if (clustererNN.nnClusterizerAddIndexData) { + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / clusterer.Param().tpcGeometry.NPads(row)); + } else { + clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData32[write_idx + 1] = row / 152.f; + clustererNN.inputData32[write_idx + 2] = static_cast<float>(pad) / clusterer.Param().tpcGeometry.NPads(row); + } + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] == 1) { + + ClusterAccumulator pc; + + // Publishing logic is taken from default clusterizer + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + clustererNN.outputDataReg1[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + clustererNN.outputDataReg1[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] > 0) { + + ClusterAccumulator pc; + + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + // Cluster 1 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + clustererNN.outputDataReg2[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + clustererNN.outputDataReg2[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + + // Cluster 2 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + clustererNN.outputDataReg2[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + clustererNN.outputDataReg2[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = 
GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h new file mode 100644 index 0000000000000..c7bd18115d61f --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -0,0 +1,77 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizerKernels.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" +#include "GPUTPCNNClusterizer.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace o2::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizerKernels : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels); + struct GPUSharedMemory { + // Regular cluster finder + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + enum K : int32_t { + runCfClusterizer = 0, + fillInputNN = 1, + determineClass1Labels = 2, + determineClass2Labels = 3, + publishClass1Regression = 4, + publishClass2Regression = 5, + }; + + template <int32_t iKernel = defaultKernel, typename... Args> + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + + private: + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + + static GPUd() int padOffset(int, int, const GPUTPCGeometry&); + static GPUd() int rowOffset(int, int); + static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); +}; + +} // namespace o2::gpu 
+ +#endif diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 4b7aab75519fa..ad348a84264f0 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -24,6 +24,9 @@ o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizati o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizerKernels.cxx) +endif() o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) @@ -111,7 +114,15 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) -o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +endif() +o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems)
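For orientation, the arithmetic behind the flattened NN input buffer written by fillInputData can be summarized with the following stand-alone sketch (not part of the patch; the helper names elementSize/windowOffset and the half-sizes R = P = T = 3 are illustrative assumptions only). Each peak occupies one block of nnClusterizerElementSize consecutive entries, starting at write_idx = glo_idx * nnClusterizerElementSize and filled in row -> pad -> time order, optionally followed by three index features (sector / 36, row / 152, pad / NPads(row)):

// Host-side illustration of the per-peak input layout used by fillInputData.
// Assumptions: R/P/T stand in for nnClusterizerSizeInputRow/Pad/Time and
// nnClusterizerElementSize equals the window volume plus the optional index features.
// The real kernel additionally normalizes charges by the central charge, applies
// padOffset for the IROC/OROC width difference, and writes a fill value at boundaries.
#include <cstdio>

// Number of input entries written per peak: a (2R+1) x (2P+1) x (2T+1) charge window,
// plus (optionally) the three index features.
constexpr int elementSize(int R, int P, int T, bool addIndexData)
{
  return (2 * R + 1) * (2 * P + 1) * (2 * T + 1) + (addIndexData ? 3 : 0);
}

// Flattened offset of neighbour (r, p, t) inside one peak's block, reproducing the
// sequential write_idx++ of the row -> pad -> time loop nest (for pad_offset == 0).
constexpr int windowOffset(int r, int p, int t, int R, int P, int T)
{
  return ((r + R) * (2 * P + 1) + (p + P)) * (2 * T + 1) + (t + T);
}

int main()
{
  const int R = 3, P = 3, T = 3; // hypothetical half-sizes
  std::printf("entries per peak: %d\n", elementSize(R, P, T, true));                  // 7*7*7 + 3 = 346
  std::printf("offset of the central charge: %d\n", windowOffset(0, 0, 0, R, P, T));  // 171
  return 0;
}

Note that write_idx is built from the batch-local glo_idx, so the NN input buffer is reused per batch, whereas outputDataClass is indexed with glo_idx + batchStart, i.e. per full sector.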