From d4dc46e6465e47ce8d8b3b4e74b35479638d17e2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 16 May 2024 09:32:45 +0200 Subject: [PATCH 01/77] Copying kernels to implement NN clusterizer --- GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTracking.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 19 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 271 ++++++++++++++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 76 +++++ 5 files changed, 364 insertions(+), 6 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index bd1cd9859cbd2..68615f47d05db 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -722,7 +722,7 @@ int GPUChainTracking::RunChain() return 1; } } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) { - if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { + if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable return 1; } } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 89f2ecd10f65f..032ad0524ccff 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; } // Processing functions - int RunTPCClusterizer(bool synchronizeOutput = true); + int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false); int ForwardTPCDigits(); int RunTPCTrackingSlices(); int RunTPCTrackingMerger(bool synchronizeOutput = true); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 29bbf34b46135..7b2c5539439be 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) #endif // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU) -int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) +int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer) { if (param().rec.fwdTPCDigitsAsClusters) { return ForwardTPCDigits(); @@ -835,8 +835,14 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + if(!applyNNclusterizer){ + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + } else { + // FIXME: This needs to be removed when I actually apply the NN! 
For now its onyl to make the code work + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + } DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile); RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane); @@ -870,7 +876,12 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + if(!applyNNclusterizer){ + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } else { + // FIXME: Here I need to apply the neural network + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } } if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int)clusterer.mPmemory->counters.nPositions, (int)clusterer.mPmemory->counters.nPeaks, (int)clusterer.mPmemory->counters.nClusters); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx new file mode 100644 index 0000000000000..3097d3adecb3d --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -0,0 +1,271 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizer.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizer.h" + +#include "CfConsts.h" +#include "CfUtils.h" +#include "ClusterAccumulator.h" +#if !defined(GPUCA_GPUCODE) +#include "GPUHostDataTypes.h" +#include "MCLabelAccumulator.h" +#endif + +using namespace GPUCA_NAMESPACE::gpu; +using namespace GPUCA_NAMESPACE::gpu::tpccf; + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +{ + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY( + MCLabelAccumulator labelAcc(clusterer)); + + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + + GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +} + +GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint clusternum, + uint maxClusterPerRow, + uint* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint* clusterPosInRow) +{ + uint idx = get_global_id(0); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. + ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + Charge charge = chargeMap[pos].unpack(); + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(pos, charge)); + + buildCluster( + calib, + chargeMap, + pos, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (idx >= clusternum) { + return; + } + if (fragment.isOverlap(pos.time())) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + pos.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; + } + + CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); +} + +GPUdii() void GPUTPCNNClusterizer::updateClusterInner( + const GPUSettingsRec& calib, + ushort lid, + ushort N, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc, + uchar* innerAboveThreshold) +{ + uchar aboveThreshold = 0; + + GPUCA_UNROLL(U(), U()) + for (ushort i = 0; i < N; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + + PackedCharge p = buf[N * lid + i]; + + Charge q = cluster->updateInner(p, d); + + CPU_ONLY( + labelAcc->collect(pos.delta(d), q)); + + aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i); + } + + innerAboveThreshold[lid] = aboveThreshold; + + GPUbarrier(); +} + +GPUdii() void GPUTPCNNClusterizer::updateClusterOuter( + ushort lid, + ushort N, + ushort M, + ushort offset, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc) +{ + GPUCA_UNROLL(U(), U()) + for (ushort i = offset; i < M + offset; i++) { + PackedCharge p = buf[N * lid + i]; + + Delta2 d = cfconsts::OuterNeighbors[i]; + + Charge q = cluster->updateOuter(p, d); + static_cast(q); // Avoid unused varible warning on GPU. 
+ + CPU_ONLY( + labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCNNClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uchar* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + ushort ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h new file mode 100644 index 0000000000000..f2b92c5f50d40 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -0,0 +1,76 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
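For orientation, the per-row output indexing used by sortIntoBuckets above can be summarized in a short CPU-side sketch. The names and the float payload are placeholders for illustration only; the kernel itself uses CAMath::AtomicAdd on the per-row counter so that concurrent work items obtain distinct slots, and raises GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW when a row overflows.

#include <cstdint>

// Hypothetical single-threaded analogue of sortIntoBuckets(): one counter per pad row and one
// fixed-size, row-major slice of the flat output array per row; clusters beyond the limit are dropped.
inline uint32_t sortIntoBucketsSketch(float cluster, uint32_t row, uint32_t maxElemsPerBucket,
                                      uint32_t* elemsInBucket, float* buckets)
{
  uint32_t index = elemsInBucket[row]++;                // CAMath::AtomicAdd(&elemsInBucket[row], 1u) in the kernel
  if (index < maxElemsPerBucket) {
    buckets[maxElemsPerBucket * row + index] = cluster; // slice [row * maxElemsPerBucket, (row + 1) * maxElemsPerBucket)
  } else {
    elemsInBucket[row] = maxElemsPerBucket;             // clamp the counter; the kernel additionally raises the overflow error
  }
  return index;
}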
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_CLUSTERIZER_H +#define O2_GPU_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace GPUCA_NAMESPACE::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizer : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + struct GPUSharedMemory { + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + +#ifdef GPUCA_HAVE_O2HEADERS + typedef GPUTPCClusterFinder processorType; + GPUhdi() static processorType* Processor(GPUConstantMem& processors) + { + return processors.tpcClusterer; + } +#endif + + GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + template + GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + + static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + + private: + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + + static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); +}; + +} // namespace GPUCA_NAMESPACE::gpu + +#endif From 05831efed4629001198fbc3b053c8bb41b2e13f7 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 May 2024 10:16:18 +0200 Subject: [PATCH 02/77] First version of clusterizer in GPU code --- Common/ML/CMakeLists.txt | 16 ++ Common/ML/include/ML/onnx_interface.h | 88 +++++++++ Common/ML/src/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/CMakeLists.txt | 3 + .../Global/GPUChainTrackingClusterizer.cxx | 5 +- GPU/GPUTracking/ML/onnx_interface.cxx | 184 ++++++++++++++++++ GPU/GPUTracking/ML/onnx_interface.h | 88 +++++++++ .../TPCClusterFinder/ClusterAccumulator.h | 17 ++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 133 +++++++++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +++ 10 files changed, 743 insertions(+), 1 deletion(-) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx create mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..60a07041da2e0 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. +# +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +o2_add_library(ML + SOURCES src/onnx_interface.cxx + TARGETVARNAME targetName + PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime +) \ No newline at end of file diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h new file mode 100644 index 0000000000000..506311c067351 --- /dev/null +++ b/Common/ML/include/ML/onnx_interface.h @@ -0,0 +1,88 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// \file model.h +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class for ONNX models +/// + +#ifndef GPU_ML_ONNX_INTERFACE_H +#define GPU_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OnnxModel +{ + + public: + OnnxModel() = default; + ~OnnxModel() = default; + + // Inferencing + void init(std::string, bool = false, int = 0); + // float* inference(std::vector, int = 0); + // float* inference(std::vector, int = 0); + template float* inference(T input, unsigned int size); + template std::vector inference_vector(T input, unsigned int size); + + // Reset session + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + + // Getters & Setters + Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post + std::shared_ptr getSession() { return mSession; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + void setActiveThreads(int); + + private: + // Environment variables for the ONNX runtime + std::shared_ptr mEnv = nullptr; + std::shared_ptr mSession = nullptr; + Ort::SessionOptions sessionOptions; + + // Input & Output specifications of the loaded network + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + // Environment settings + std::string modelPath; + int activeThreads = 0; + + // Internal function for printing the shape of tensors + std::string printShape(const std::vector&); +}; + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE + +#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx new file mode 100644 index 0000000000000..e7c952d6b8cdc --- /dev/null +++ b/Common/ML/src/onnx_interface.cxx @@ -0,0 +1,184 @@ +// Copyright 2019-2020 CERN and copyright holders of 
ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// \file model.cxx +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class with functions for ONNX model applications +/// + +// ONNX includes +#include "ML/onnx_interface.h" + +namespace o2 +{ + +namespace ml +{ + +std::string OnnxModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +{ + + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + modelPath = localPath; + activeThreads = threads; + + /// Enableing optimizations + if(threads != 0){ + // sessionOptions.SetInterOpNumThreads(1); + if(threads == 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + else{ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sessionOptions.SetIntraOpNumThreads(threads); + } + } + if (enableOptimizations) { + // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + // uint32_t coreml_flags = 0; + // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); + } + + mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + + LOG(info) << "--- Model initialized! 
---"; +} + +// float* OnnxModel::inference(std::vector input, int device_id) +// { + +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); + +// try { +// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +// float* OnnxModel::inference(std::vector input, int device_id) +// { +// +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); +// +// int64_t size = input.size(); +// assert(size % mInputShapes[0][1] == 0); +// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; +// std::vector inputTensors; +// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); +// try { +// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +template +float* OnnxModel::inference(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return nullptr; +} + +template +std::vector OnnxModel::inference_vector(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + // std::vector outputValues; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; + // for(int s = 0; s < size; s++){ + // for(int o = 0; o < mOutputShapes[0][1]; o++){ + // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); + // } + // } + return outputVector; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return std::vector{}; +} + +void OnnxModel::setActiveThreads(int threads) +{ + activeThreads = threads; +} + +template float* OnnxModel::inference(std::vector, unsigned int); +template std::vector OnnxModel::inference_vector(std::vector, unsigned int); + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/CMakeLists.txt 
b/GPU/GPUTracking/CMakeLists.txt index 6266d4962b88e..63abf760bf87a 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -62,6 +62,7 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx + ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -195,6 +196,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) TPCClusterFinder/GPUTPCCFPeakFinder.cxx TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCCFDeconvolution.cxx TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx TPCClusterFinder/GPUTPCCFDecodeZS.cxx @@ -306,6 +308,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation + ONNXRuntime::ONNXRuntime PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7b2c5539439be..cca00ed3a1d02 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,6 +18,7 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" +#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -880,7 +881,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { // FIXME: Here I need to apply the neural network - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; + nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx new file mode 100644 index 0000000000000..e7c952d6b8cdc --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -0,0 +1,184 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
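The OnnxModel wrapper from Common/ML above is duplicated below for the GPUTracking build. A hedged usage sketch of its interface follows; the model path, candidate count and feature count are placeholders, not values taken from this patch:

#include "ML/onnx_interface.h"

#include <string>
#include <vector>

// Sketch only: run one batched inference with the OnnxModel wrapper shown above.
std::vector<float> exampleInference(const std::string& modelPath, unsigned int nCandidates, unsigned int nFeaturesPerCandidate)
{
  o2::ml::OnnxModel model;
  model.init(modelPath, /*enableOptimizations=*/true, /*threads=*/1); // sequential execution, graph optimizations enabled
  std::vector<float> input(nCandidates * nFeaturesPerCandidate, 0.f); // one flattened feature vector per candidate
  // Returns nCandidates * outputWidth values, where outputWidth is getNumOutputNodes()[0][1].
  return model.inference_vector(input, nCandidates);
}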
+ +/// +/// \file model.cxx +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class with functions for ONNX model applications +/// + +// ONNX includes +#include "ML/onnx_interface.h" + +namespace o2 +{ + +namespace ml +{ + +std::string OnnxModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +{ + + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + modelPath = localPath; + activeThreads = threads; + + /// Enableing optimizations + if(threads != 0){ + // sessionOptions.SetInterOpNumThreads(1); + if(threads == 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + else{ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + sessionOptions.SetIntraOpNumThreads(threads); + } + } + if (enableOptimizations) { + // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); + // uint32_t coreml_flags = 0; + // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); + } + + mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + + LOG(info) << "--- Model initialized! 
---"; +} + +// float* OnnxModel::inference(std::vector input, int device_id) +// { + +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); + +// try { +// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +// float* OnnxModel::inference(std::vector input, int device_id) +// { +// +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); +// +// int64_t size = input.size(); +// assert(size % mInputShapes[0][1] == 0); +// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; +// std::vector inputTensors; +// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); +// try { +// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); +// float* outputValues = outputTensors[0].GetTensorMutableData(); +// return outputValues; +// } catch (const Ort::Exception& exception) { +// LOG(error) << "Error running model inference: " << exception.what(); +// } +// return nullptr; +// } + +template +float* OnnxModel::inference(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return nullptr; +} + +template +std::vector OnnxModel::inference_vector(T input, unsigned int size) +{ + + std::vector inputShape = mInputShapes[0]; + inputShape[0] = size; + std::vector inputTensors; + // std::vector outputValues; + size_t mem_size = 1; + for(auto elem : inputShape){ + mem_size*=elem; + } + inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; + try { + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + float* outputValues = outputTensors[0].GetTensorMutableData(); + std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; + // for(int s = 0; s < size; s++){ + // for(int o = 0; o < mOutputShapes[0][1]; o++){ + // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); + // } + // } + return outputVector; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } + return std::vector{}; +} + +void OnnxModel::setActiveThreads(int threads) +{ + activeThreads = threads; +} + +template float* OnnxModel::inference(std::vector, unsigned int); +template std::vector OnnxModel::inference_vector(std::vector, unsigned int); + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h 
b/GPU/GPUTracking/ML/onnx_interface.h new file mode 100644 index 0000000000000..506311c067351 --- /dev/null +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -0,0 +1,88 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// +/// \file model.h +/// +/// \author Christian Sonnabend +/// +/// \brief A general-purpose class for ONNX models +/// + +#ifndef GPU_ML_ONNX_INTERFACE_H +#define GPU_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OnnxModel +{ + + public: + OnnxModel() = default; + ~OnnxModel() = default; + + // Inferencing + void init(std::string, bool = false, int = 0); + // float* inference(std::vector, int = 0); + // float* inference(std::vector, int = 0); + template float* inference(T input, unsigned int size); + template std::vector inference_vector(T input, unsigned int size); + + // Reset session + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + + // Getters & Setters + Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post + std::shared_ptr getSession() { return mSession; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + void setActiveThreads(int); + + private: + // Environment variables for the ONNX runtime + std::shared_ptr mEnv = nullptr; + std::shared_ptr mSession = nullptr; + Ort::SessionOptions sessionOptions; + + // Input & Output specifications of the loaded network + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + // Environment settings + std::string modelPath; + int activeThreads = 0; + + // Internal function for printing the shape of tensors + std::string printShape(const std::vector&); +}; + +} // namespace gpu + +} // namespace GPUCA_NAMESPACE + +#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 3958f6d3aa137..344a0fae3995f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,6 +43,23 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + mQtot = qtot; + mPadMean = padMean; + mPadSigma = padSigma; + mTimeMean = timeMean; + mTimeSigma = timeSigma; + mSplitInTime = splitInTime; + mSplitInPad = splitInPad; + } + GPUd() void setQtot(float qtot) { mQtot = qtot; } + GPUd() void setPadMean(float padMean) { mPadMean = padMean; } + GPUd() void 
setPadSigma(float padSigma) { mPadSigma = padSigma; } + GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; } + GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; } + GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; } + GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; } + private: float mQtot = 0; float mPadMean = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 3097d3adecb3d..6c64c54ca5193 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -37,6 +37,139 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } +void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +{ + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY( + MCLabelAccumulator labelAcc(clusterer)); + + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + + OnnxModel model_class, model_reg; + std::string path_class = "", path_reg = ""; + + model_class.init(path_class, 1, 0); + model_reg.init(path_reg, 1, 0); + + GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1); +} + +int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) +{ + return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2); +} + +// --------------------------------- +bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) +{ + if (row < 0 || pad < 0) { + return true; + } else if (row <= 62) { + if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + return true; + } else { + return false; + } + } else if (row <= 62 + global_shift) { + return true; + } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { + if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + return true; + } else { + return false; + } + } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { + return true; + } else { + return false; + } +} + +void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg, + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint clusternum, + uint maxClusterPerRow, + uint* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint* clusterPosInRow + int 
in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ + + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + std::vector input_data(clusterer.mPmemory->counters.nClusters * (2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1)); + float classification_threshold = class_threshold; + if(sigmoid_transform){ + classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold)); + } + + for(float cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); + for(int r = -in_row; r <= in_row; r++){ + for(int p = -in_pad; p <= in_pad; p++){ + for(int t = -in_time; t <= in_time; t++){ + int offset = padOffset(row, row + r); + if(isBoundary(row + r, pad + p + offset)){ + continue; + } else { + unsigned int idx = glo_idx + (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); + ChargePos tmp_pos(row + r, pad + p + offset, time + t); + input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge); + } + } + } + } + } + std::vector out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); + std::vector out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); + int num_outputs = model_reg.getNumOutputNodes()[0][1]; + + for(int cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ + if(out_class > classification_threshold){ + int idx = cls * num_outputs; + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + ClusterAccumulator pc; + pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0); + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(pos, chargeMap[peak].unpack(), myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + pos.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; + } + + CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); + } + } + +} + + + GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, const CfFragment& fragment, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index f2b92c5f50d40..56ffcbc842223 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -21,6 +21,9 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" +#include "ML/onnx_interface.h" + +using namespace o2::ml; namespace o2::tpc { @@ -61,7 +64,30 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + void exec(int, 
int, int, int, GPUSharedMemory&, processorType&, char); + int padOffset(int); + bool isBoundary(int, int, int); + static void nn_clusterizer(OnnxModel, OnnxModel, + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*, + int = 3, int = 3, int = 3, bool = true); + private: + // --------------------------------- + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); From 3f6c934987d68cce26ca1c63c07dc2038be3850b Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 29 May 2024 11:38:33 +0200 Subject: [PATCH 03/77] Adding a compiling and running version with single-threaded ONNX model executions. Clusters are not getting published yet (FIXME) --- GPU/GPUTracking/CMakeLists.txt | 2 +- .../Definitions/GPUDefGPUParameters.h | 6 + .../Global/GPUChainTrackingClusterizer.cxx | 16 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 5 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 161 +++++++++++------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 22 +-- GPU/GPUTracking/kernels.cmake | 1 + 7 files changed, 133 insertions(+), 80 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 63abf760bf87a..8b3a37894810c 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -195,8 +195,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx TPCClusterFinder/GPUTPCCFPeakFinder.cxx TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx - TPCClusterFinder/GPUTPCCFClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx + TPCClusterFinder/GPUTPCCFClusterizer.cxx TPCClusterFinder/GPUTPCCFDeconvolution.cxx TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx TPCClusterFinder/GPUTPCCFDecodeZS.cxx diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 4bb8303ee9a96..d8eba2a9ad384 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -79,6 +79,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -143,6 +144,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 512 
+ #define GPUCA_LB_GPUTPCNNClusterizer 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -207,6 +209,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 448 #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizer 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -475,6 +478,9 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizer + #define GPUCA_LB_GPUTPCNNClusterizer 512 + #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 8a6a899f35a1a..26878e6111bd5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -18,7 +18,6 @@ #include "GPUO2DataTypes.h" #include "GPUMemorySizeScalers.h" #include "GPUTrackingInputProvider.h" -#include "GPUTPCNNClusterizer.h" #include #ifdef GPUCA_O2_LIB @@ -875,7 +874,15 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if(doGPU){ + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + } else { + std::string path_class = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; + clusterer.model_class.init(path_class, 1, 1); + clusterer.model_reg.init(path_reg, 1, 1); + + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { @@ -886,8 +893,9 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus } else { // FIXME: Here I need to apply the neural network // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - GPUCA_NAMESPACE::gpu::GPUTPCNNClusterizer nn_clus; - nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + // GPUTPCNNClusterizer nn_clus; + // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ca89053797a47..ae40ff780b25a 
100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,6 +19,9 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" +#include "ML/onnx_interface.h" + +using namespace o2::ml; namespace o2 { @@ -141,6 +144,8 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + OnnxModel model_class, model_reg; + #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6c64c54ca5193..7c19802825eb6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,10 +34,14 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + + // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// + // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY( @@ -45,27 +49,37 @@ void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThrea tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - OnnxModel model_class, model_reg; std::string path_class = "", path_reg = ""; - model_class.init(path_class, 1, 0); - model_reg.init(path_reg, 1, 0); + clusterer.model_class.init(path_class, 1, 0); + clusterer.model_reg.init(path_reg, 1, 0); - GPUTPCNNClusterizer::nn_clusterizer(model_class, model_reg, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, 1, 0.16, 1); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2); } // --------------------------------- bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) { + std::vector pad_row_max{ + 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 + }; if (row < 0 || pad < 0) { return true; } else if (row <= 62) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // return true; + // } else { + // return false; + // } + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -73,7 +87,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } else if (row <= 62 + global_shift) { return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > 
(pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + // return true; + //} else { + // return false; + //} + if (pad < 0 || pad > pad_row_max[row]) { return true; } else { return false; @@ -85,7 +104,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) } } -void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_reg, +GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, @@ -97,73 +116,93 @@ void GPUTPCNNClusterizer::nn_clusterizer(OnnxModel model_class, OnnxModel model_ uint maxClusterPerRow, uint* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow + uint* clusterPosInRow, int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - std::vector input_data(clusterer.mPmemory->counters.nClusters * (2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1)); + std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; if(sigmoid_transform){ classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold)); } - for(float cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); - for(int r = -in_row; r <= in_row; r++){ - for(int p = -in_pad; p <= in_pad; p++){ - for(int t = -in_time; t <= in_time; t++){ - int offset = padOffset(row, row + r); - if(isBoundary(row + r, pad + p + offset)){ - continue; - } else { - unsigned int idx = glo_idx + (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); - ChargePos tmp_pos(row + r, pad + p + offset, time + t); - input_data[idx] = (chargeMap[tmp_pos].unpack() / central_charge); - } + uint idx = get_global_id(0); + uint cls = CAMath::Min(idx, clusternum - 1); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. 
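// The threshold handling a few lines above converts nnClassThreshold into logit space
// once, instead of applying a sigmoid to every network output. The following standalone
// sketch is illustrative only (not part of the patch; the helper names are made up) and
// shows why both formulations accept exactly the same clusters, which is what the
// sigmoid_transform / nnSigmoidTrafoThreshold switch relies on.

#include <cassert>
#include <cmath>

static float logit(float p) { return std::log(p / (1.f - p)); }      // inverse sigmoid
static float sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

int main()
{
  const float nnClassThreshold = 0.16f; // value used in the patch
  const float rawOutput = -1.2f;        // network output before any sigmoid
  // Variant used by nn_clusterizer: transform the threshold once per configuration.
  const bool acceptA = rawOutput > logit(nnClassThreshold);
  // Equivalent variant: transform every output, costing one exp per candidate.
  const bool acceptB = sigmoid(rawOutput) > nnClassThreshold;
  // The sigmoid is strictly monotonic, so both decisions always agree.
  assert(acceptA == acceptB);
  return 0;
}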
+ + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + CPU_ONLY(labelAcc->collect(peak, central_charge)); + // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); + unsigned int write_idx = 0; + for(int r = -in_row; r <= in_row; r++){ + for(int p = -in_pad; p <= in_pad; p++){ + for(int t = -in_time; t <= in_time; t++){ + int offset = GPUTPCNNClusterizer::padOffset(row, row + r); + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + continue; + } else { + // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); + ChargePos tmp_pos(row + r, pad + p + offset, time + t); + input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge); + write_idx++; } } + if(idx == 100){ + LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + } + } + } + if(add_index_data){ + input_data[input_data.size()-3] = 1; + input_data[input_data.size()-2] = (float)peak.row() / 152.f; + input_data[input_data.size()-1] = (float)peak.pad() / 138.f; + if(idx == 100){ + LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; } } - std::vector out_class = model_class.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - std::vector out_reg = model_reg.inference_vector(input_data, clusterer.mPmemory->counters.nClusters); - int num_outputs = model_reg.getNumOutputNodes()[0][1]; - - for(int cls = 0; cls < clusterer.mPmemory->counters.nClusters; cls++){ - if(out_class > classification_threshold){ - int idx = cls * num_outputs; - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[idx + 4], peak.pad() + out_reg[idx], out_reg[idx + 2], peak.time() + out_reg[idx + 1], out_reg[idx + 3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, chargeMap[peak].unpack(), myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; + std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); + std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); + int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; + + if(idx == 100){ + LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } + + if(out_class[0] > classification_threshold){ + ClusterAccumulator pc; + pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, 
clusterer.Param()); + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; } + return; + } - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; } + + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 56ffcbc842223..905e6f860a90f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -12,8 +12,8 @@ /// \file GPUTPCNNClusterizer.h /// \author Christian Sonnabend -#ifndef O2_GPU_CLUSTERIZER_H -#define O2_GPU_CLUSTERIZER_H +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H #include "clusterFinderDefs.h" #include "GPUGeneralKernels.h" @@ -21,9 +21,6 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" -#include "ML/onnx_interface.h" - -using namespace o2::ml; namespace o2::tpc { @@ -39,7 +36,7 @@ class MCLabelAccumulator; class GPUTPCNNClusterizer : public GPUKernelTemplate { public: - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFClusterizer); + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; @@ -64,10 +61,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); - void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - int padOffset(int); - bool isBoundary(int, int, int); - static void nn_clusterizer(OnnxModel, OnnxModel, + static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); + static int padOffset(int, int); + static bool isBoundary(int, int, int); + static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, @@ -80,13 +77,10 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); private: // --------------------------------- - std::vector pad_row_max{ - 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 - }; static GPUd() void 
updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index d4f5ca93e9def..b0270511c2249 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -117,6 +117,7 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB single int iBuf int stage) From 8ba6805ebd889bebf4b11972170570bdd99892cf Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 29 May 2024 21:11:04 +0200 Subject: [PATCH 04/77] Clusters now working by a hack --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 7c19802825eb6..afee680bc0ceb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); @@ -142,7 +142,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i for(int p = -in_pad; p <= in_pad; p++){ for(int t = -in_time; t <= in_time; t++){ int offset = GPUTPCNNClusterizer::padOffset(row, row + r); - if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p + offset, in_row)){ + if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){ continue; } else { // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); @@ -151,18 +151,18 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i write_idx++; } } - if(idx == 100){ - LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; + // } } } if(add_index_data){ input_data[input_data.size()-3] = 1; input_data[input_data.size()-2] = (float)peak.row() / 152.f; input_data[input_data.size()-1] = (float)peak.pad() / 138.f; - if(idx == 100){ - LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - } + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } } std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); @@ -170,16 +170,17 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; if(idx == 100){ - LOG(info) << "Classification model: " << out_class[0]; + LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } if(out_class[0] > classification_threshold){ ClusterAccumulator pc; - pc.setFull(chargeMap[peak].unpack() * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); + pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, chargeMap[peak].unpack(), myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); if (rejectCluster) { + LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; } From 6ec3c46d37e82b2f37f648ff3750d14f8d72f5b1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 6 Jun 2024 17:49:38 +0200 Subject: [PATCH 05/77] Working implementation of settings via GPUSettings.h and --configKeyValues "GPU_proc.[setting]=...;..." --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 9 ++++++ GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTracking.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++++++++-------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 7 +++++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 6 files changed, 35 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 777ea1e70b0d8..b3f38c6ab81d2 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -280,6 +280,15 @@ AddOption(tpcDownscaledEdx, unsigned char, 0, "", 0, "If != 0, downscale dEdx pr AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow") AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") +AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") +AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") +AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path") +AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.") +AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. 
This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input") +AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2") +AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2") +AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2") AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 68615f47d05db..7a202c852b895 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -722,7 +722,7 @@ int GPUChainTracking::RunChain() return 1; } } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) { - if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false, true)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable + if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable return 1; } } diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h index 032ad0524ccff..89f2ecd10f65f 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.h +++ b/GPU/GPUTracking/Global/GPUChainTracking.h @@ -161,7 +161,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega void SetQAFromForeignChain(GPUChainTracking* chain) { mQAFromForeignChain = chain; } // Processing functions - int RunTPCClusterizer(bool synchronizeOutput = true, bool applyNNclusterizer = false); + int RunTPCClusterizer(bool synchronizeOutput = true); int ForwardTPCDigits(); int RunTPCTrackingSlices(); int RunTPCTrackingMerger(bool synchronizeOutput = true); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 26878e6111bd5..6ed3406646abb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -566,7 +566,7 @@ int GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) #endif // TODO: Clusterizer not working with OCL1 (Clusterizer on CPU, Tracking on GPU) -int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclusterizer) +int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) { if (param().rec.fwdTPCDigitsAsClusters) { return ForwardTPCDigits(); @@ -837,7 +837,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - if(!applyNNclusterizer){ + if(!GetProcessingSettings().applyNNclusterizer){ runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } else { @@ -877,25 +877,29 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput, bool applyNNclus if(doGPU){ runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); } else { - std::string path_class = 
"/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/classification/3D_FCNN_1cls_03_04_2024_10M_FP16_addIndex/network/net_onnx.onnx", path_reg = "/lustre/alice/users/csonnab/PhD/jobs/clusterization/NN/output/normalized_qCenter/o2sim_150324_50Ev_10000QED_PbPb_13t7p/regression/3D_FCNN_1cls_05_04_2024_10M_FP16_addIndex/network/net_onnx.onnx"; - clusterer.model_class.init(path_class, 1, 1); - clusterer.model_reg.init(path_reg, 1, 1); - - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + if(GetProcessingSettings().applyNNclusterizer){ + clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1); + clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1); + clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; + clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; + clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; + clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; + clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; + clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } } if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { SynchronizeStream(lane); } - if(!applyNNclusterizer){ + if(!GetProcessingSettings().applyNNclusterizer){ runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { - // FIXME: Here I need to apply the neural network - // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - // GPUTPCNNClusterizer nn_clus; - // nn_clus.exec({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index ae40ff780b25a..a449eb23ef426 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -144,6 +144,13 @@ class GPUTPCClusterFinder : public GPUProcessor short mZSOffsetId = -1; short mOutputId = -1; + int nnSizeInputRow = 3; + int nnSizeInputPad = 3; + int nnSizeInputTime = 3; + bool nnAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoThreshold = 1; + OnnxModel model_class, model_reg; #ifndef GPUCA_GPUCODE diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index afee680bc0ceb..d2656531c6df1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,7 +34,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // From ab4653a78a470e740478ed719f24bfed0b8fc0cb Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 08:08:01 +0200 Subject: [PATCH 06/77] Modifying the onnx_interface to include the right headers --- Common/ML/include/ML/onnx_interface.h | 6 +++++- GPU/GPUTracking/ML/onnx_interface.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 506311c067351..d3676b7a3f87a 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -21,7 +21,11 @@ #define GPU_ML_ONNX_INTERFACE_H // C++ and system includes -#include +#if __has_include() +#include +#else +#include +#endif #include #include #include From 04084c8fd1ea9be525a7368afe5567112d4549cc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Jun 2024 13:21:09 +0200 Subject: [PATCH 07/77] Adjusting initialization for new ONNXRuntime version --- Common/ML/include/ML/onnx_interface.h | 26 ++++--- Common/ML/src/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.cxx | 98 +++++++++++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 19 ++++-- 4 files changed, 171 insertions(+), 70 deletions(-) diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h index d3676b7a3f87a..fcc02a49996ea 100644 --- a/Common/ML/include/ML/onnx_interface.h +++ b/Common/ML/include/ML/onnx_interface.h @@ -17,8 +17,8 @@ /// \brief A general-purpose class for ONNX models /// -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H +#ifndef COMMON_ML_ONNX_INTERFACE_H +#define COMMON_ML_ONNX_INTERFACE_H // C++ and system includes #if __has_include() @@ -43,10 +43,9 @@ namespace ml class OnnxModel { - public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; + virtual ~OnnxModel() = default; // Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +55,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session 
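// The #if __has_include branches around this point exist because the Ort::Experimental
// wrapper was removed from recent ONNX Runtime releases. The sketch below is not part of
// the patch: it only illustrates the plain C++ API the #else branches target (assuming
// ONNX Runtime >= 1.13; the model path and thread count are placeholders).

#include <onnxruntime_cxx_api.h>
#include <string>
#include <vector>

void standardOrtApiSketch(const std::string& modelPath)
{
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "onnx-model");
  Ort::SessionOptions options;
  options.SetIntraOpNumThreads(1);
  // The non-experimental Session takes a C string, hence modelPath.c_str() in the patch.
  Ort::Session session(env, modelPath.c_str(), options);

  // Node names are no longer returned in bulk; they are fetched one at a time as
  // allocated strings, which is what GetInputNameAllocated/GetOutputNameAllocated provide.
  Ort::AllocatorWithDefaultOptions allocator;
  std::vector<std::string> inputNames;
  for (size_t i = 0; i < session.GetInputCount(); ++i) {
    inputNames.emplace_back(session.GetInputNameAllocated(i, allocator).get());
  }
  std::vector<int64_t> inputShape =
    session.GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape();
  (void)inputShape; // shape[0] is the batch dimension overwritten per inference call
}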
- void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +75,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network @@ -89,4 +97,4 @@ class OnnxModel } // namespace GPUCA_NAMESPACE -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file +#endif // COMMON_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool 
enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, 
tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx index e7c952d6b8cdc..549575600a656 100644 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -43,6 +43,11 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread modelPath = localPath; activeThreads = threads; +#if __has_include() +#else + mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); +#endif + /// Enableing optimizations if(threads != 0){ // sessionOptions.SetInterOpNumThreads(1); @@ -63,12 +68,28 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); + #if __has_include() + mSession = std::make_shared(*mEnv, modelPath, sessionOptions); + mInputNames = mSession->GetInputNames(); + mInputShapes = mSession->GetInputShapes(); + mOutputNames = mSession->GetOutputNames(); + mOutputShapes = mSession->GetOutputShapes(); + #else + mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); + Ort::AllocatorWithDefaultOptions tmpAllocator; + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetInputCount(); ++i) { + mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); + } + for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + #endif LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { @@ -121,7 +142,6 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread template float* OnnxModel::inference(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -129,22 +149,36 @@ float* OnnxModel::inference(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + 
float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return outputValues; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif + // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; return nullptr; } template std::vector OnnxModel::inference_vector(T input, unsigned int size) { - std::vector inputShape = mInputShapes[0]; inputShape[0] = size; std::vector inputTensors; @@ -153,21 +187,29 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) for(auto elem : inputShape){ mem_size*=elem; } +#if __has_include() inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - float* outputValues = outputTensors[0].GetTensorMutableData(); - std::vector outputVector{outputValues, outputValues + size * mOutputShapes[0][1]}; - // for(int s = 0; s < size; s++){ - // for(int o = 0; o < mOutputShapes[0][1]; o++){ - // outputValues.push_back(tmp_output_values[s*(int)mOutputShapes[0][1] + o]); - // } - // } - return outputVector; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#else + std::vector tmpInputs; + std::vector tmpOutputs; + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + try { + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } +#endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index d3676b7a3f87a..5ee2bd716d257 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -45,8 +45,8 @@ class OnnxModel { public: - OnnxModel() = default; - ~OnnxModel() = default; + OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; + virtual ~OnnxModel() = default; 
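// The mMemoryInfo member initialized in the constructor above is required because the
// standard API wraps caller-owned buffers via Ort::Value::CreateTensor, and
// Ort::MemoryInfo cannot be default-constructed. The function below is a condensed,
// illustrative sketch (not the patch's code) of the tensor creation and Run call that
// the non-experimental branch of inference_vector performs; the node names "input" and
// "output" and the shape are placeholders.

#include <onnxruntime_cxx_api.h>
#include <vector>

std::vector<float> runOnceSketch(Ort::Session& session, const Ort::MemoryInfo& memoryInfo,
                                 std::vector<float>& input, int64_t nFeatures)
{
  // Wrap the existing buffer; no copy is made, so `input` must outlive the Run call.
  std::vector<int64_t> shape{1, nFeatures};
  Ort::Value tensor = Ort::Value::CreateTensor<float>(
    memoryInfo, input.data(), input.size(), shape.data(), shape.size());

  // The standard Run interface takes raw arrays of C-string node names.
  const char* inputNames[] = {"input"};   // placeholder node name
  const char* outputNames[] = {"output"}; // placeholder node name
  auto outputs = session.Run(Ort::RunOptions{nullptr},
                             inputNames, &tensor, 1,
                             outputNames, 1);

  float* data = outputs[0].GetTensorMutableData<float>();
  const size_t n = outputs[0].GetTensorTypeAndShapeInfo().GetElementCount();
  return std::vector<float>(data, data + n);
}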
// Inferencing void init(std::string, bool = false, int = 0); @@ -56,11 +56,19 @@ class OnnxModel template std::vector inference_vector(T input, unsigned int size); // Reset session - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); } + #if __has_include() + void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; + #else + void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; + #endif // Getters & Setters Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - std::shared_ptr getSession() { return mSession; } + #if __has_include() + std::shared_ptr getSession() { return mSession; } + #else + std::shared_ptr getSession() { return mSession; } + #endif std::vector> getNumInputNodes() const { return mInputShapes; } std::vector> getNumOutputNodes() const { return mOutputShapes; } void setActiveThreads(int); @@ -68,7 +76,8 @@ class OnnxModel private: // Environment variables for the ONNX runtime std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; + std::shared_ptr mSession = nullptr; ///< ONNX session + Ort::MemoryInfo mMemoryInfo; Ort::SessionOptions sessionOptions; // Input & Output specifications of the loaded network From 01dc4a1bd96f3c6094f1368604dff895754a17d3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 26 Jun 2024 09:53:36 +0200 Subject: [PATCH 08/77] Adjusting global settings and CF code for several settings --- Common/ML/src/onnx_interface.cxx | 4 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 28 +++++------ GPU/GPUTracking/ML/onnx_interface.cxx | 49 +++++++++++-------- GPU/GPUTracking/ML/onnx_interface.h | 2 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++++++------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- 8 files changed, 67 insertions(+), 58 deletions(-) diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx index 549575600a656..c348d4577d47f 100644 --- a/Common/ML/src/onnx_interface.cxx +++ b/Common/ML/src/onnx_interface.cxx @@ -200,9 +200,9 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); inputTensors.clear(); float* outputValues = outputTensors[0].GetTensorMutableData(); return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5e9d3499eda77..bc42d50d4a88a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -283,6 +283,7 @@ AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maxim AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") 
AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") +AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path") AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6ed3406646abb..44cd1a5f62f4c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -874,23 +874,21 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - if(doGPU){ - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if(GetProcessingSettings().applyNNclusterizer){ + clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity); + clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity); + clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; + clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; + clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; + clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; + clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; + clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; + clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } else { - if(GetProcessingSettings().applyNNclusterizer){ - clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1); - clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1); - clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; - clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; - clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; - clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; - clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; - clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } else { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } + if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx index 549575600a656..9bb5137ec63dd 100644 --- 
a/GPU/GPUTracking/ML/onnx_interface.cxx +++ b/GPU/GPUTracking/ML/onnx_interface.cxx @@ -35,11 +35,13 @@ std::string OnnxModel::printShape(const std::vector& v) return ss.str(); } -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) +void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) { - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; + if(verbosity > 1){ + LOG(info) << "--- ONNX-ML model ---"; + LOG(info) << "Taking model from: " << localPath; + } modelPath = localPath; activeThreads = threads; @@ -91,17 +93,18 @@ void OnnxModel::init(std::string localPath, bool enableOptimizations, int thread } #endif - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } + if(verbosity > 1){ + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + LOG(info) << "--- Model initialized! ---"; } - - LOG(info) << "--- Model initialized! ---"; } // float* OnnxModel::inference(std::vector input, int device_id) @@ -200,15 +203,21 @@ std::vector OnnxModel::inference_vector(T input, unsigned int size) #else std::vector tmpInputs; std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); + for (unsigned int i = 0; i < mInputNames.size(); i++) { + tmpInputs.emplace_back(mInputNames[i].c_str()); + } + for (unsigned int i = 0; i < mOutputNames.size(); i++) { + tmpOutputs.emplace_back(mOutputNames[i].c_str()); + } + inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } + auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); + inputTensors.clear(); + float* outputValues = outputTensors[0].GetTensorMutableData(); + return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; + } catch (const Ort::Exception& exception) { + LOG(error) << "Error running model inference: " << exception.what(); + } #endif return std::vector{}; } diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h index 5ee2bd716d257..17c45f439dc63 100644 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ b/GPU/GPUTracking/ML/onnx_interface.h @@ -49,7 +49,7 @@ class OnnxModel virtual ~OnnxModel() = default; // Inferencing - void init(std::string, bool = false, int = 0); + void init(std::string, bool = false, int = 0, int = 0); // float* 
inference(std::vector, int = 0); // float* inference(std::vector, int = 0); template float* inference(T input, unsigned int size); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index a449eb23ef426..aed00623ef167 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -150,6 +150,7 @@ class GPUTPCClusterFinder : public GPUProcessor bool nnAddIndexData = true; float nnClassThreshold = 0.16; bool nnSigmoidTrafoThreshold = 1; + int nnClusterizerVerbosity = 1; OnnxModel model_class, model_reg; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d2656531c6df1..d7e3226e0d54c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,28 +34,28 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold); + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; // // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) -{ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY( - MCLabelAccumulator labelAcc(clusterer)); - - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - - std::string path_class = "", path_reg = ""; - - clusterer.model_class.init(path_class, 1, 0); - clusterer.model_reg.init(path_reg, 1, 0); - - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); -} +// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +// { +// Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); +// CPU_ONLY( +// MCLabelAccumulator labelAcc(clusterer)); +// +// tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; +// +// std::string path_class = "", path_reg = ""; +// +// clusterer.model_class.init(path_class, 1, 0); +// clusterer.model_reg.init(path_reg, 1, 0); +// +// GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); +// } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) { @@ -117,7 +117,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i uint* clusterInRow, tpc::ClusterNative* clusterByRow, uint* clusterPosInRow, - int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform){ + int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){ std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 3 : 0)), -1.f); float classification_threshold = class_threshold; @@ -169,7 +169,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; - if(idx == 100){ + if((verbosity > 4) && idx == 100){ LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } @@ -179,7 +179,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if (rejectCluster) { + if ((verbosity > 0) && rejectCluster) { LOG(warning) << "Cluster rejected!"; if (clusterPosInRow) { clusterPosInRow[idx] = maxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 905e6f860a90f..7fbf5a806a916 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -77,7 +77,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint*, tpc::ClusterNative*, uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true); + int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); private: // --------------------------------- From accd7abaac7a2fce98a280ec6e4d8fa2e8eb6254 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 3 Jul 2024 13:39:11 +0200 Subject: [PATCH 09/77] Adding return statement if cluster is rejected --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d7e3226e0d54c..3c2dadaf660b1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -202,8 +202,12 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[idx]; } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); + } else { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; } } From 3473a066755dc4ae23ce7965d7b77cb7d5ffb020 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 4 Jul 2024 14:10:58 +0200 Subject: [PATCH 10/77] Adding some statements back --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 3c2dadaf660b1..98f7cdee72b0c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo CPU_ONLY( MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; // (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); @@ -210,6 +210,10 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i return; } + if((verbosity > 4) && idx == 100){ + LOG(info) << "Clusterization done!"; + } + } From df21c963bc4cd132eb5eb175160bf5c76e264fe3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 17 Oct 2024 14:09:20 +0200 Subject: [PATCH 11/77] Update to latest status of gpu clusterization --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 5 +- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/onnx_interface.h | 100 -- Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/onnx_interface.cxx | 226 ----- Common/ML/src/ort_interface.cxx | 262 ++++++ GPU/GPUTracking/CMakeLists.txt | 3 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 28 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 67 +- GPU/GPUTracking/ML/onnx_interface.cxx | 235 ----- GPU/GPUTracking/ML/onnx_interface.h | 101 -- GPU/GPUTracking/TPCClusterFinder/ChargePos.h | 1 + .../TPCClusterFinder/GPUTPCClusterFinder.h | 25 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 446 ++++++--- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 12 +- 17 files changed, 1651 insertions(+), 824 deletions(-) create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h delete mode 100644 Common/ML/include/ML/onnx_interface.h create mode 100644 Common/ML/include/ML/ort_interface.h delete mode 100644 Common/ML/src/onnx_interface.cxx create mode 100644 Common/ML/src/ort_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.cxx delete mode 100644 GPU/GPUTracking/ML/onnx_interface.h diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 60a07041da2e0..954d29d6e2793 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -10,7 +10,6 @@ # or submit itself to any jurisdiction. o2_add_library(ML - SOURCES src/onnx_interface.cxx + SOURCES src/ort_interface.cxx TARGETVARNAME targetName - PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime -) \ No newline at end of file + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
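[Editor's sketch] Stepping back to the clusterizer hunks of PATCH 09/77 and 10/77 above: a rejected peak is marked with the sentinel value maxClusterPerRow in clusterPosInRow and the kernel thread returns early, while log output is gated by the verbosity level (0: silent, 1: warnings, >4: per-peak debug prints). The following self-contained sketch is not part of the patch; only clusterPosInRow, maxClusterPerRow, idx and verbosity are names taken from the hunks, everything else is an illustrative stand-in.

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for the accept/reject bookkeeping inside nn_clusterizer().
void handlePeak(bool rejectCluster, uint32_t idx, uint32_t rowIndex,
                uint32_t maxClusterPerRow, std::vector<uint32_t>& clusterPosInRow,
                int verbosity)
{
  if (rejectCluster) {
    if (verbosity > 0) {
      std::printf("Cluster rejected!\n");      // warning level
    }
    clusterPosInRow[idx] = maxClusterPerRow;   // sentinel: no cluster stored for this peak
    return;                                    // early exit added in PATCH 09/77
  }
  clusterPosInRow[idx] = rowIndex;             // remember where the accepted cluster went
  if (verbosity > 4 && idx == 100) {
    std::printf("Clusterization done!\n");     // debug print added in PATCH 10/77
  }
}

int main()
{
  std::vector<uint32_t> pos(101, 0);
  handlePeak(true, 7, 0, 900, pos, 1);    // rejected peak  -> pos[7] == 900 (sentinel)
  handlePeak(false, 100, 3, 900, pos, 5); // accepted peak  -> pos[100] == 3, debug print
}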
+ +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. 
+ bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. 
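[Editor's sketch, not part of the patch] A short worked example of the IEEE 754 half-precision layout (1 sign bit, 5 exponent bits with bias 15, 10 mantissa bits) that ToUint16Impl()/ToFloatImpl() above convert to and from, cross-checking two of the constants defined in Float16Impl.

#include <cstdint>
#include <cstdio>

int main()
{
  uint16_t one  = 0x3C00; // kOneBits: biased exponent 15 -> 2^0, mantissa 0 -> 1.0
  uint16_t half = 0x3800; // biased exponent 14 -> 2^-1, mantissa 0 -> 0.5
  uint16_t inf  = 0x7C00; // kPositiveInfinityBits: exponent all ones, mantissa 0
  std::printf("%04x %04x %04x\n", one, half, inf);
}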
+template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. 
+ /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. 
+ /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. 
+ * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/onnx_interface.h b/Common/ML/include/ML/onnx_interface.h deleted file mode 100644 index fcc02a49996ea..0000000000000 --- a/Common/ML/include/ML/onnx_interface.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef COMMON_ML_ONNX_INTERFACE_H -#define COMMON_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - public: - OnnxModel() : mMemoryInfo(Ort::MemoryInfo::CreateCpu(OrtAllocatorType, OrtMemType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // 
COMMON_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h new file mode 100644 index 0000000000000..a365860db3279 --- /dev/null +++ b/Common/ML/include/ML/ort_interface.h @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.h +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#ifndef O2_ML_ONNX_INTERFACE_H +#define O2_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OrtModel +{ + + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } + void init(std::unordered_map optionsMap){ reset(optionsMap); } + void reset(std::unordered_map); + + virtual ~OrtModel() = default; + + // Conversion + template + std::vector v2v(std::vector&, bool = true); + + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); + + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); + + // Reset session + void resetSession(); + + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + + private: + + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; + + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; + + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + + std::string printShape(const std::vector&); + +}; + +} // namespace ml + +} // namespace ml + +#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/onnx_interface.cxx b/Common/ML/src/onnx_interface.cxx deleted file mode 100644 index c348d4577d47f..0000000000000 --- a/Common/ML/src/onnx_interface.cxx +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
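[Editor's sketch] As a usage illustration for the new OrtModel interface above (not part of the patch): the model is configured through a string-to-string options map whose keys mirror the ones parsed in OrtModel::reset() further below, and inference is templated on the input/output value types. The model path and option values here are placeholders, and a valid ONNX file is assumed.

#include "ML/ort_interface.h"
#include <vector>

// Hypothetical caller: classify one input row with a float32 model on the CPU.
std::vector<float> runExample()
{
  o2::ml::OrtModel model({{"model-path", "network_class.onnx"},
                          {"device", "CPU"},
                          {"intra-op-num-threads", "1"},
                          {"logging-level", "1"}});
  // One input row, sized from the model's first input shape [N, nFeatures].
  std::vector<float> input(model.getNumInputNodes()[0][1], 0.f);
  return model.inference<float, float>(input); // matches the <float, float> specialization
}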
-// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads) -{ - - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - - LOG(info) << "--- Model initialized! 
---"; -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - -template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - 
std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ad8e53309beee..0efed3ad4c76c 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -63,7 +63,6 @@ set(SRCS Merger/GPUTPCGlobalDebugSortKernels.cxx Merger/GPUTPCGMPhysicalTrackModel.cxx Merger/GPUTPCGMPolynomialFieldManager.cxx - ML/onnx_interface.cxx DataTypes/GPUTRDTrack.cxx TRDTracking/GPUTRDTracker.cxx TRDTracking/GPUTRDTrackletWord.cxx @@ -313,7 +312,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PUBLIC_LINK_LIBRARIES O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation - ONNXRuntime::ONNXRuntime + O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPE_HEADERS}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAVE_O2HEADERS) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 31d46b928a33f..24c1ea6a6e2ce 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -294,16 +294,26 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) -AddOption(applyNNclusterizer, int, 0, "", 0, "(Bool, default = 0), if the neural network clsuterizer should be used.") -AddOption(nnClusterizerVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") 
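[Editor's sketch] A self-contained illustration (not part of the patch) of the threshold transformation referred to by the nnSigmoidTrafoClassThreshold option a few lines further down: when the classification network was trained with a sigmoid output, a cut at probability p on the sigmoid output is equivalent to a cut at logit(p) = ln(p / (1 - p)) on the raw pre-sigmoid output. The helper name is illustrative.

#include <cmath>
#include <cstdio>

// logit is the inverse of sigmoid(x) = 1 / (1 + exp(-x)):  sigmoid(x) > p  <=>  x > logit(p)
float toLogitThreshold(float p) { return std::log(p / (1.f - p)); }

int main()
{
  std::printf("%f\n", toLogitThreshold(0.5f));  // 0.0   (new nnClassThreshold default)
  std::printf("%f\n", toLogitThreshold(0.16f)); // ~-1.66 (the earlier 0.16 default)
}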
+AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
+AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
+AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
+AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
-AddOption(nnRegressionPath, std::string, "./network_reg.onnx", "", 0, "The regression network path")
-AddOption(nnClassThreshold, float, 0.16, "", 0, "The cutoff at which clusters will be accepted / rejected.")
-AddOption(nnSigmoidTrafoThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
-AddOption(nnAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
-AddOption(nnSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function.
This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") AddHelp("help", 'h') EndConfig() #endif // __OPENCL__ diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index f413598c13f59..528c683944ef1 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -742,7 +742,7 @@ int32_t GPUChainTracking::RunChain() return 1; } } else if (mIOPtrs.tpcPackedDigits || mIOPtrs.tpcZS) { - if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { // FIXME: This enables the neural network clusterization -> Need to actually set this as configurable + if (runRecoStep(RecoStep::TPCClusterFinding, &GPUChainTracking::RunTPCClusterizer, false)) { return 1; } } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e69c3d15c6fc2..eafd50a72424f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -841,7 +841,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } else { - // FIXME: This needs to be removed when I actually apply the NN! For now its onyl to make the code work + // FIXME: This potentially needs to be removed when I actually apply the NN. For now its only to make the code work runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } @@ -875,16 +875,60 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); if(GetProcessingSettings().applyNNclusterizer){ - clusterer.model_class.init(GetProcessingSettings().nnClassificationPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity); - clusterer.model_reg.init(GetProcessingSettings().nnRegressionPath, 1, 1, GetProcessingSettings().nnClusterizerVerbosity); - clusterer.nnSizeInputRow = GetProcessingSettings().nnSizeInputRow; - clusterer.nnSizeInputPad = GetProcessingSettings().nnSizeInputPad; - clusterer.nnSizeInputTime = GetProcessingSettings().nnSizeInputTime; - clusterer.nnAddIndexData = GetProcessingSettings().nnAddIndexData; + // Settings for the clusterizer + clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression; + clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; + clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; + clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; + clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; + clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); + clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; + clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; + + // Settings for the NN evaluation clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; - clusterer.nnSigmoidTrafoThreshold = GetProcessingSettings().nnSigmoidTrafoThreshold; - clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); + clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold; + + // Settings for the neural network evaluation + clusterer.OrtOptions = { + {"model-path", GetProcessingSettings().nnClassificationPath}, + {"device", GetProcessingSettings().nnInferenceDevice}, + {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)}, + {"dtype", GetProcessingSettings().nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)}, + {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}, + {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)} + }; + clusterer.model_class.init(clusterer.OrtOptions); + if(!clusterer.nnClusterizerUseCFregression){ + std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); + if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + clusterer.OrtOptions["model-path"] = reg_model_paths[0]; + clusterer.model_reg_1.init(clusterer.OrtOptions); + } else { + if(reg_model_paths.size() == 1){ + clusterer.OrtOptions["model-path"] = reg_model_paths[0]; + clusterer.model_reg_1.init(clusterer.OrtOptions); + } else { + clusterer.OrtOptions["model-path"] = reg_model_paths[0]; + clusterer.model_reg_1.init(clusterer.OrtOptions); + clusterer.OrtOptions["model-path"] = reg_model_paths[1]; + clusterer.model_reg_2.init(clusterer.OrtOptions); + } + } + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } + + if(clusterer.nnSigmoidTrafoClassThreshold){ + // Inverse sigmoid transformation + clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold)); + } + runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } else { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } @@ -897,9 +941,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if(!GetProcessingSettings().applyNNclusterizer){ runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / 
(float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } } + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } diff --git a/GPU/GPUTracking/ML/onnx_interface.cxx b/GPU/GPUTracking/ML/onnx_interface.cxx deleted file mode 100644 index 9bb5137ec63dd..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.cxx +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// -/// \file model.cxx -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class with functions for ONNX model applications -/// - -// ONNX includes -#include "ML/onnx_interface.h" - -namespace o2 -{ - -namespace ml -{ - -std::string OnnxModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); -} - -void OnnxModel::init(std::string localPath, bool enableOptimizations, int threads, int verbosity) -{ - - if(verbosity > 1){ - LOG(info) << "--- ONNX-ML model ---"; - LOG(info) << "Taking model from: " << localPath; - } - modelPath = localPath; - activeThreads = threads; - -#if __has_include() -#else - mMemoryInfo = Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); -#endif - - /// Enableing optimizations - if(threads != 0){ - // sessionOptions.SetInterOpNumThreads(1); - if(threads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - else{ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - sessionOptions.SetIntraOpNumThreads(threads); - } - } - if (enableOptimizations) { - // sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - // uint32_t coreml_flags = 0; - // coreml_flags |= COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE; - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOptions, coreml_flags)); - } - - mEnv = std::make_shared(ORT_LOGGING_LEVEL_WARNING, "onnx-model"); - #if __has_include() - mSession = std::make_shared(*mEnv, modelPath, sessionOptions); - mInputNames = mSession->GetInputNames(); - mInputShapes = mSession->GetInputShapes(); - mOutputNames = mSession->GetOutputNames(); - mOutputShapes = mSession->GetOutputShapes(); - #else - mSession = std::make_shared(*mEnv, modelPath.c_str(), sessionOptions); - Ort::AllocatorWithDefaultOptions tmpAllocator; - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - mInputNames.push_back(mSession->GetInputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetInputCount(); ++i) { - 
mInputShapes.emplace_back(mSession->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputNames.push_back(mSession->GetOutputNameAllocated(i, tmpAllocator).get()); - } - for (size_t i = 0; i < mSession->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(mSession->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - #endif - - if(verbosity > 1){ - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - LOG(info) << "--- Model initialized! ---"; - } -} - -// float* OnnxModel::inference(std::vector input, int device_id) -// { - -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); - -// try { -// auto outputTensors = mSession->Run(mInputNames, input, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -// float* OnnxModel::inference(std::vector input, int device_id) -// { -// -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, device_id)); -// -// int64_t size = input.size(); -// assert(size % mInputShapes[0][1] == 0); -// std::vector inputShape{size / mInputShapes[0][1], mInputShapes[0][1]}; -// std::vector inputTensors; -// inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), size, inputShape)); -// try { -// auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); -// float* outputValues = outputTensors[0].GetTensorMutableData(); -// return outputValues; -// } catch (const Ort::Exception& exception) { -// LOG(error) << "Error running model inference: " << exception.what(); -// } -// return nullptr; -// } - -template -float* OnnxModel::inference(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), 1)); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return outputValues; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - // LOG(info) << "Input tensors created, memory size: " << mem_size*sizeof(float)/1e6 << "MB"; - return nullptr; -} - 
-template -std::vector OnnxModel::inference_vector(T input, unsigned int size) -{ - std::vector inputShape = mInputShapes[0]; - inputShape[0] = size; - std::vector inputTensors; - // std::vector outputValues; - size_t mem_size = 1; - for(auto elem : inputShape){ - mem_size*=elem; - } -#if __has_include() - inputTensors.emplace_back(Ort::Experimental::Value::CreateTensor(input.data(), mem_size, inputShape)); - try { - auto outputTensors = mSession->Run(mInputNames, inputTensors, mOutputNames); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#else - std::vector tmpInputs; - std::vector tmpOutputs; - for (unsigned int i = 0; i < mInputNames.size(); i++) { - tmpInputs.emplace_back(mInputNames[i].c_str()); - } - for (unsigned int i = 0; i < mOutputNames.size(); i++) { - tmpOutputs.emplace_back(mOutputNames[i].c_str()); - } - inputTensors.emplace_back(Ort::Value::CreateTensor(mMemoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); - try { - auto outputTensors = mSession->Run(Ort::RunOptions{nullptr}, tmpInputs.data(), inputTensors.data(), inputTensors.size(), tmpOutputs.data(), mOutputNames.size()); - inputTensors.clear(); - float* outputValues = outputTensors[0].GetTensorMutableData(); - return std::vector{outputValues, outputValues + size * mOutputShapes[0][1]}; - } catch (const Ort::Exception& exception) { - LOG(error) << "Error running model inference: " << exception.what(); - } -#endif - return std::vector{}; -} - -void OnnxModel::setActiveThreads(int threads) -{ - activeThreads = threads; -} - -template float* OnnxModel::inference(std::vector, unsigned int); -template std::vector OnnxModel::inference_vector(std::vector, unsigned int); - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE \ No newline at end of file diff --git a/GPU/GPUTracking/ML/onnx_interface.h b/GPU/GPUTracking/ML/onnx_interface.h deleted file mode 100644 index 17c45f439dc63..0000000000000 --- a/GPU/GPUTracking/ML/onnx_interface.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
- -/// -/// \file model.h -/// -/// \author Christian Sonnabend -/// -/// \brief A general-purpose class for ONNX models -/// - -#ifndef GPU_ML_ONNX_INTERFACE_H -#define GPU_ML_ONNX_INTERFACE_H - -// C++ and system includes -#if __has_include() -#include -#else -#include -#endif -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OnnxModel -{ - - public: - OnnxModel(OrtAllocatorType allocatorType = OrtDeviceAllocator, OrtMemType memoryType = OrtMemTypeCPU) : mMemoryInfo(Ort::MemoryInfo::CreateCpu(allocatorType, memoryType)) {}; - virtual ~OnnxModel() = default; - - // Inferencing - void init(std::string, bool = false, int = 0, int = 0); - // float* inference(std::vector, int = 0); - // float* inference(std::vector, int = 0); - template float* inference(T input, unsigned int size); - template std::vector inference_vector(T input, unsigned int size); - - // Reset session - #if __has_include() - void resetSession() { mSession.reset(new Ort::Experimental::Session{*mEnv, modelPath, sessionOptions}); }; - #else - void resetSession() { mSession.reset(new Ort::Session{*mEnv, modelPath.c_str(), sessionOptions}); }; - #endif - - // Getters & Setters - Ort::SessionOptions* getSessionOptions() { return &sessionOptions; } // For optimizations in post - #if __has_include() - std::shared_ptr getSession() { return mSession; } - #else - std::shared_ptr getSession() { return mSession; } - #endif - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - void setActiveThreads(int); - - private: - // Environment variables for the ONNX runtime - std::shared_ptr mEnv = nullptr; - std::shared_ptr mSession = nullptr; ///< ONNX session - Ort::MemoryInfo mMemoryInfo; - Ort::SessionOptions sessionOptions; - - // Input & Output specifications of the loaded network - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - // Environment settings - std::string modelPath; - int activeThreads = 0; - - // Internal function for printing the shape of tensors - std::string printShape(const std::vector&); -}; - -} // namespace gpu - -} // namespace GPUCA_NAMESPACE - -#endif // GPU_ML_ONNX_INTERFACE_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h index f5ca9dbedd5ac..c2ee542f65434 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h @@ -47,6 +47,7 @@ struct ChargePos { GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; } GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; } GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; } + GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } private: // Maps the position of a pad given as row and index in that row to a unique diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 10b52ca05da71..130453e833911 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,8 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/onnx_interface.h" +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" using 
namespace o2::ml; @@ -144,16 +145,20 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; - int nnSizeInputRow = 3; - int nnSizeInputPad = 3; - int nnSizeInputTime = 3; - bool nnAddIndexData = true; + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; float nnClassThreshold = 0.16; - bool nnSigmoidTrafoThreshold = 1; - int nnClusterizerVerbosity = 1; - - OnnxModel model_class, model_reg; - + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCFregression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerVerbosity = 0; + + std::unordered_map OrtOptions; + OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 98f7cdee72b0c..e6cf745ce3101 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,76 +34,63 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, clusterer.nnSizeInputRow, clusterer.nnSizeInputPad, clusterer.nnSizeInputTime, clusterer.nnAddIndexData, clusterer.nnClassThreshold, clusterer.nnSigmoidTrafoThreshold, clusterer.nnClusterizerVerbosity); - + if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){ + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) { + GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + } else { + LOG(fatal) << "Unsupported data type for neural network clusterizer!"; + } // tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; // // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } -// GPUd() void GPUTPCNNClusterizer::exec(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) -// { -// Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); -// CPU_ONLY( -// MCLabelAccumulator labelAcc(clusterer)); -// -// tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; -// -// std::string path_class = "", path_reg = ""; -// -// clusterer.model_class.init(path_class, 1, 0); -// clusterer.model_reg.init(path_reg, 1, 0); -// -// GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, 3, 3, 3, true, 0.16, true); -// } +int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} -int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current) +int GPUTPCNNClusterizer::rowOffset(int row, int global_shift) { - std::vector pad_row_max{ - 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 - }; - return (int)((pad_row_max[row_ref] - pad_row_max[row_current]) / 2); + return (row > 62 ? 
global_shift : 0); } // --------------------------------- -bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift) +bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) { - std::vector pad_row_max{ - 65, 65, 65, 67, 67, 67, 69, 69, 69, 71, 71, 71, 73, 73, 73, 73, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 81, 81, 81, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 91, 91, 91, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 99, 75, 75, 75, 75, 77, 77, 77, 79, 79, 79, 79, 81, 81, 81, 83, 83, 83, 83, 85, 85, 85, 87, 87, 87, 89, 89, 89, 89, 91, 91, 91, 93, 93, 93, 93, 95, 95, 95, 97, 97, 97, 99, 99, 101, 101, 101, 103, 103, 103, 105, 109, 109, 111, 111, 111, 113, 113, 113, 115, 115, 115, 117, 117, 117, 117, 117, 119, 119, 121, 121, 123, 123, 123, 125, 125, 127, 127, 127, 129, 129, 131, 131, 131, 133, 133, 135, 135, 137, 137 - }; - if (row < 0 || pad < 0) { + if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row <= 62) { - // if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row]) / 2) { + // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)) / 2) { // return true; // } else { // return false; // } - if (pad < 0 || pad > pad_row_max[row]) { + if (pad < 0 || pad > geo.NPads(row)) { return true; } else { return false; } - } else if (row <= 62 + global_shift) { + } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - //if (pad < (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] - pad_row_max[row - global_shift]) / 2 || pad > (pad_row_max[o2::tpc::constants::MAXGLOBALPADROW-1] + pad_row_max[row - global_shift]) / 2) { + //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { // return true; //} else { // return false; //} - if (pad < 0 || pad > pad_row_max[row]) { + if (pad < 0 || pad > geo.NPads(row)) { return true; } else { return false; } - } else if (row > o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - return true; } else { - return false; + return true; } } +template GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, const CfFragment& fragment, @@ -116,104 +103,321 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i uint maxClusterPerRow, uint* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow, - int in_row, int in_pad, int in_time, bool add_index_data, float class_threshold, bool sigmoid_transform, int verbosity){ + uint* clusterPosInRow){ - std::vector input_data(((2*in_row + 1) * (2*in_pad + 1) * (2*in_time + 1) + (add_index_data ? 
3 : 0)), -1.f); - float classification_threshold = class_threshold; - if(sigmoid_transform){ - classification_threshold = (float)std::log(class_threshold/(1.f-class_threshold)); - } - - uint idx = get_global_id(0); - uint cls = CAMath::Min(idx, clusternum - 1); + uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; + if(glo_idx >= clusternum){ + return; + } - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. - - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - CPU_ONLY(labelAcc->collect(peak, central_charge)); - // unsigned int glo_idx = cls * ((2*in_row + 1) + (2*in_pad + 1) * (2*in_time + 1)); - unsigned int write_idx = 0; - for(int r = -in_row; r <= in_row; r++){ - for(int p = -in_pad; p <= in_pad; p++){ - for(int t = -in_time; t <= in_time; t++){ - int offset = GPUTPCNNClusterizer::padOffset(row, row + r); - if(GPUTPCNNClusterizer::isBoundary(row + r, pad + p, in_row)){ - continue; - } else { - // unsigned int loc_idx = (row + r) * (2*in_pad + 1) * (2*in_time + 1) + (pad + p) * (2*in_time + 1) + (time + t); - ChargePos tmp_pos(row + r, pad + p + offset, time + t); - input_data[write_idx] = (chargeMap[tmp_pos].unpack() / central_charge); - write_idx++; + std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); + std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); + std::vector peak_positions(clusterer.nnClusterizerBatchedMode); + unsigned int write_idx = 0; + + for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){ + + uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); + + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + + peak_positions[batch_counter] = peak; + central_charges[batch_counter] = central_charge; + + // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; + for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){ + bool push_mc_label = (r == 0); + int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){ + push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window + bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){ + push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window + if(!is_boundary){ + ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); + input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); + if(push_mc_label){ + ChargePos tmp_pos_mc(row, pad + p, time + t); + CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); + } + } + write_idx++; + } } } - // if(idx == 100){ - // LOG(info) << "[" << input_data[write_idx-7] << ", " << input_data[write_idx-6] << ", " << input_data[write_idx-5] << ", " << 
input_data[write_idx-4] << ", " << input_data[write_idx-3] << ", " << input_data[write_idx-2] << ", " << input_data[write_idx-1] << "]"; - // } + if(clusterer.nnClusterizerAddIndexData){ + input_data[write_idx] = (T)(clusterer.mISlice / 36.f); + input_data[write_idx + 1] = (T)(row / 152.f); + input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + write_idx+=3; + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } + } } - } - if(add_index_data){ - input_data[input_data.size()-3] = 1; - input_data[input_data.size()-2] = (float)peak.row() / 152.f; - input_data[input_data.size()-1] = (float)peak.pad() / 138.f; - // if(idx == 100){ - // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - // } - } - std::vector out_class = clusterer.model_class.inference_vector(input_data, 1); - std::vector out_reg = clusterer.model_reg.inference_vector(input_data, 1); - int num_outputs = clusterer.model_reg.getNumOutputNodes()[0][1]; + std::vector index_class_2; + std::vector out_class = clusterer.model_class.inference(input_data); + // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); + int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; + + if(num_output_classes > 1){ + std::vector tmp_out_class(clusterer.nnClusterizerBatchedMode); + for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){ + auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes); + tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 + if(tmp_out_class[cls_idx] > 1){ + index_class_2.push_back(cls_idx); + } + } + out_class = tmp_out_class; + } - if((verbosity > 4) && idx == 100){ - LOG(info) << "Classification model: " << out_class[0] << " (>? 
" << classification_threshold << ")"; - LOG(info) << "Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } + if(!clusterer.nnClusterizerUseCFregression) { + + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if(index_class_2.size() > 0){ + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for(int cls_idx : index_class_2){ + int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; + } + fill_counter++; + } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } + + input_data.clear(); - if(out_class[0] > classification_threshold){ - ClusterAccumulator pc; - pc.setFull(central_charge * out_reg[4], peak.pad() + out_reg[0], out_reg[2], fragment.start + peak.time() + out_reg[1], out_reg[3], 0, 0); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param()); - if ((verbosity > 0) && rejectCluster) { - LOG(warning) << "Cluster rejected!"; - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; + if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? " << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } - return; - } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if(num_output_classes > 1){ + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, maxClusterPerRow)); - } else { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - if((verbosity > 4) && idx == 100){ - LOG(info) << "Clusterization done!"; - } + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + + if (glo_idx + element >= clusternum) { + return; + } + int model_output_index = element*num_outputs_1; + if(out_class[element] > clusterer.nnClassThreshold) { + if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + // Dummy build to push MC labels + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index 
+ 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); + // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } else { + model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2; + counter_class_2_idcs++; + + // Cluster 1 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + + // Cluster 2 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); + // LOG(info) << "Example: " << 
num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + } else { + + input_data.clear(); + for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if(out_class[element] > clusterer.nnClassThreshold) { + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if(clusterer.nnClusterizerVerbosity > 3){ + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if(clusterer.nnClusterizerVerbosity > 4){ + LOG(info) << "[CF] Clusterization done!"; + } } @@ -449,4 +653,4 @@ GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); } return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 7fbf5a806a916..42104ae2099d3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -62,8 +62,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, 
tpc::ClusterNative*, uint*); static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); - static int padOffset(int, int); - static bool isBoundary(int, int, int); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); + + template static GPUd() void nn_clusterizer(int, int, int, int, processorType&, const CfFragment&, @@ -76,8 +79,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint, uint*, tpc::ClusterNative*, - uint*, - int = 3, int = 3, int = 3, bool = 1, float = 0.16, bool = true, int = 1); + uint*); private: // --------------------------------- @@ -93,4 +95,4 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate } // namespace GPUCA_NAMESPACE::gpu -#endif +#endif \ No newline at end of file From 06737fd8d044a75d4e6da947a3ae6792c7ae42af Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:16:01 +0200 Subject: [PATCH 12/77] Fixing uchar -> uint8_t --- GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h | 6 +++--- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 8 ++++---- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 534cc44513286..d308b8bd6efa7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,7 @@ class ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; - GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uchar splitInTime, uchar splitInPad){ + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){ mQtot = qtot; mPadMean = padMean; mPadSigma = padSigma; @@ -57,8 +57,8 @@ class ClusterAccumulator GPUd() void setPadSigma(float padSigma) { mPadSigma = padSigma; } GPUd() void setTimeMean(float timeMean) { mTimeMean = timeMean; } GPUd() void setTimeSigma(float timeSigma) { mTimeSigma = timeSigma; } - GPUd() void setSplitInTime(uchar splitInTime) { mSplitInTime = splitInTime; } - GPUd() void setSplitInPad(uchar splitInPad) { mSplitInPad = splitInPad; } + GPUd() void setSplitInTime(uint8_t splitInTime) { mSplitInTime = splitInTime; } + GPUd() void setSplitInPad(uint8_t splitInPad) { mSplitInPad = splitInPad; } private: float mQtot = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index e6cf745ce3101..f5e094a3c363e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -505,9 +505,9 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( const ChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc, - uchar* innerAboveThreshold) + uint8_t* innerAboveThreshold) { - uchar aboveThreshold = 0; + uint8_t aboveThreshold = 0; GPUCA_UNROLL(U(), U()) for (ushort i = 0; i < N; i++) { @@ -520,7 +520,7 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( CPU_ONLY( labelAcc->collect(pos.delta(d), q)); - aboveThreshold |= (uchar(q > calib.tpc.cfInnerThreshold) << i); + 
aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); } innerAboveThreshold[lid] = aboveThreshold; @@ -558,7 +558,7 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( ChargePos pos, ChargePos* posBcast, PackedCharge* buf, - uchar* innerAboveThreshold, + uint8_t* innerAboveThreshold, ClusterAccumulator* myCluster, MCLabelAccumulator* labelAcc) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 42104ae2099d3..51a5c29022421 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -40,7 +40,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate struct GPUSharedMemory { ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uchar innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; #ifdef GPUCA_HAVE_O2HEADERS @@ -84,11 +84,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate private: // --------------------------------- - static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uchar*); + static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uchar*, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); }; From b14844990173a00a66d9e2ad62185232ab3992d6 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Oct 2024 09:55:31 +0200 Subject: [PATCH 13/77] Adding utils header --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index eafd50a72424f..0f22a7472feac 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -37,6 +37,7 @@ #endif #include "utils/strtag.h" +#include #ifndef GPUCA_NO_VC #include From 534da50f248210cff92acdeac763f4f74a2de30e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Oct 2024 09:40:43 +0200 Subject: [PATCH 14/77] Updating kernels.cmake to uint8_t --- GPU/GPUTracking/kernels.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 5b5aed94a7472..b6490c0c5b4c6 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -116,7 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) 
-o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single char onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From bb2cb6e48d12f71fb634b1429bf284db23bb97ee Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 21 Oct 2024 07:41:20 +0000 Subject: [PATCH 15/77] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++- Common/ML/src/ort_interface.cxx | 88 +-- .../Global/GPUChainTrackingClusterizer.cxx | 23 +- .../TPCClusterFinder/ClusterAccumulator.h | 3 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 539 +++++++++--------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 26 +- 7 files changed, 385 insertions(+), 372 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? 
std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0f22a7472feac..d8470fdc2bf10 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -838,7 +838,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); } else { @@ -875,14 +875,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - if(GetProcessingSettings().applyNNclusterizer){ + if (GetProcessingSettings().applyNNclusterizer) { // Settings for the clusterizer clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression; clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; - clusterer.nnClusterizerElementSize = ((2*clusterer.nnClusterizerSizeInputRow + 1) * (2*clusterer.nnClusterizerSizeInputPad + 1) * (2*clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); + clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; @@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Settings for the neural network evaluation clusterer.OrtOptions = { {"model-path", GetProcessingSettings().nnClassificationPath}, - {"device", GetProcessingSettings().nnInferenceDevice}, + {"device", GetProcessingSettings().nnInferenceDevice}, {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)}, {"dtype", GetProcessingSettings().nnInferenceDtype}, @@ -901,16 +901,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}, {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)} - }; + {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}}; clusterer.model_class.init(clusterer.OrtOptions); - if(!clusterer.nnClusterizerUseCFregression){ + if (!clusterer.nnClusterizerUseCFregression) { std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); - if(clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { - if(reg_model_paths.size() == 1){ + if (reg_model_paths.size() == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { @@ -925,9 +924,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if(clusterer.nnSigmoidTrafoClassThreshold){ + if (clusterer.nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold/(1.f-clusterer.nnClassThreshold)); + clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } else { @@ -939,7 +938,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - if(!GetProcessingSettings().applyNNclusterizer){ + if (!GetProcessingSettings().applyNNclusterizer) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); } else { runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index d308b8bd6efa7..b7e535a107eac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -43,7 +43,8 @@ class 
ClusterAccumulator GPUd() void finalize(const ChargePos&, tpccf::Charge, tpccf::TPCTime, const GPUTPCGeometry&); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&) const; - GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad){ + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad) + { mQtot = qtot; mPadMean = padMean; mPadSigma = padSigma; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 130453e833911..fd420357073e9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -158,7 +158,7 @@ class GPUTPCClusterFinder : public GPUProcessor std::unordered_map OrtOptions; OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters - + #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index f5e094a3c363e..ba8fac2a397e9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -34,15 +34,15 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlo tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - if(clusterer.OrtOptions["dtype"].find("32") != std::string::npos){ + if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) { GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); - } else if(clusterer.OrtOptions["dtype"].find("16") != std::string::npos) { + } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) { GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } else { LOG(fatal) << "Unsupported data type for neural network clusterizer!"; } // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; -// + // // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } @@ -74,12 +74,12 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G } } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network return true; - } else if (row <= o2::tpc::constants::MAXGLOBALPADROW-1 + global_shift) { - //if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { - // return true; - //} else { - // return false; - //} + } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { + // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { + // return true; + // } else { + // return false; + // } if (pad < 0 || pad > geo.NPads(row)) { return true; } else { @@ -92,277 +92,135 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G template GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint clusternum, - uint maxClusterPerRow, - uint* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow){ - - uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; - if(glo_idx >= clusternum){ - return; - } + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint clusternum, + uint maxClusterPerRow, + uint* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint* clusterPosInRow) +{ + + uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; + if (glo_idx >= clusternum) { + return; + } - std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); - std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); - std::vector peak_positions(clusterer.nnClusterizerBatchedMode); - unsigned int write_idx = 0; - - for(int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++){ - - uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); - - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - - peak_positions[batch_counter] = peak; - central_charges[batch_counter] = central_charge; - - // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; - for(int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++){ - bool push_mc_label = (r == 0); - int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); - for(int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++){ - push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window - bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for(int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++){ - push_mc_label &= (std::abs(t) < 2); // Use 
inner 5x5 window - if(!is_boundary){ - ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); - input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); - if(push_mc_label){ - ChargePos tmp_pos_mc(row, pad + p, time + t); - CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); - } + std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); + std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); + std::vector peak_positions(clusterer.nnClusterizerBatchedMode); + unsigned int write_idx = 0; + + for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) { + + uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); + + ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + + peak_positions[batch_counter] = peak; + central_charges[batch_counter] = central_charge; + + // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; + for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { + bool push_mc_label = (r == 0); + int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) { + push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window + bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { + push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); + input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); + if (push_mc_label) { + ChargePos tmp_pos_mc(row, pad + p, time + t); + CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); } - write_idx++; } + write_idx++; } } - if(clusterer.nnClusterizerAddIndexData){ - input_data[write_idx] = (T)(clusterer.mISlice / 36.f); - input_data[write_idx + 1] = (T)(row / 152.f); - input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); - write_idx+=3; - // if(idx == 100){ - // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - // } - } } + if (clusterer.nnClusterizerAddIndexData) { + input_data[write_idx] = (T)(clusterer.mISlice / 36.f); + input_data[write_idx + 1] = (T)(row / 152.f); + input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + write_idx += 3; + // if(idx == 100){ + // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; + // } + } + } - std::vector index_class_2; - std::vector out_class = clusterer.model_class.inference(input_data); - // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); - int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; - - if(num_output_classes > 1){ - std::vector 
tmp_out_class(clusterer.nnClusterizerBatchedMode); - for(int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++){ - auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx*num_output_classes); - tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator+num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 - if(tmp_out_class[cls_idx] > 1){ - index_class_2.push_back(cls_idx); - } + std::vector index_class_2; + std::vector out_class = clusterer.model_class.inference(input_data); + // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); + int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; + + if (num_output_classes > 1) { + std::vector tmp_out_class(clusterer.nnClusterizerBatchedMode); + for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) { + auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes); + tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 + if (tmp_out_class[cls_idx] > 1) { + index_class_2.push_back(cls_idx); } - out_class = tmp_out_class; } + out_class = tmp_out_class; + } - if(!clusterer.nnClusterizerUseCFregression) { + if (!clusterer.nnClusterizerUseCFregression) { - std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; - if(index_class_2.size() > 0){ - std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); - int fill_counter = 0; - for(int cls_idx : index_class_2){ - int from_idx = cls_idx*clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; - for(int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++){ - tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; - } - fill_counter++; + std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; + if (index_class_2.size() > 0) { + std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); + int fill_counter = 0; + for (int cls_idx : index_class_2) { + int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; + for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) { + tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; } - tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); - } - - input_data.clear(); - - if((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0){ - LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; - LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } - - int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; - if(num_output_classes > 1){ - num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + fill_counter++; } + tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + } - for(int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - - if (glo_idx + element >= clusternum) { - return; - } - - int model_output_index = element*num_outputs_1; - if(out_class[element] > clusterer.nnClassThreshold) { - if((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { - // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - - // Dummy build to push MC labels - buildCluster( - calib, - chargeMap, - peak_positions[element], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + input_data.clear(); - pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); - // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } else { - model_output_index = index_class_2[counter_class_2_idcs]*num_outputs_2; - counter_class_2_idcs++; - - // Cluster 1 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) { + LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; + LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; + } - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; + if (num_output_classes > 1) { + num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + } - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - - // Cluster 2 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { - rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } - } + if (glo_idx + element >= clusternum) { + return; } - } else { - - input_data.clear(); - for(int element = 0; element 
< clusterer.nnClusterizerBatchedMode; element++) { - if (glo_idx + element >= clusternum) { - return; - } - - if(out_class[element] > clusterer.nnClassThreshold) { - + int model_output_index = element * num_outputs_1; + if (out_class[element] > clusterer.nnClassThreshold) { + if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { + // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); ClusterAccumulator pc; + + ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + // Dummy build to push MC labels buildCluster( calib, chargeMap, @@ -370,7 +228,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i smem.posBcast, smem.buf, smem.innerAboveThreshold, - &pc, + &dummy_pc, labelAcc); if (fragment.isOverlap(peak_positions[element].time())) { @@ -379,20 +237,67 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } continue; } - pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); + // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } else { + model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2; + counter_class_2_idcs++; + // Cluster 1 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + ClusterAccumulator pc; + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + 
+ tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if(clusterer.nnClusterizerVerbosity > 3){ - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; } + continue; + } uint rowIndex = 0; if (clusterByRow != nullptr) { @@ -409,18 +314,112 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else if (clusterPosInRow) { rowIndex = clusterPosInRow[glo_idx + element]; } + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + // Cluster 2 + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); + // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; + rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + + rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); } } } - if(clusterer.nnClusterizerVerbosity > 4){ - LOG(info) << "[CF] Clusterization done!"; - } -} + } else { + + input_data.clear(); + for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + if (glo_idx + element >= clusternum) { + return; + } + + if (out_class[element] > clusterer.nnClassThreshold) { + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); + buildCluster( + calib, + chargeMap, + peak_positions[element], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (fragment.isOverlap(peak_positions[element].time())) { + if (clusterPosInRow) { + clusterPosInRow[glo_idx + element] = maxClusterPerRow; + } + continue; + } + pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity > 3) { + LOG(warning) << "[CF] Cluster rejected!"; + } + if (clusterPosInRow) { + clusterPosInRow[glo_idx 
+ element] = maxClusterPerRow; + } + continue; + } + + uint rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + peak_positions[element].row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[glo_idx + element] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[glo_idx + element]; + } + + CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + } + } + } + + if (clusterer.nnClusterizerVerbosity > 4) { + LOG(info) << "[CF] Clusterization done!"; + } +} GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, processorType& clusterer, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 51a5c29022421..98d979d28cf15 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -66,20 +66,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static int rowOffset(int, int); static bool isBoundary(int, int, int, const GPUTPCGeometry&); - template + template static GPUd() void nn_clusterizer(int, int, int, int, - processorType&, - const CfFragment&, - GPUSharedMemory&, - const Array2D&, - const ChargePos*, - const GPUSettingsRec&, - MCLabelAccumulator*, - uint, - uint, - uint*, - tpc::ClusterNative*, - uint*); + processorType&, + const CfFragment&, + GPUSharedMemory&, + const Array2D&, + const ChargePos*, + const GPUSettingsRec&, + MCLabelAccumulator*, + uint, + uint, + uint*, + tpc::ClusterNative*, + uint*); private: // --------------------------------- From 25093b33e1472d21a14e6396aa1d9fe1953d6b1b Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 18 Nov 2024 12:50:31 +0100 Subject: [PATCH 16/77] Adding an ONNX CPU library in the O2 framework --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 15 + Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 867 ++++++++++++++++++ Common/ML/include/ML/ort_interface.h | 94 ++ Common/ML/src/ort_interface.cxx | 262 ++++++ 5 files changed, 1239 insertions(+) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/3rdparty/GPUORTFloat16.h create mode 100644 Common/ML/include/ML/ort_interface.h create mode 100644 Common/ML/src/ort_interface.cxx diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -16,5 +16,6 @@ add_subdirectory(Types) add_subdirectory(Utils) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) +add_subdirectory(ML) o2_data_file(COPY maps DESTINATION Common) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..954d29d6e2793 --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. +# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. +# +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. 
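Editorial note (illustrative, not part of the patch): the o2_add_library(ML ...) call just below creates the build target for the new ONNX Runtime wrapper that the NN clusterizer code in this series uses. A downstream component would link against it with a stanza along the following lines; "MyAnalysis" and "src/MyAnalysis.cxx" are placeholder names, and the "O2::ML" target alias is an assumption based on the usual naming of o2_add_library() targets.

o2_add_library(MyAnalysis
               SOURCES src/MyAnalysis.cxx
               PRIVATE_LINK_LIBRARIES O2::ML)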
+ +o2_add_library(ML + SOURCES src/ort_interface.cxx + TARGETVARNAME targetName + PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h new file mode 100644 index 0000000000000..db65328409d3c --- /dev/null +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -0,0 +1,867 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. 
+ /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? 
+ o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. +template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). 
+ /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. 
+ * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. + /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. 
+ * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h new file mode 100644 index 0000000000000..a365860db3279 --- /dev/null +++ b/Common/ML/include/ML/ort_interface.h @@ -0,0 +1,94 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.h +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#ifndef O2_ML_ONNX_INTERFACE_H +#define O2_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include + +// O2 includes +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OrtModel +{ + + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } + void init(std::unordered_map optionsMap){ reset(optionsMap); } + void reset(std::unordered_map); + + virtual ~OrtModel() = default; + + // Conversion + template + std::vector v2v(std::vector&, bool = true); + + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); + + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); + + // Reset session + void resetSession(); + + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + + private: + + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; + + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; + + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + + std::string printShape(const std::vector&); + +}; + +} // namespace ml + +} // namespace ml + +#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..84a06ce1da068 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,262 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +// ONNX includes +#include + +namespace o2 +{ + +namespace ml +{ + +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + +void OrtModel::reset(std::unordered_map optionsMap){ + + pImplOrt = new OrtVariables(); + + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? 
std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; + } +#endif + + if(allocateDeviceMemory){ + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } + + if(device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + if(loggingLevel > 1) { + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + } +} + +void OrtModel::resetSession() { + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + 
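+  // As in the specializations above: the caller's buffer is wrapped by
+  // Ort::Value::CreateTensor without copying, the session is run once, and the
+  // first output tensor is copied into the returned vector.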
std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + outputTensors.clear(); + return outputValuesVec; +} + +} // namespace ml + +} // namespace o2 \ No newline at end of file From 9232328476bbafb06cc660c2f122d81b67da9d73 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 18 Nov 2024 18:48:18 +0000 Subject: [PATCH 17/77] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 ++++++++++++------------ Common/ML/src/ort_interface.cxx | 88 ++++++++++++++++------------ 2 files changed, 89 insertions(+), 75 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 84a06ce1da068..8ebe0588b4a2b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . 
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,27 +110,27 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names - if(loggingLevel > 1) { + if (loggingLevel > 1) { LOG(info) << "Input Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); @@ -142,24 +143,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -171,10 +176,11 @@ std::vector OrtModel::inference(std::vector& input){ return outputValuesVec; } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -195,7 +201,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -207,7 +215,9 @@ template <> std::vector OrtModel::inference(std::vector std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -219,7 +229,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -231,7 +243,9 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -243,9 +257,11 @@ template <> std::vector OrtModel::inference std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), 
(int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } From 7251c5cfb30266479d3f8d7df38c733ba65add77 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 09:23:26 +0100 Subject: [PATCH 18/77] Fixing macOS build issues with calling O*.data() --- Common/ML/src/ort_interface.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 8ebe0588b4a2b..222dab55e6e6b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -167,7 +167,7 @@ std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); @@ -182,7 +182,7 @@ std::vector OrtModel::inference(std::vector>& input) std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); From d0f4dd8271a880c3152cc4e7ae511bb8439aa466 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 20:40:17 +0100 Subject: [PATCH 19/77] Fixing compiler issues and char -> uint8_t --- Common/ML/src/ort_interface.cxx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 222dab55e6e6b..cf60a3369613a 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -107,7 +107,7 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } template @@ -156,8 +156,9 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput) + if (clearInput) { input.clear(); + } return output; } } @@ -195,8 +196,9 @@ std::vector OrtModel::inference(std::vector>& input) std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) + for (size_t i = 0; i < v.size() - 1; i++) { ss << v[i] << "x"; + } ss << v[v.size() - 1]; return ss.str(); } From 7859ab25223ec10c475bbbfa4c6b2da09dfcc609 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 21:09:14 +0100 Subject: [PATCH 20/77] Fixing curly braces --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index cf60a3369613a..feeebe99fa6fa 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); } template From c6cb3e6f2992f9328185c360c1590a412f401575 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 19 Nov 2024 22:29:48 +0100 Subject: [PATCH 21/77] Fixing std::make_shared --- Common/ML/src/ort_interface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index feeebe99fa6fa..160fdbadf84e4 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -145,7 +145,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions)); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } template From 40bc4371920d9f7b51469d58135d7ee742ea5606 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 20 Nov 2024 10:38:11 +0100 Subject: [PATCH 22/77] Changing order for --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bccff6328cb1d..c528f65c3924f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -12,6 +12,8 @@ /// \file GPUChainTrackingClusterizer.cxx /// \author David Rohr +#include + #include "GPUChainTracking.h" #include "GPUChainTrackingDefs.h" #include "GPULogging.h" @@ -37,7 +39,6 @@ #endif #include "utils/strtag.h" -#include #ifndef GPUCA_NO_VC #include From 52b033f0c9594fc5238c986037c3dc9645a04841 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Dec 2024 22:46:16 +0100 Subject: [PATCH 23/77] Bug-fixing file name --- GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index fd420357073e9..af5315ddae4ac 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,7 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/ort_interface.h" +#include "ML/OrtInterface.h" #include "ML/3rdparty/GPUORTFloat16.h" using namespace o2::ml; From 684eb56a73022743705019b8c7685f89be1d1021 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 6 Feb 2025 18:38:08 +0100 Subject: [PATCH 24/77] Making NN clusterizer more efficient --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 6 +++- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 29 ++++++++++--------- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index eea8b3f0bbfe7..0bd18efb943f4 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -318,6 +318,7 @@ AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2") AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN") AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") +AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index ad5b095d38016..c8bfbfa7eecbe 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -901,7 +901,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; - clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; + if (GetProcessingSettings().nnClusterizerVerbosity < 0){ + clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; + } else { + clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; + } // Settings for the NN evaluation clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index ba8fac2a397e9..76b0dbee20464 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -111,12 +111,13 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i return; } - std::vector central_charges(clusterer.nnClusterizerBatchedMode, -1.f); - std::vector input_data(clusterer.nnClusterizerElementSize * clusterer.nnClusterizerBatchedMode, (T)-1.f); - std::vector peak_positions(clusterer.nnClusterizerBatchedMode); + uint numElements = CAMath::Min(glo_idx + clusterer.nnClusterizerBatchedMode, clusternum - glo_idx); + std::vector central_charges(numElements, -1.f); + std::vector input_data(clusterer.nnClusterizerElementSize * numElements, (T)-1.f); + std::vector peak_positions(numElements); unsigned int write_idx = 0; - for (int batch_counter = 0; batch_counter < clusterer.nnClusterizerBatchedMode; batch_counter++) { + for (int batch_counter = 0; batch_counter < numElements; batch_counter++) { uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); @@ -166,8 +167,8 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; if (num_output_classes > 1) { - std::vector tmp_out_class(clusterer.nnClusterizerBatchedMode); - for (int cls_idx = 0; cls_idx < clusterer.nnClusterizerBatchedMode; cls_idx++) { + std::vector tmp_out_class(numElements); + for (int cls_idx = 0; cls_idx < numElements; cls_idx++) { auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes); tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 if (tmp_out_class[cls_idx] > 1) { @@ -195,7 +196,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i input_data.clear(); - if ((clusterer.nnClusterizerVerbosity >= 4) && glo_idx == 0) { + if ((clusterer.nnClusterizerVerbosity < 1) && glo_idx == 0) { LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
" << clusterer.nnClassThreshold << ")"; LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; } @@ -205,7 +206,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; } - for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + for (int element = 0; element < numElements; element++) { if (glo_idx + element >= clusternum) { return; @@ -244,7 +245,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity > 3) { + if (clusterer.nnClusterizerVerbosity < 1) { LOG(warning) << "[CF] Cluster rejected!"; } if (clusterPosInRow) { @@ -290,7 +291,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity > 3) { + if (clusterer.nnClusterizerVerbosity < 1) { LOG(warning) << "[CF] Cluster rejected!"; } if (clusterPosInRow) { @@ -322,7 +323,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity > 3) { + if (clusterer.nnClusterizerVerbosity < 1) { LOG(warning) << "[CF] Cluster rejected!"; } if (clusterPosInRow) { @@ -354,7 +355,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } else { input_data.clear(); - for (int element = 0; element < clusterer.nnClusterizerBatchedMode; element++) { + for (int element = 0; element < numElements; element++) { if (glo_idx + element >= clusternum) { return; } @@ -386,7 +387,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity > 3) { + if (clusterer.nnClusterizerVerbosity < 1) { LOG(warning) << "[CF] Cluster rejected!"; } if (clusterPosInRow) { @@ -416,7 +417,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } } - if (clusterer.nnClusterizerVerbosity > 4) { + if (clusterer.nnClusterizerVerbosity < 2) { LOG(info) << "[CF] Clusterization done!"; } } From 639b895e909643ad3a2e73662c5e064157c95588 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Feb 2025 10:10:51 +0100 Subject: [PATCH 25/77] Changing constexpr --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 98d979d28cf15..3402f9dab9f0b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h 
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -51,7 +51,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate } #endif - GPUhdi() CONSTEXPR static GPUDataTypes::RecoStep GetRecoStep() + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCClusterFinding; } From 3c4c5874def33f5fc7b91119343849c7000ac75d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Feb 2025 11:25:15 +0100 Subject: [PATCH 26/77] Fixing build issues --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 64 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 29 ++++----- 2 files changed, 43 insertions(+), 50 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 76b0dbee20464..60c3c6236c2fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -22,11 +22,11 @@ #include "MCLabelAccumulator.h" #endif -using namespace GPUCA_NAMESPACE::gpu; -using namespace GPUCA_NAMESPACE::gpu::tpccf; +using namespace o2::gpu; +using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCNNClusterizer::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char onlyMC) +GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY( @@ -91,7 +91,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G } template -GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int iBlock, int iThread, +GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, @@ -99,11 +99,11 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i const ChargePos* filteredPeakPositions, const GPUSettingsRec& calib, MCLabelAccumulator* labelAcc, - uint clusternum, - uint maxClusterPerRow, - uint* clusterInRow, + uint32_t clusternum, + uint32_t maxClusterPerRow, + uint32_t* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow) + uint32_t* clusterPosInRow) { uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; @@ -422,7 +422,7 @@ GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int nBlocks, int nThreads, int i } } -GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads, int iBlock, int iThread, +GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, const CfFragment& fragment, GPUSharedMemory& smem, @@ -430,13 +430,13 @@ GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads const ChargePos* filteredPeakPositions, const GPUSettingsRec& calib, MCLabelAccumulator* labelAcc, - uint clusternum, - uint maxClusterPerRow, - uint* clusterInRow, + uint32_t clusternum, + uint32_t maxClusterPerRow, + uint32_t* clusterInRow, tpc::ClusterNative* clusterByRow, - uint* clusterPosInRow) + uint32_t* clusterPosInRow) { - uint idx = get_global_id(0); + uint32_t idx = get_global_id(0); // For certain configurations dummy work items are added, so the total // number of work items is dividable by 64. 
@@ -478,7 +478,7 @@ GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads return; } - uint rowIndex = 0; + uint32_t rowIndex = 0; if (clusterByRow != nullptr) { rowIndex = sortIntoBuckets( clusterer, @@ -499,8 +499,8 @@ GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int nBlocks, int nThreads GPUdii() void GPUTPCNNClusterizer::updateClusterInner( const GPUSettingsRec& calib, - ushort lid, - ushort N, + uint16_t lid, + uint16_t N, const PackedCharge* buf, const ChargePos& pos, ClusterAccumulator* cluster, @@ -510,15 +510,14 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( uint8_t aboveThreshold = 0; GPUCA_UNROLL(U(), U()) - for (ushort i = 0; i < N; i++) { + for (uint16_t i = 0; i < N; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; PackedCharge p = buf[N * lid + i]; Charge q = cluster->updateInner(p, d); - CPU_ONLY( - labelAcc->collect(pos.delta(d), q)); + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); } @@ -529,17 +528,17 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterInner( } GPUdii() void GPUTPCNNClusterizer::updateClusterOuter( - ushort lid, - ushort N, - ushort M, - ushort offset, + uint16_t lid, + uint16_t N, + uint16_t M, + uint16_t offset, const PackedCharge* buf, const ChargePos& pos, ClusterAccumulator* cluster, MCLabelAccumulator* labelAcc) { GPUCA_UNROLL(U(), U()) - for (ushort i = offset; i < M + offset; i++) { + for (uint16_t i = offset; i < M + offset; i++) { PackedCharge p = buf[N * lid + i]; Delta2 d = cfconsts::OuterNeighbors[i]; @@ -547,8 +546,7 @@ GPUdii() void GPUTPCNNClusterizer::updateClusterOuter( Charge q = cluster->updateOuter(p, d); static_cast(q); // Avoid unused varible warning on GPU. - CPU_ONLY( - labelAcc->collect(pos.delta(d), q)); + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); } } @@ -562,7 +560,7 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( ClusterAccumulator* myCluster, MCLabelAccumulator* labelAcc) { - ushort ll = get_local_id(0); + uint16_t ll = get_local_id(0); posBcast[ll] = pos; GPUbarrier(); @@ -587,11 +585,11 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( labelAcc, innerAboveThreshold); - ushort wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; bool inGroup1 = ll < wgSizeHalf; - ushort llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + uint16_t llhalf = (inGroup1) ? 
ll : (ll - wgSizeHalf); CfUtils::condBlockLoad( chargeMap, @@ -643,9 +641,9 @@ GPUdii() void GPUTPCNNClusterizer::buildCluster( #endif } -GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint row, uint maxElemsPerBucket, uint* elemsInBucket, tpc::ClusterNative* buckets) +GPUd() uint32_t GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) { - uint index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); if (index < maxElemsPerBucket) { buckets[maxElemsPerBucket * row + index] = cluster; } else { @@ -653,4 +651,4 @@ GPUd() uint GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); } return index; -} \ No newline at end of file +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 3402f9dab9f0b..0bb830352becc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -27,7 +27,7 @@ namespace o2::tpc struct ClusterNative; } // namespace o2::tpc -namespace GPUCA_NAMESPACE::gpu +namespace o2::gpu { class ClusterAccumulator; @@ -43,31 +43,28 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; }; -#ifdef GPUCA_HAVE_O2HEADERS typedef GPUTPCClusterFinder processorType; GPUhdi() static processorType* Processor(GPUConstantMem& processors) { return processors.tpcClusterer; } -#endif GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() { return GPUDataTypes::RecoStep::TPCClusterFinding; } - template - GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer, char); + template + GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t); - static GPUd() void computeClustersImpl(int, int, int, int, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint, uint, uint*, tpc::ClusterNative*, uint*); + static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); - static GPUd() void exec(int, int, int, int, GPUSharedMemory&, processorType&, char); static int padOffset(int, int, const GPUTPCGeometry&); static int rowOffset(int, int); static bool isBoundary(int, int, int, const GPUTPCGeometry&); template - static GPUd() void nn_clusterizer(int, int, int, int, + static GPUd() void nn_clusterizer(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, @@ -75,22 +72,20 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, - uint, - uint, - uint*, + uint32_t, + uint32_t, + uint32_t*, tpc::ClusterNative*, - uint*); + uint32_t*); private: - // --------------------------------- + static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, 
MCLabelAccumulator*, uint8_t*); - static GPUd() void updateClusterInner(const GPUSettingsRec&, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); - - static GPUd() void updateClusterOuter(ushort, ushort, ushort, ushort, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); - static GPUd() uint sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint, uint, uint*, tpc::ClusterNative*); + static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); }; } // namespace GPUCA_NAMESPACE::gpu From 95bb2ff078eb991be8acb9432b2646c9c6ebcb76 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 17 Feb 2025 22:18:45 +0100 Subject: [PATCH 27/77] Major changes to make clusterizer parallelizable. Problem remains: different sizes of nnClusterizerBatchedMode lead to different number of clusters if nnClusterizerBatchedMode < clusterer.mPmemory->counters.nClusters --- Common/ML/include/ML/OrtInterface.h | 4 +- Common/ML/src/OrtInterface.cxx | 168 ++-- GPU/GPUTracking/CMakeLists.txt | 62 +- .../Global/GPUChainTrackingClusterizer.cxx | 76 +- .../TPCClusterFinder/GPUTPCCFClusterizer.h | 8 +- .../TPCClusterFinder/GPUTPCClusterFinder.h | 9 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 790 ++++++------------ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 38 +- GPU/GPUTracking/kernels.cmake | 2 +- 9 files changed, 445 insertions(+), 712 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 89631d59a3846..9c1ca3250187f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -41,6 +41,7 @@ class OrtModel OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } void init(std::unordered_map optionsMap) { reset(optionsMap); } void reset(std::unordered_map); + bool isInitialized() { return mInitialized; } virtual ~OrtModel() = default; @@ -79,6 +80,7 @@ class OrtModel std::vector> mInputShapes, mOutputShapes; // Environment settings + bool mInitialized = false; std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; @@ -89,4 +91,4 @@ class OrtModel } // namespace o2 -#endif // O2_ML_ORTINTERFACE_H +#endif // O2_ML_ORTINTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index eb124ff6f12c9..51792ac725ed6 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -44,17 +44,19 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } - modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); - allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? 
std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 2); - enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); - enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; + + if (!optionsMap["model-path"].empty()) { + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; #if defined(ORT_ROCM_BUILD) #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { @@ -81,89 +83,85 @@ void OrtModel::reset(std::unordered_map optionsMap) #endif #endif - if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; - } + if (allocateDeviceMemory) { + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + if (device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if (intraOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } } - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; - } - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + 
"/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } - } else { - (pImplOrt->sessionOptions).DisableProfiling(); - } - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - - pImplOrt->env = std::make_shared( - OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), - // Integrate ORT logging into Fairlogger - [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { - if (severity == ORT_LOGGING_LEVEL_VERBOSE) { - LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_INFO) { - LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_WARNING) { - LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_ERROR) { - LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_FATAL) { - LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else { - LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } - }, - (void*)3); - (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } + mInitialized = true; - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - - // Print names - LOG(info) << "\tInput Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + 
(pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared( + OrtLoggingLevel(loggingLevel), + (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + // Integrate ORT logging into Fairlogger + [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { + if (severity == ORT_LOGGING_LEVEL_VERBOSE) { + LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_INFO) { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_WARNING) { + LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_ERROR) { + LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_FATAL) { + LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } + }, + (void*)3); + (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); - LOG(info) << "\tOutput Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); } } @@ -301,4 +299,4 @@ std::vector OrtModel::inference reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); + if (!clusterer.nnClusterizerUseCFregression) { - std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); - if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); } else { - if (reg_model_paths.size() == 1) { - clusterer.OrtOptions["model-path"] = reg_model_paths[0]; - clusterer.model_reg_1.init(clusterer.OrtOptions); - } else { - 
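// Configuration note (illustrative): nnRegressionPath is tokenized on ':'. With a single
// entry (the default "network_reg.onnx") only model_reg_1 is initialized; a two-headed setup
// for the multi-class case is passed as a colon-separated pair, e.g. with placeholder names
//
//   nnRegressionPath = "network_reg_c1.onnx:network_reg_c2.onnx"
//
// model_reg_2 is then only evaluated when the classifier reports more than one output class.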
clusterer.OrtOptions["model-path"] = reg_model_paths[0]; - clusterer.model_reg_1.init(clusterer.OrtOptions); - clusterer.OrtOptions["model-path"] = reg_model_paths[1]; - clusterer.model_reg_2.init(clusterer.OrtOptions); - } + clusterer.OrtOptions["model-path"] = reg_model_paths[0]; + clusterer.model_reg_1.init(clusterer.OrtOptions); + clusterer.OrtOptions["model-path"] = reg_model_paths[1]; + clusterer.model_reg_2.init(clusterer.OrtOptions); } } else { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); @@ -944,7 +940,57 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Inverse sigmoid transformation clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } - runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); + + float time_clusterizer = 0, time_fill = 0; + int evalDtype = clusterer.OrtOptions["dtype"].find("32") != std::string::npos; + clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters); + + for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clusterer.nnClusterizerBatchedMode; + uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + + clusterer.peakPositions.resize(iSize); + clusterer.centralCharges.resize(iSize); + + if (evalDtype == 1) { + clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize); + } else { + clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize); + } + + auto start0 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0, batchStart); // Filling the data + auto stop0 = std::chrono::high_resolution_clock::now(); + + auto start1 = std::chrono::high_resolution_clock::now(); + GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 1, 0, batchStart); // Assigning class labels + + if (!clusterer.nnClusterizerUseCFregression) { + GPUTPCNNClusterizer::applyNetworkReg1(clusterer, evalDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 2, 0, batchStart); // Running the NN for regression class 1 + if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { + GPUTPCNNClusterizer::applyNetworkReg2(clusterer, evalDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 4, 0, batchStart); // Running the NN for regression class 2 + } + } + auto stop1 = std::chrono::high_resolution_clock::now(); + + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; + + } + + auto start1 = std::chrono::high_resolution_clock::now(); + if(clusterer.nnClusterizerUseCFregression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, -1, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + } + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 
1e9; + + if (clusterer.nnClusterizerVerbosity < 3) { + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s"; + } } else { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } @@ -954,11 +1000,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - if (!GetProcessingSettings().applyNNclusterizer) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } else { - runKernel({GetGrid(std::ceil(clusterer.mPmemory->counters.nClusters / (float)clusterer.nnClusterizerBatchedMode), lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); // Computes MC labels } if (GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h index 411c38c39459e..79f3325ed9ad2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h @@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); + private: static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 313e3e07a9097..bf2663691d19b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -156,6 +156,15 @@ class GPUTPCClusterFinder : public GPUProcessor int nnClusterizerBatchedMode = 1; int nnClusterizerVerbosity = 0; + // Memory allocation for neural network + uint class2_elements = 0; + std::vector inputData32; + std::vector inputData16; + std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2; + + std::vector peakPositions; + std::vector centralCharges; + std::unordered_map OrtOptions; OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters diff --git 
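// Overview of the batched evaluation above and of the Thread<0> dispatch implemented below
// (descriptive summary; the numbers are the "mode" kernel argument):
//   mode  0 : fillInputData       - build the flattened input tensor for one batch
//   mode  1 : class-label step    - threshold (single output) or argmax (multi-class) over
//                                   modelProbabilities, written to outputDataClass
//   mode  2 : publishClustersReg1 - convert the one-cluster regression output to ClusterNative
//   mode  4 : publishClustersReg2 - convert the two-cluster regression output (two setFull calls)
//   mode -1 : fallback to the native GPUTPCCFClusterizer regression when
//             nnClusterizerUseCFregression is set (no batching, batchStart = 0)
// The ONNX models run on the host between the kernel calls via applyNetworkClass /
// applyNetworkReg1 / applyNetworkReg2; evalDtype = 1 selects the float32 buffers
// (inputData32), evalDtype = 0 the float16 buffers (inputData16). The class cut is applied in
// logit space: nnClassThreshold is mapped through the inverse sigmoid above, e.g. 0.5 maps to
// 0.0 and 0.9 maps to log(9), roughly 2.2.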
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 60c3c6236c2fd..4aa947c5aa8c9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -13,6 +13,7 @@ /// \author Christian Sonnabend #include "GPUTPCNNClusterizer.h" +#include "GPUTPCCFClusterizer.h" #include "CfConsts.h" #include "CfUtils.h" @@ -26,24 +27,67 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) +GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart) { - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY( - MCLabelAccumulator labelAcc(clusterer)); + uint glo_idx = get_global_id(0); + if (mode == -1) { + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); + } else if (mode == 0){ + GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart); + } else if (mode == 1) { // Class labels + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { + clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); + } else { + auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]); + uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1])); + clusterer.outputDataClass[glo_idx + batchStart] = class_label; + } + } else if (mode == 2) { // Publishing for class 1 regression + if (glo_idx >= clusterer.mPmemory->counters.nClusters) { + return; + } else { + GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart); + } + } else if (mode == 3) { // Refilling for class 2 regression -> Deprecated because it needs sequential accumulation + return; + } else if (mode == 4) { // Publishing for class 2 regression + if (glo_idx >= clusterer.mPmemory->counters.nClusters) { + return; + } else { + GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart); + } + } else { + return; + } +} - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - if (clusterer.OrtOptions["dtype"].find("32") != std::string::npos) { - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); - } else if (clusterer.OrtOptions["dtype"].find("16") != std::string::npos) { - GPUTPCNNClusterizer::nn_clusterizer(nBlocks, nThreads, iBlock, iThread, clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +void GPUTPCNNClusterizer::applyNetworkClass(processorType& clusterer, int8_t dtype, uint batch_idx) { + if(dtype == 0){ + clusterer.modelProbabilities = clusterer.model_class.inference(clusterer.inputData16); + } else { + clusterer.modelProbabilities = clusterer.model_class.inference(clusterer.inputData32); + } +} + +void GPUTPCNNClusterizer::applyNetworkReg1(processorType& clusterer, int8_t dtype) { + if(dtype == 0){ + clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData16); } else { - LOG(fatal) << "Unsupported data type for neural network clusterizer!"; + clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData32); + } +} + +void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtype) { + if(dtype == 0){ + clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData16); + } else { + clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData32); } - // tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - // - // GPUTPCNNClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) @@ -62,11 +106,6 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row <= 62) { - // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)) / 2) { - // return true; - // } else { - // return false; - // } if (pad < 0 || pad > geo.NPads(row)) { return true; } else { @@ -75,11 +114,6 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { - // if (pad < (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] - geo.NPads(row)- global_shift]) / 2 || pad > (geo.NPads(o2):tpc::constants::MAXGLOBALPADROW-1] + geo.NPads(row)- global_shift]) / 2) { - // return true; - // } else { - // return false; - // } if (pad < 0 || pad > geo.NPads(row)) { return true; } else { @@ -90,565 +124,223 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G } } -template -GPUd() void GPUTPCNNClusterizer::nn_clusterizer(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) +// --------------------------------- +GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, int8_t dtype, uint batchStart) { - uint glo_idx = get_global_id(0) * clusterer.nnClusterizerBatchedMode; - if (glo_idx >= clusternum) { - return; - } + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - uint numElements = CAMath::Min(glo_idx + clusterer.nnClusterizerBatchedMode, clusternum - glo_idx); - std::vector central_charges(numElements, -1.f); - std::vector input_data(clusterer.nnClusterizerElementSize * numElements, (T)-1.f); - std::vector peak_positions(numElements); - unsigned int write_idx = 0; - - for (int batch_counter = 0; batch_counter < numElements; batch_counter++) { - - uint cls = CAMath::Min(glo_idx + batch_counter, clusternum - 1); - - ChargePos peak = clusterer.mPfilteredPeakPositions[cls]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); - - peak_positions[batch_counter] = peak; - central_charges[batch_counter] = central_charge; - - // unsigned int batch_offset = batch_counter * clusterer.nnClusterizerElementSize; - for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { - bool push_mc_label = (r == 0); - int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); - for (int p = -clusterer.nnClusterizerSizeInputPad; p <= clusterer.nnClusterizerSizeInputPad; p++) { - push_mc_label &= (std::abs(p) < 2); // Use inner 5x5 window - bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p + pad_offset, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { - push_mc_label &= (std::abs(t) < 2); // Use inner 5x5 window - if (!is_boundary) { - ChargePos tmp_pos(row + r, pad + p + pad_offset, time + t); - input_data[write_idx] = (T)(chargeMap[tmp_pos].unpack() / central_charge); - if (push_mc_label) { - ChargePos tmp_pos_mc(row, pad + p, time + t); - CPU_ONLY(labelAcc->collect(tmp_pos, chargeMap[tmp_pos_mc].unpack())); - } + uint glo_idx = get_global_id(0); + // SHouldn't be needed + // if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) + // { + 
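// Layout of the flattened input filled below (descriptive note): each candidate peak owns
// nnClusterizerElementSize consecutive entries starting at glo_idx * nnClusterizerElementSize.
// The loops run over the (row, pad, time) window with time as the innermost index, every
// charge is divided by the charge of the central pad, and positions flagged by isBoundary()
// are skipped, so the corresponding entry keeps the buffer's initialization value and marks
// "outside the pad plane" for the network. If nnClusterizerAddIndexData is set, three
// normalized indices (sector/36, row/152, pad/NPads(row)) are appended per element.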
// return; + // } + + uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // For optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + int row = peak.row(), pad = peak.pad(), time = peak.time(); + float central_charge = chargeMap[peak].unpack(); + + clusterer.peakPositions[glo_idx] = peak; + clusterer.centralCharges[glo_idx] = central_charge; + + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { + int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p, time + t); + if(dtype == 0){ + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)((float)chargeMap[tmp_pos].unpack() / central_charge); + } else { + clusterer.inputData32[write_idx] = (float)chargeMap[tmp_pos].unpack() / central_charge; } - write_idx++; } + write_idx++; } } - if (clusterer.nnClusterizerAddIndexData) { - input_data[write_idx] = (T)(clusterer.mISlice / 36.f); - input_data[write_idx + 1] = (T)(row / 152.f); - input_data[write_idx + 2] = (T)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); - write_idx += 3; - // if(idx == 100){ - // LOG(info) << "[" << input_data[input_data.size()-3] << ", " << input_data[input_data.size()-2] << ", " << input_data[input_data.size()-1] << "]"; - // } - } } - - std::vector index_class_2; - std::vector out_class = clusterer.model_class.inference(input_data); - // LOG(info) << "input_data.size(): " << input_data.size() << "; write_idx: " << write_idx << "; out_class.size(): " << out_class.size(); - int num_output_classes = clusterer.model_class.getNumOutputNodes()[0][1]; - - if (num_output_classes > 1) { - std::vector tmp_out_class(numElements); - for (int cls_idx = 0; cls_idx < numElements; cls_idx++) { - auto elem_iterator = out_class.begin() + (unsigned int)(cls_idx * num_output_classes); - tmp_out_class[cls_idx] = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + num_output_classes)) - 1; // -1 since 2-class classifier will have 3 outputs: classes 0, 1, 2 - if (tmp_out_class[cls_idx] > 1) { - index_class_2.push_back(cls_idx); - } + if (clusterer.nnClusterizerAddIndexData) { + if(dtype == 0){ + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); + clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + } else { + clusterer.inputData32[write_idx] = clusterer.mISlice / 36.f; + clusterer.inputData32[write_idx + 1] = row / 152.f; + clusterer.inputData32[write_idx + 2] = (float)pad / clusterer.Param().tpcGeometry.NPads(row); } - out_class = tmp_out_class; } +} - if (!clusterer.nnClusterizerUseCFregression) { +// --------------------------------- +GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, 
GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart) +{ + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clusterer.model_reg_1.getNumOutputNodes()[0][1]; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.model_reg_1.getNumOutputNodes()[0][1] << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + + if (clusterer.outputDataClass[full_glo_idx] == 1) { + + ClusterAccumulator pc; + + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clusterer.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } - std::vector out_reg = clusterer.model_reg_1.inference(input_data), tmp_out_reg_2; - if (index_class_2.size() > 0) { - std::vector tmp_in_reg_2(index_class_2.size() * clusterer.nnClusterizerElementSize); - int fill_counter = 0; - for (int cls_idx : index_class_2) { - int from_idx = cls_idx * clusterer.nnClusterizerElementSize, to_idx = fill_counter * clusterer.nnClusterizerElementSize; - for (int reg_idx = 0; reg_idx < clusterer.nnClusterizerElementSize; reg_idx++) { - tmp_in_reg_2[to_idx + reg_idx] = input_data[from_idx + reg_idx]; - } - fill_counter++; + if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } - tmp_out_reg_2 = clusterer.model_reg_2.inference(input_data); + return; } - input_data.clear(); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); - if ((clusterer.nnClusterizerVerbosity < 1) && glo_idx == 0) { - LOG(info) << "[CF] Classification model: " << out_class[0] << " (>? 
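// Output convention of the one-cluster regression head as consumed by the setFull call above
// (descriptive note; five values per candidate):
//   out[0] : pad offset relative to the peak pad
//   out[1] : time offset relative to the peak time (added on top of the fragment start)
//   out[2] : sigma of the pad coordinate
//   out[3] : sigma of the time coordinate
//   out[4] : total-charge scale, multiplied by the central-pad charge to obtain qtot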
" << clusterer.nnClassThreshold << ")"; - LOG(info) << "[CF] Regression model: " << out_reg[0] << "; " << out_reg[1] << "; " << out_reg[2] << "; " << out_reg[3] << "; " << out_reg[4]; - } - - int num_outputs_1 = clusterer.model_reg_1.getNumOutputNodes()[0][1], num_outputs_2 = 0, counter_class_2_idcs = 0; - if (num_output_classes > 1) { - num_outputs_2 = clusterer.model_reg_2.getNumOutputNodes()[0][1]; + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; } - for (int element = 0; element < numElements; element++) { - - if (glo_idx + element >= clusternum) { - return; + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clusterer.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + } +} - int model_output_index = element * num_outputs_1; - if (out_class[element] > clusterer.nnClassThreshold) { - if ((num_output_classes == 1) || ((num_output_classes > 1) && (out_class[element] < 2))) { - // CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - - // Dummy build to push MC labels - buildCluster( - calib, - chargeMap, - peak_positions[element], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - pc.setFull(central_charges[element] * out_reg[model_output_index + 4], peak_positions[element].pad() + out_reg[model_output_index + 0], out_reg[model_output_index + 2], fragment.start + peak_positions[element].time() + out_reg[model_output_index + 1], out_reg[model_output_index + 3], 0, 0); - // LOG(info) << "Example: " << num_outputs_1 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 1) { - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - 
CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } else { - model_output_index = index_class_2[counter_class_2_idcs] * num_outputs_2; - counter_class_2_idcs++; - - // Cluster 1 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - ClusterAccumulator pc; - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 8], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 4], tmp_out_reg_2[model_output_index + 2], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 2], tmp_out_reg_2[model_output_index + 6], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 1) { - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - - // Cluster 2 - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - pc.setFull(central_charges[element] * tmp_out_reg_2[model_output_index + 9], peak_positions[element].pad() + tmp_out_reg_2[model_output_index + 1], tmp_out_reg_2[model_output_index + 5], fragment.start + peak_positions[element].time() + tmp_out_reg_2[model_output_index + 3], tmp_out_reg_2[model_output_index + 7], 0, 0); - // LOG(info) << "Example: " << num_outputs_2 << " " << out_reg.size() << ";; " << out_reg[model_output_index + 4] << "; " << out_reg[model_output_index + 0] << "; " << out_reg[model_output_index + 2] << "; " << out_reg[model_output_index + 1] << "; " << out_reg[model_output_index + 3]; - rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 1) { - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } +// --------------------------------- +GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart) +{ + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clusterer.model_reg_2.getNumOutputNodes()[0][1]; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.model_reg_1.getNumOutputNodes()[0][1] << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + + if (clusterer.outputDataClass[full_glo_idx] > 0) { + + ClusterAccumulator pc; + + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clusterer.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } - rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); - } + if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } + return; } - } else { + // Cluster 1 + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 8], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg2[model_output_index], clusterer.outputDataReg2[model_output_index + 4], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg2[model_output_index + 2], clusterer.outputDataReg2[model_output_index + 6], 0, 0); - input_data.clear(); - for (int element = 0; element < numElements; element++) { - if (glo_idx + element >= clusternum) { - return; + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity < 2) { + LOG(warning) << "[NN, CF] Cluster rejected!"; } - - if (out_class[element] > clusterer.nnClassThreshold) { - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(peak_positions[element], central_charges[element])); - - buildCluster( - calib, - chargeMap, - peak_positions[element], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (fragment.isOverlap(peak_positions[element].time())) { - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - pc.finalize(peak_positions[element], central_charges[element], fragment.start, clusterer.Param().tpcGeometry); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(peak_positions[element], central_charges[element], myCluster, clusterer.Param()); - - if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 1) { - LOG(warning) << "[CF] Cluster rejected!"; - } - if (clusterPosInRow) { - clusterPosInRow[glo_idx + element] = maxClusterPerRow; - } - continue; - } - - uint rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - peak_positions[element].row(), - maxClusterPerRow, - 
clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[glo_idx + element] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[glo_idx + element]; - } - - CPU_ONLY(labelAcc->commit(peak_positions[element].row(), rowIndex, maxClusterPerRow)); + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } + return; } - } - if (clusterer.nnClusterizerVerbosity < 2) { - LOG(info) << "[CF] Clusterization done!"; - } -} - -GPUdii() void GPUTPCNNClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) -{ - uint32_t idx = get_global_id(0); - - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. - ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; - Charge charge = chargeMap[pos].unpack(); - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(pos, charge)); - - buildCluster( - calib, - chargeMap, - pos, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (idx >= clusternum) { - return; - } - if (fragment.isOverlap(pos.time())) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clusterer.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - return; - } - pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); + CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + // Cluster 2 + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 9], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg2[model_output_index + 1], clusterer.outputDataReg2[model_output_index + 5], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg2[model_output_index + 3], clusterer.outputDataReg2[model_output_index + 7], 0, 0); - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; + rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.nnClusterizerVerbosity < 2) { + LOG(warning) << "[NN, CF] Cluster rejected!"; + } + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; } - return; - } - uint32_t rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - 
maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clusterer.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; + // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? } - - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); -} - -GPUdii() void GPUTPCNNClusterizer::updateClusterInner( - const GPUSettingsRec& calib, - uint16_t lid, - uint16_t N, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc, - uint8_t* innerAboveThreshold) -{ - uint8_t aboveThreshold = 0; - - GPUCA_UNROLL(U(), U()) - for (uint16_t i = 0; i < N; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - - PackedCharge p = buf[N * lid + i]; - - Charge q = cluster->updateInner(p, d); - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - - aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); - } - - innerAboveThreshold[lid] = aboveThreshold; - - GPUbarrier(); -} - -GPUdii() void GPUTPCNNClusterizer::updateClusterOuter( - uint16_t lid, - uint16_t N, - uint16_t M, - uint16_t offset, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc) -{ - GPUCA_UNROLL(U(), U()) - for (uint16_t i = offset; i < M + offset; i++) { - PackedCharge p = buf[N * lid + i]; - - Delta2 d = cfconsts::OuterNeighbors[i]; - - Charge q = cluster->updateOuter(p, d); - static_cast(q); // Avoid unused varible warning on GPU. - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - } -} - -GPUdii() void GPUTPCNNClusterizer::buildCluster( - const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, - PackedCharge* buf, - uint8_t* innerAboveThreshold, - ClusterAccumulator* myCluster, - MCLabelAccumulator* labelAcc) -{ - uint16_t ll = get_local_id(0); - - posBcast[ll] = pos; - GPUbarrier(); - - CfUtils::blockLoad( - chargeMap, - SCRATCH_PAD_WORK_GROUP_SIZE, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 8, - cfconsts::InnerNeighbors, - posBcast, - buf); - updateClusterInner( - calib, - ll, - 8, - buf, - pos, - myCluster, - labelAcc, - innerAboveThreshold); - - uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; - - bool inGroup1 = ll < wgSizeHalf; - - uint16_t llhalf = (inGroup1) ? 
ll : (ll - wgSizeHalf); - - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast, - innerAboveThreshold, - buf); - - if (inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } - -#if defined(GPUCA_GPUCODE) - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast + wgSizeHalf, - innerAboveThreshold + wgSizeHalf, - buf); - if (!inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } -#endif -} - -GPUd() uint32_t GPUTPCNNClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) -{ - uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); - if (index < maxElemsPerBucket) { - buckets[maxElemsPerBucket * row + index] = cluster; - } else { - clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISlice * 1000 + row, index, maxElemsPerBucket); - CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); - } - return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0bb830352becc..3a54a93964040 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -38,6 +38,7 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate public: static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); struct GPUSharedMemory { + // Regular cluster finder ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; @@ -54,38 +55,27 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate return GPUDataTypes::RecoStep::TPCClusterFinding; } + // Float16 inmplementation template - GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t); + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, int8_t = 0, uint = 0); - static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint); - static int padOffset(int, int, const GPUTPCGeometry&); - static int rowOffset(int, int); - static bool isBoundary(int, int, int, const GPUTPCGeometry&); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint); - template - static GPUd() void nn_clusterizer(int32_t, int32_t, int32_t, int32_t, - processorType&, - const CfFragment&, - GPUSharedMemory&, - const Array2D&, - const ChargePos*, - const GPUSettingsRec&, - MCLabelAccumulator*, - uint32_t, - uint32_t, - uint32_t*, - tpc::ClusterNative*, - uint32_t*); + static void applyNetworkClass(processorType&, 
int8_t = 0, uint = 0); - private: - static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); + static void applyNetworkReg1(processorType&, int8_t = 0); - static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); + static void applyNetworkReg2(processorType&, int8_t = 0); - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + private: - static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); }; } // namespace GPUCA_NAMESPACE::gpu diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 162bf0bf774c3..b3124897e398f 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -110,7 +110,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) -o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t mode int8_t onlyMC uint btachStart) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From 857f27de7b63afcee4cdfb2907ed0e9df364a1f3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 19 Feb 2025 22:22:54 +0100 Subject: [PATCH 28/77] Adjusting for default CF regression --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 8 ++++---- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 3 +++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index be4cbe0238c66..1ae0335a3808d 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -315,6 +315,7 @@ AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the N AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN") AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") +AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters 
will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 49742ece6711d..6c18ac8ec6dbc 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -943,7 +943,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) float time_clusterizer = 0, time_fill = 0; int evalDtype = clusterer.OrtOptions["dtype"].find("32") != std::string::npos; - clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters); + clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clusterer.nnClusterizerBatchedMode; @@ -953,9 +953,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.centralCharges.resize(iSize); if (evalDtype == 1) { - clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize); + clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, GetProcessingSettings().nnClusterizerBoundaryFillValue); } else { - clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize); + clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, GetProcessingSettings().nnClusterizerBoundaryFillValue); } auto start0 = std::chrono::high_resolution_clock::now(); @@ -989,7 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if (clusterer.nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 4aa947c5aa8c9..41938afb4a833 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -31,6 +31,9 @@ GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, { uint glo_idx = get_global_id(0); if (mode == -1) { + if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + return; + } Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; From 89c0105a3afd658b5b1f40cc787b9003ffb31e98 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 20 Feb 2025 15:24:45 +0100 Subject: [PATCH 29/77] Bug-fix for application of CF regression and logging message --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6c18ac8ec6dbc..b6dd3ddd67f18 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -953,9 +953,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.centralCharges.resize(iSize); if (evalDtype == 1) { - clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, GetProcessingSettings().nnClusterizerBoundaryFillValue); + clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(GetProcessingSettings().nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, GetProcessingSettings().nnClusterizerBoundaryFillValue); + clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)GetProcessingSettings().nnClusterizerBoundaryFillValue)); } auto start0 = std::chrono::high_resolution_clock::now(); @@ -989,7 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if (clusterer.nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); From 45d807167d5dcbf4b96e1b478307fd3494d43eda Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 22 Feb 2025 19:29:46 +0100 Subject: [PATCH 30/77] Adding is_boundary check earlier to avoid out-of-bounds access --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 5 +++-- GPU/GPUTracking/kernels.cmake | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 41938afb4a833..76002e44f0b16 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -151,9 +151,10 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { - int pad_offset = GPUTPCNNClusterizer::padOffset(row, row + r, 
clusterer.Param().tpcGeometry); + bool is_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int pad_offset = is_boundary ? 0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + is_boundary = is_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index b3124897e398f..562715eff2fc6 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -110,7 +110,7 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) -o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t mode int8_t onlyMC uint btachStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t mode int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From 984857e30594a11551d10f18470c317ac2fa259c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Feb 2025 19:04:34 +0100 Subject: [PATCH 31/77] Bug-fixes for boundary reading --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 ++- .../Global/GPUChainTrackingClusterizer.cxx | 6 +++++- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 16 +++++++++++----- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 1ae0335a3808d..81c46e034cbf8 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -306,7 +306,7 @@ AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify t AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. 
Can be greater than 1!") AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") -AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If mmInferenceOrtProfiling is set, the path to store the profiling data") +AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input") AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2") @@ -316,6 +316,7 @@ AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") +AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b6dd3ddd67f18..fd2dde28bf962 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -931,7 +931,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.OrtOptions["model-path"] = reg_model_paths[1]; clusterer.model_reg_2.init(clusterer.OrtOptions); } - } else { + } + + if (clusterer.nnClusterizerUseCFregression || (int)(GetProcessingSettings().nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } @@ -992,6 +994,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 76002e44f0b16..22fe48300e402 100644 --- 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -108,16 +108,16 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G { if (pad < 0 || row < 0) { // Faster short-circuit return true; - } else if (row <= 62) { - if (pad < 0 || pad > geo.NPads(row)) { + } else if (row < 63) { + if (pad >= geo.NPads(row)) { return true; } else { return false; } - } else if (row <= 62 + global_shift) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { - if (pad < 0 || pad > geo.NPads(row)) { + if (pad >= geo.NPads(row - global_shift)) { return true; } else { return false; @@ -156,7 +156,13 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { is_boundary = is_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { - if (!is_boundary) { + if (is_boundary) { + if(dtype == 0){ + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue); + } else { + clusterer.inputData32[write_idx] = (float)clusterer.nnClusterizerBoundaryFillValue; + } + } else { ChargePos tmp_pos(row + r, pad + p, time + t); if(dtype == 0){ clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)((float)chargeMap[tmp_pos].unpack() / central_charge); From 57862a6bc547cd097328edad0b2249a510e445a9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Feb 2025 10:59:03 +0100 Subject: [PATCH 32/77] Updating to use explicit calls to kernels instead of if-statements --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 19 ++-- .../TPCClusterFinder/GPUTPCClusterFinder.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 86 +++++++++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 18 ++-- GPU/GPUTracking/kernels.cmake | 7 +- 6 files changed, 83 insertions(+), 49 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 81c46e034cbf8..99fcdc3b3f4f4 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -316,6 +316,7 @@ AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") +AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSupression kernel before the digits to the network are filled") AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the 
digits to the network are filled") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fd2dde28bf962..6b711a91752a3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -896,6 +896,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; + clusterer.nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue; if (GetProcessingSettings().nnClusterizerVerbosity < 0){ clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; } else { @@ -955,25 +956,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.centralCharges.resize(iSize); if (evalDtype == 1) { - clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(GetProcessingSettings().nnClusterizerBoundaryFillValue)); + clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(clusterer.nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)GetProcessingSettings().nnClusterizerBoundaryFillValue)); + clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue)); } auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 1, 0, batchStart); // Assigning class labels + if (clusterer.model_class.getNumOutputNodes()[0][1] > 1){ + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels + } if (!clusterer.nnClusterizerUseCFregression) { GPUTPCNNClusterizer::applyNetworkReg1(clusterer, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 2, 0, batchStart); // Running the NN for regression class 1 + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { 
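            // Second regression stage: reached only when the classification head has more than
            // one output node and a second regression model path is configured. model_reg_2 is
            // assumed to predict two clusters per accepted peak; publishClustersReg2 unpacks its
            // output as pad offsets [0]/[1], time offsets [2]/[3], sigma_pad [4]/[5],
            // sigma_time [6]/[7] and charge ratios (relative to the central charge) [8]/[9].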
GPUTPCNNClusterizer::applyNetworkReg2(clusterer, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 4, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -985,7 +990,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); if(clusterer.nnClusterizerUseCFregression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, -1, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index bf2663691d19b..f7d7cfe9cf234 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -155,6 +155,7 @@ class GPUTPCClusterFinder : public GPUProcessor int nnClusterizerUseCFregression = 0; int nnClusterizerBatchedMode = 1; int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; // Memory allocation for neural network uint class2_elements = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 22fe48300e402..f163fde2151dc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -27,45 +27,59 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCNNClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (mode == -1) { - if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices - return; - } - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; - GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); - } else if (mode == 0){ - GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart); - } else if (mode == 1) { // Class labels - if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { - clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); - } else { - auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]); - uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1])); - clusterer.outputDataClass[glo_idx + batchStart] = class_label; - } - } else if (mode == 2) { // Publishing for class 1 regression - if (glo_idx >= clusterer.mPmemory->counters.nClusters) { - return; - } else { - GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart); - } - } else if (mode == 3) { // Refilling for class 2 regression -> Deprecated because it needs sequential accumulation + if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices return; - } else if (mode == 4) { // Publishing for class 2 regression - if (glo_idx >= clusterer.mPmemory->counters.nClusters) { - return; - } else { - GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, mode, onlyMC, batchStart); - } - } else { + } + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +} + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]); + uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1])); + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= clusterer.mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, onlyMC, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= clusterer.mPmemory->counters.nClusters) { return; } + GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, onlyMC, batchStart); } @@ -188,7 +202,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads } // --------------------------------- -GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t mode, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); @@ -255,7 +269,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo } // --------------------------------- -GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t 
mode, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 3a54a93964040..cc897e1fc7ed6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -55,14 +55,22 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate return GPUDataTypes::RecoStep::TPCClusterFinding; } - // Float16 inmplementation - template - GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, int8_t = 0, uint = 0); + enum K : int32_t { + runCfClusterizer = 0, + fillInputNN = 1, + determineClass1Labels = 2, + determineClass2Labels = 3, + publishClass1Regression = 4, + publishClass2Regression = 5, + }; + + template + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, uint = 0, Args...); static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint); - static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint); - static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); static void applyNetworkClass(processorType&, int8_t = 0, uint = 0); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 562715eff2fc6..b584602c6f0de 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -110,7 +110,12 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) -o2_gpu_add_kernel("GPUTPCNNClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t mode int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, 
flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From c55cfc2bdd27e50fbf8ca40bd8b3b11a1fba781c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 26 Feb 2025 10:00:21 +0100 Subject: [PATCH 33/77] Bug-fix for class label application --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6b711a91752a3..0db6c9a05b9a6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -967,7 +967,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype); - if (clusterer.model_class.getNumOutputNodes()[0][1] > 1){ + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1){ runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels From 0125c2a9214e74fe434296d77a2c34388caca8a2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 26 Feb 2025 23:28:17 +0100 Subject: [PATCH 34/77] Explicit casting solves regression issues. To be done: Correct publishing for class2 regression --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index f163fde2151dc..9c9492c7d359a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -123,19 +123,11 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { - if (pad >= geo.NPads(row)) { - return true; - } else { - return false; - } + return (pad >= static_cast(geo.NPads(row))) } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { - if (pad >= geo.NPads(row - global_shift)) { - return true; - } else { - return false; - } + return (pad >= static_cast(geo.NPads(row - global_shift))); } else { return true; } @@ -148,40 +140,35 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); uint glo_idx = get_global_id(0); - // SHouldn't be needed - // if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) - // { - // return; - // } uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // For optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; - int row = peak.row(), pad = peak.pad(), time = peak.time(); - float central_charge = chargeMap[peak].unpack(); + int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); + float central_charge = static_cast(chargeMap[peak].unpack()); clusterer.peakPositions[glo_idx] = peak; clusterer.centralCharges[glo_idx] = central_charge; int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { - bool is_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - int pad_offset = is_boundary ? 0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { - is_boundary = is_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { - if (is_boundary) { + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p, time + t); if(dtype == 0){ - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue); + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else { - clusterer.inputData32[write_idx] = (float)clusterer.nnClusterizerBoundaryFillValue; + clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { - ChargePos tmp_pos(row + r, pad + p, time + t); if(dtype == 0){ - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)((float)chargeMap[tmp_pos].unpack() / central_charge); + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData32[write_idx] = (float)chargeMap[tmp_pos].unpack() / central_charge; + clusterer.inputData32[write_idx] = static_cast(clusterer.nnClusterizerBoundaryFillValue); } } write_idx++; @@ -192,11 +179,11 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads if(dtype == 0){ clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)((float)pad / clusterer.Param().tpcGeometry.NPads(row)); + clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); } else { clusterer.inputData32[write_idx] = clusterer.mISlice / 36.f; clusterer.inputData32[write_idx + 1] = row / 152.f; - clusterer.inputData32[write_idx + 2] = (float)pad / clusterer.Param().tpcGeometry.NPads(row); + clusterer.inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); } } } @@ -238,7 +225,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo return; } - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], static_castclusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], static_cast(clusterer.mPmemory->fragment).start + static_castclusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); 
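    // Regression output as consumed above: index [0] = pad offset, [1] = time offset,
    // [2] and [3] = (presumably) pad and time widths, [4] = charge scale relative to the
    // central charge. Pad and time are offsets from the peak position; the fragment start
    // is added to place the time in the full time frame before the cluster is converted
    // with toNative() below.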
tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); From 408787d45f06b1106e646141c2ce37d94f5cf9f3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Feb 2025 00:35:51 +0100 Subject: [PATCH 35/77] Bug-fixes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 9c9492c7d359a..e2d7be3b169e2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -123,7 +123,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))) + return (pad >= static_cast(geo.NPads(row))); } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network return true; } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { @@ -225,7 +225,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo return; } - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], static_castclusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], static_cast(clusterer.mPmemory->fragment).start + static_castclusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); From e830697fc61b99d9b3d8a5b0d8395a6b93d000e2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 5 Mar 2025 14:37:46 +0100 Subject: [PATCH 36/77] Adding some documentation --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 26 ++++++++++++++----- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 --- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index e2d7be3b169e2..13825b17848f7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -26,6 +26,7 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; +// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { @@ -58,7 
+59,7 @@ GPUdii() void GPUTPCNNClusterizer::Thread 62 ? global_shift : 0); } -// --------------------------------- bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) { if (pad < 0 || row < 0) { // Faster short-circuit @@ -133,7 +134,7 @@ bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const G } } -// --------------------------------- +// Filling the input data for the neural network where there is no boundary GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, int8_t dtype, uint batchStart) { @@ -141,16 +142,17 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads uint glo_idx = get_global_id(0); - uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // For optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; - int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); + int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); clusterer.peakPositions[glo_idx] = peak; clusterer.centralCharges[glo_idx] = central_charge; int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + GPUCA_UNROLL(U(), U()); for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); int pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); @@ -165,6 +167,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { + // Filling boundary just to make sure that no values are left unintentionally if(dtype == 0){ clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); } else { @@ -188,7 +191,6 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads } } -// --------------------------------- GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -204,6 +206,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo ClusterAccumulator pc; + // Publishing logic is taken from default clusterizer if (onlyMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); @@ -252,10 +255,14 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; } } -// --------------------------------- GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -353,5 +360,10 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? 
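  // The else branch added below mirrors the rejection path introduced for publishClustersReg1 above:
  // when a candidate is not published, mPclusterPosInRow[full_glo_idx] is set to mNMaxClusterPerRow,
  // which is assumed to act as an "invalid" marker so downstream code can skip the missing cluster.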
+ } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; } } \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index cc897e1fc7ed6..14fe29398843a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -68,14 +68,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, uint = 0, Args...); static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint); - static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); static void applyNetworkClass(processorType&, int8_t = 0, uint = 0); - static void applyNetworkReg1(processorType&, int8_t = 0); - static void applyNetworkReg2(processorType&, int8_t = 0); From 1ca9fa0155a1cf36c2c25a44bb27598d9f064bf2 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 5 Mar 2025 13:38:21 +0000 Subject: [PATCH 37/77] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 139 +++++++++--------- .../Global/GPUChainTrackingClusterizer.cxx | 13 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 22 +-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 12 +- 4 files changed, 92 insertions(+), 94 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 51792ac725ed6..e5c784a31f6de 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -44,7 +44,7 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } - + if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? 
optionsMap["device"] : "CPU"); @@ -83,85 +83,84 @@ void OrtModel::reset(std::unordered_map optionsMap) #endif #endif - if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; - } + if (allocateDeviceMemory) { + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; + } - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; - } + if (device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + if (intraOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; } + } - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); - } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; - (pImplOrt->sessionOptions).DisableProfiling(); - } + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } + } else { + (pImplOrt->sessionOptions).DisableProfiling(); + } - mInitialized = true; - - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - - pImplOrt->env = std::make_shared( - OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), - // Integrate ORT logging into Fairlogger - [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { - if (severity == ORT_LOGGING_LEVEL_VERBOSE) { - LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_INFO) { - LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_WARNING) { - LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_ERROR) { - LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if (severity == ORT_LOGGING_LEVEL_FATAL) { - LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else { - LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } - }, - (void*)3); - (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); - - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } + mInitialized = true; + + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + pImplOrt->env = std::make_shared( + OrtLoggingLevel(loggingLevel), + (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + // Integrate ORT logging into Fairlogger + [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { + if (severity == ORT_LOGGING_LEVEL_VERBOSE) { + LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_INFO) { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_WARNING) { + LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_ERROR) { + LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if (severity == ORT_LOGGING_LEVEL_FATAL) { + LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } + }, + (void*)3); + (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); } } diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0db6c9a05b9a6..6c81dc8997a12 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -897,7 +897,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 
3 : 0); clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; clusterer.nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue; - if (GetProcessingSettings().nnClusterizerVerbosity < 0){ + if (GetProcessingSettings().nnClusterizerVerbosity < 0) { clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; } else { clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; @@ -933,7 +933,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.model_reg_2.init(clusterer.OrtOptions); } } - + if (clusterer.nnClusterizerUseCFregression || (int)(GetProcessingSettings().nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); @@ -943,12 +943,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Inverse sigmoid transformation clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } - + float time_clusterizer = 0, time_fill = 0; int evalDtype = clusterer.OrtOptions["dtype"].find("32") != std::string::npos; clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); - for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clusterer.nnClusterizerBatchedMode; uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); @@ -967,7 +967,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype); - if (clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + if (clusterer.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels @@ -985,11 +985,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; - } auto start1 = std::chrono::high_resolution_clock::now(); - if(clusterer.nnClusterizerUseCFregression) { + if (clusterer.nnClusterizerUseCFregression) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 13825b17848f7..dd4eb97421f06 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -83,25 +83,27 @@ GPUdii() void GPUTPCNNClusterizer::Thread(clusterer.inputData16); } else { 
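    // Fallback: any dtype other than 0 routes the inference through the fp32 input buffer. The
    // selector is derived in the chain code from the nnInferenceDtype option ("fp16" vs "fp32"),
    // and the same dispatch pattern is repeated for the two regression models below.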
clusterer.modelProbabilities = clusterer.model_class.inference(clusterer.inputData32); } } -void GPUTPCNNClusterizer::applyNetworkReg1(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ +void GPUTPCNNClusterizer::applyNetworkReg1(processorType& clusterer, int8_t dtype) +{ + if (dtype == 0) { clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData16); } else { clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData32); } } -void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ +void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtype) +{ + if (dtype == 0) { clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData16); } else { clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData32); @@ -161,14 +163,14 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); - if(dtype == 0){ + if (dtype == 0) { clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else { clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally - if(dtype == 0){ + if (dtype == 0) { clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); } else { clusterer.inputData32[write_idx] = static_cast(clusterer.nnClusterizerBoundaryFillValue); @@ -179,7 +181,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads } } if (clusterer.nnClusterizerAddIndexData) { - if(dtype == 0){ + if (dtype == 0) { clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 14fe29398843a..e1485c80429aa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -75,14 +75,12 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static void applyNetworkReg1(processorType&, int8_t = 0); static void applyNetworkReg2(processorType&, int8_t = 0); - - private: - - static int padOffset(int, int, const GPUTPCGeometry&); - static int rowOffset(int, int); - static bool isBoundary(int, int, int, const GPUTPCGeometry&); + private: + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); }; -} // namespace GPUCA_NAMESPACE::gpu +} // namespace o2::gpu #endif \ No newline at end of file From 815cc3024431a254fe15fea0e735a05057ea6349 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 5 Mar 2025 19:29:49 +0100 Subject: [PATCH 38/77] Modifying for Davids comments --- GPU/GPUTracking/CMakeLists.txt | 3 +- GPU/GPUTracking/Definitions/GPUSettingsList.h | 53 ++++++++++-------- .../Global/GPUChainTrackingClusterizer.cxx | 55 +++++++++---------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 29 
++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 25 --------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 5 +- 6 files changed, 62 insertions(+), 108 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 6ae409b3549bc..8654d007f6b2e 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -158,8 +158,7 @@ set(HDRS_INSTALL ) set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) -set(SRCS_NO_CINT - ${SRCS_NO_CINT} +set(SRCS_NO_CINT ${SRCS_NO_CINT} Global/GPUChainITS.cxx ITS/GPUITSFitter.cxx ITS/GPUITSFitterKernels.cxx diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 99fcdc3b3f4f4..bf69393a0632b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -221,6 +221,34 @@ AddOption(tpcTriggerHandling, bool, true, "", 0, "Enable TPC trigger handling") AddHelp("help", 'h') EndConfig() +BeginSubConfig(GPUSettingsProcessingNNclusterizer, nn, configStandalone.proc, "NN", 0, "Processing settings for neural network clusterizer", proc_nn) +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") +AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") +AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") +AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network") +AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. 
Can be greater than 1!")
+AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
+AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
+AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >=3: All debugs")
+AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input")
+AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)")
+AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
+AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
+AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
+AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
+AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
+AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
+AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
+AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
+AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
+AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function.
This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).") +AddHelp("help", 'h') +EndConfig() + BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc) AddOption(platformNum, int32_t, -1, "", 0, "Platform to use, in case the backend provides multiple platforms (OpenCL only, -1 = auto-select)") AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)")
@@ -298,30 +326,7 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param)
-AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.")
-AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
-AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
-AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
-AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
-AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network")
-AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!")
-AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
-AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
-AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs")
-AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad), should be appended to the input")
-AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calcualted as (length-1)/2")
-AddOption(nnClusterizerUseCFregression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN")
-AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable")
-AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
-AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
-AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSupression kernel before the digits to the network are filled")
-AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled")
-AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
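// Worked example for the nnClusterizerSizeInput* half-widths above, assuming the defaults of 3:
// the charge window spans (2*3+1) rows x (2*3+1) pads x (2*3+1) time bins = 343 values, and
// nnClusterizerAddIndexData appends 3 normalized indices (sector, row, pad), so
// nnClusterizerElementSize = 346 values per candidate (1384 bytes in fp32, 692 bytes in fp16),
// multiplied by nnClusterizerBatchedMode for a single inference call.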
-AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") -AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") -AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddSubConfig(GPUSettingsProcessingNNclusterizer, nn) AddHelp("help", 'h') EndConfig() #endif // __OPENCL__ diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0db6c9a05b9a6..44fe837988530 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -889,23 +889,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().applyNNclusterizer) { // Settings for the clusterizer - clusterer.nnClusterizerUseCFregression = GetProcessingSettings().nnClusterizerUseCFregression; - clusterer.nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; - clusterer.nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; - clusterer.nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; - clusterer.nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; - clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); - clusterer.nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; - clusterer.nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue; + (clusterer.nnInternals)->nnClusterizerUseCfRegression = GetProcessingSettings().nnClusterizerUseCfRegression; + (clusterer.nnInternals)->nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; + (clusterer.nnInternals)->nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; + (clusterer.nnInternals)->nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; + (clusterer.nnInternals)->nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; + (clusterer.nnInternals)->nnClusterizerElementSize = ((2 * (clusterer.nnInternals)->nnClusterizerSizeInputRow + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputPad + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputTime + 1)) + ((clusterer.nnInternals)->nnClusterizerAddIndexData ? 
3 : 0); + (clusterer.nnInternals)->nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; + (clusterer.nnInternals)->nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue; if (GetProcessingSettings().nnClusterizerVerbosity < 0){ - clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; + (clusterer.nnInternals)->nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; } else { - clusterer.nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; + (clusterer.nnInternals)->nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; } // Settings for the NN evaluation - clusterer.nnClassThreshold = GetProcessingSettings().nnClassThreshold; - clusterer.nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold; + (clusterer.nnInternals)->nnClassThreshold = GetProcessingSettings().nnClassThreshold; + (clusterer.nnInternals)->nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold; // Settings for the neural network evaluation clusterer.OrtOptions = { @@ -922,7 +922,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clusterer.model_class.init(clusterer.OrtOptions); std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); - if (!clusterer.nnClusterizerUseCFregression) { + if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { if (clusterer.model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { clusterer.OrtOptions["model-path"] = reg_model_paths[0]; clusterer.model_reg_1.init(clusterer.OrtOptions); @@ -934,31 +934,31 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } - if (clusterer.nnClusterizerUseCFregression || (int)(GetProcessingSettings().nnClusterizerApplyCfDeconvolution)) { + if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(GetProcessingSettings().nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if (clusterer.nnSigmoidTrafoClassThreshold) { + if ((clusterer.nnInternals)->nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); + (clusterer.nnInternals)->nnClassThreshold = (float)std::log((clusterer.nnInternals)->nnClassThreshold / (1.f - (clusterer.nnInternals)->nnClassThreshold)); } float time_clusterizer = 0, time_fill = 0; int evalDtype = clusterer.OrtOptions["dtype"].find("32") != std::string::npos; clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); - for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clusterer.nnClusterizerBatchedMode; - uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * (clusterer.nnInternals)->nnClusterizerBatchedMode; + uint iSize = CAMath::Min((uint)(clusterer.nnInternals)->nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters 
- batchStart)); clusterer.peakPositions.resize(iSize); clusterer.centralCharges.resize(iSize); if (evalDtype == 1) { - clusterer.inputData32.resize(iSize * clusterer.nnClusterizerElementSize, (float)(clusterer.nnClusterizerBoundaryFillValue)); + clusterer.inputData32.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (float)((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData16.resize(iSize * clusterer.nnClusterizerElementSize, (OrtDataType::Float16_t)((float)clusterer.nnClusterizerBoundaryFillValue)); + clusterer.inputData16.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (OrtDataType::Float16_t)((float)(clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } auto start0 = std::chrono::high_resolution_clock::now(); @@ -966,18 +966,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - GPUTPCNNClusterizer::applyNetworkClass(clusterer, evalDtype); + GPUTTPCNNClusterizerInference::inferenceNetworkClass(clusterer, evalDtype); if (clusterer.model_class.getNumOutputNodes()[0][1] == 1){ runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } - if (!clusterer.nnClusterizerUseCFregression) { - GPUTPCNNClusterizer::applyNetworkReg1(clusterer, evalDtype); + if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { + GPUTTPCNNClusterizerInference::inferenceNetworkReg1(clusterer, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { - GPUTPCNNClusterizer::applyNetworkReg2(clusterer, evalDtype); + GPUTTPCNNClusterizerInference::inferenceNetworkReg2(clusterer, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } @@ -989,17 +989,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); - if(clusterer.nnClusterizerUseCFregression) { + if((clusterer.nnInternals)->nnClusterizerUseCfRegression) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clusterer.nnClusterizerVerbosity < 3) { + if ((clusterer.nnInternals)->nnClusterizerVerbosity < 3) { LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, 
"Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index f7d7cfe9cf234..32ca5cf992f2f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,8 +19,7 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/OrtInterface.h" -#include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUTPCNNClusterizerInternals.h" using namespace o2::ml; @@ -54,6 +53,8 @@ struct ChargePos; class GPUTPCGeometry; +class GPUTPCNNClusterizerInternals; + class GPUTPCClusterFinder : public GPUProcessor { public: @@ -145,29 +146,7 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; - int nnClusterizerSizeInputRow = 3; - int nnClusterizerSizeInputPad = 3; - int nnClusterizerSizeInputTime = 3; - int nnClusterizerElementSize = -1; - bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; - bool nnSigmoidTrafoClassThreshold = 1; - int nnClusterizerUseCFregression = 0; - int nnClusterizerBatchedMode = 1; - int nnClusterizerVerbosity = 0; - int nnClusterizerBoundaryFillValue = -1; - - // Memory allocation for neural network - uint class2_elements = 0; - std::vector inputData32; - std::vector inputData16; - std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2; - - std::vector peakPositions; - std::vector centralCharges; - - std::unordered_map OrtOptions; - OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + GPUTPCNNClusterizerInternals* nnInternals; #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 13825b17848f7..87c8257f15b04 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -83,31 +83,6 @@ GPUdii() void GPUTPCNNClusterizer::Thread(clusterer.inputData16); - } else { - clusterer.modelProbabilities = clusterer.model_class.inference(clusterer.inputData32); - } -} - -void GPUTPCNNClusterizer::applyNetworkReg1(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ - clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData16); - } else { - clusterer.outputDataReg1 = clusterer.model_reg_1.inference(clusterer.inputData32); - } -} - -void GPUTPCNNClusterizer::applyNetworkReg2(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ - clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData16); - } else { - clusterer.outputDataReg2 = clusterer.model_reg_2.inference(clusterer.inputData32); - } -} - // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 14fe29398843a..b2b265706a4d5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -21,6 +21,7 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" +#include "ML/3rdparty/GPUORTFloat16.h" 
namespace o2::tpc { @@ -71,10 +72,6 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); - static void applyNetworkClass(processorType&, int8_t = 0, uint = 0); - static void applyNetworkReg1(processorType&, int8_t = 0); - static void applyNetworkReg2(processorType&, int8_t = 0); - private: From a478634f563e00067d4742f548d55ffd8db4f831 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Mar 2025 10:31:07 +0100 Subject: [PATCH 39/77] Modifications from comments on PR --- GPU/GPUTracking/CMakeLists.txt | 141 ++++++++++++------ .../GPUTrackingLinkDef_O2_DataTypes.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 111 +++++++------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 7 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 139 ++++++++++------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 4 +- .../GPUTPCNNClusterizerInternals.h | 57 +++++++ 7 files changed, 310 insertions(+), 150 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 8654d007f6b2e..06a0b4157b732 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -158,37 +158,71 @@ set(HDRS_INSTALL ) set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) -set(SRCS_NO_CINT ${SRCS_NO_CINT} - Global/GPUChainITS.cxx - ITS/GPUITSFitter.cxx - ITS/GPUITSFitterKernels.cxx - dEdx/GPUdEdx.cxx - TPCConvert/GPUTPCConvert.cxx - TPCConvert/GPUTPCConvertKernel.cxx - DataCompression/GPUTPCCompression.cxx - DataCompression/GPUTPCCompressionTrackModel.cxx - DataCompression/GPUTPCCompressionKernels.cxx - DataCompression/GPUTPCDecompression.cxx - DataCompression/GPUTPCDecompressionKernels.cxx - DataCompression/TPCClusterDecompressor.cxx - DataCompression/GPUTPCClusterStatistics.cxx - TPCClusterFinder/GPUTPCClusterFinder.cxx - TPCClusterFinder/ClusterAccumulator.cxx - TPCClusterFinder/MCLabelAccumulator.cxx - TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx - TPCClusterFinder/GPUTPCCFStreamCompaction.cxx - TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx - TPCClusterFinder/GPUTPCCFPeakFinder.cxx - TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx - TPCClusterFinder/GPUTPCCFClusterizer.cxx - TPCClusterFinder/GPUTPCNNClusterizer.cxx - TPCClusterFinder/GPUTPCCFDeconvolution.cxx - TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx - TPCClusterFinder/GPUTPCCFDecodeZS.cxx - TPCClusterFinder/GPUTPCCFGather.cxx - Refit/GPUTrackingRefit.cxx - Refit/GPUTrackingRefitKernel.cxx - Merger/GPUTPCGMO2Output.cxx) + +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") + set(SRCS_NO_CINT ${SRCS_NO_CINT} + Global/GPUChainITS.cxx + ITS/GPUITSFitter.cxx + ITS/GPUITSFitterKernels.cxx + dEdx/GPUdEdx.cxx + TPCConvert/GPUTPCConvert.cxx + TPCConvert/GPUTPCConvertKernel.cxx + DataCompression/GPUTPCCompression.cxx + DataCompression/GPUTPCCompressionTrackModel.cxx + DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx + DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCClusterStatistics.cxx + TPCClusterFinder/GPUTPCClusterFinder.cxx + TPCClusterFinder/ClusterAccumulator.cxx + TPCClusterFinder/MCLabelAccumulator.cxx + TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx + TPCClusterFinder/GPUTPCCFStreamCompaction.cxx + 
TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx + TPCClusterFinder/GPUTPCCFPeakFinder.cxx + TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx + TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCNNClusterizer.cxx + TPCClusterFinder/GPUTPCCFDeconvolution.cxx + TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx + TPCClusterFinder/GPUTPCCFDecodeZS.cxx + TPCClusterFinder/GPUTPCCFGather.cxx + Refit/GPUTrackingRefit.cxx + Refit/GPUTrackingRefitKernel.cxx + Merger/GPUTPCGMO2Output.cxx) +else() + set(SRCS_NO_CINT ${SRCS_NO_CINT} + Global/GPUChainITS.cxx + ITS/GPUITSFitter.cxx + ITS/GPUITSFitterKernels.cxx + dEdx/GPUdEdx.cxx + TPCConvert/GPUTPCConvert.cxx + TPCConvert/GPUTPCConvertKernel.cxx + DataCompression/GPUTPCCompression.cxx + DataCompression/GPUTPCCompressionTrackModel.cxx + DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx + DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCClusterStatistics.cxx + TPCClusterFinder/GPUTPCClusterFinder.cxx + TPCClusterFinder/ClusterAccumulator.cxx + TPCClusterFinder/MCLabelAccumulator.cxx + TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx + TPCClusterFinder/GPUTPCCFStreamCompaction.cxx + TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx + TPCClusterFinder/GPUTPCCFPeakFinder.cxx + TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx + TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCCFDeconvolution.cxx + TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx + TPCClusterFinder/GPUTPCCFDecodeZS.cxx + TPCClusterFinder/GPUTPCCFGather.cxx + Refit/GPUTrackingRefit.cxx + Refit/GPUTrackingRefitKernel.cxx + Merger/GPUTPCGMO2Output.cxx) +endif() set(SRCS_DATATYPES ${SRCS_DATATYPES} @@ -260,19 +294,37 @@ unset(HDRS_TMP) # Main CMake part for O2 if(ALIGPU_BUILD_TYPE STREQUAL "O2") - o2_add_library(GPUDataTypes - TARGETVARNAME targetName - PUBLIC_INCLUDE_DIRECTORIES . - Definitions - DataTypes - PUBLIC_LINK_LIBRARIES O2::GPUUtils - O2::GPUCommon - O2::ReconstructionDataFormats - O2::TPCFastTransformation - O2::ML - PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC - SOURCES ${SRCS_DATATYPES}) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + + if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") + o2_add_library(GPUDataTypes + TARGETVARNAME targetName + PUBLIC_INCLUDE_DIRECTORIES . + Definitions + DataTypes + PUBLIC_LINK_LIBRARIES O2::GPUUtils + O2::GPUCommon + O2::ReconstructionDataFormats + O2::TPCFastTransformation + O2::ML + PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC + SOURCES ${SRCS_DATATYPES}) + add_compile_definitions(GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX) + else() + o2_add_library(GPUDataTypes + TARGETVARNAME targetName + PUBLIC_INCLUDE_DIRECTORIES . 
+ Definitions + DataTypes + PUBLIC_LINK_LIBRARIES O2::GPUUtils + O2::GPUCommon + O2::ReconstructionDataFormats + O2::TPCFastTransformation + PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC + SOURCES ${SRCS_DATATYPES}) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + endif() + o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} LINKDEF GPUTrackingLinkDef_O2_DataTypes.h) @@ -337,6 +389,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") LABELS its COMPILE_ONLY) add_subdirectory(Interface) + endif() # Main CMake part for Standalone diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h index ab60827655a43..35ebbabe41672 100644 --- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h +++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h @@ -30,6 +30,7 @@ #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessing + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingParam + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTC + ; +#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingNNclusterizer + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplay + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayLight + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayHeavy + ; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 44fe837988530..ee0cc84c70a59 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -846,14 +846,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (clusterer.mPmemory->counters.nPeaks == 0) { continue; } - if (!GetProcessingSettings().applyNNclusterizer) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - } else { - // FIXME: This potentially needs to be removed when I actually apply the NN. 
For now its only to make the code work - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - } + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) { clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks"); } @@ -887,54 +881,56 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - if (GetProcessingSettings().applyNNclusterizer) { +#ifdef GPUCA_HAS_ONNX + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + if (nn_settings.applyNNclusterizer) { // Settings for the clusterizer - (clusterer.nnInternals)->nnClusterizerUseCfRegression = GetProcessingSettings().nnClusterizerUseCfRegression; - (clusterer.nnInternals)->nnClusterizerSizeInputRow = GetProcessingSettings().nnClusterizerSizeInputRow; - (clusterer.nnInternals)->nnClusterizerSizeInputPad = GetProcessingSettings().nnClusterizerSizeInputPad; - (clusterer.nnInternals)->nnClusterizerSizeInputTime = GetProcessingSettings().nnClusterizerSizeInputTime; - (clusterer.nnInternals)->nnClusterizerAddIndexData = GetProcessingSettings().nnClusterizerAddIndexData; + (clusterer.nnInternals)->nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + (clusterer.nnInternals)->nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + (clusterer.nnInternals)->nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + (clusterer.nnInternals)->nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + (clusterer.nnInternals)->nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; (clusterer.nnInternals)->nnClusterizerElementSize = ((2 * (clusterer.nnInternals)->nnClusterizerSizeInputRow + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputPad + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputTime + 1)) + ((clusterer.nnInternals)->nnClusterizerAddIndexData ? 
3 : 0); - (clusterer.nnInternals)->nnClusterizerBatchedMode = GetProcessingSettings().nnClusterizerBatchedMode; - (clusterer.nnInternals)->nnClusterizerBoundaryFillValue = GetProcessingSettings().nnClusterizerBoundaryFillValue; - if (GetProcessingSettings().nnClusterizerVerbosity < 0){ - (clusterer.nnInternals)->nnClusterizerVerbosity = GetProcessingSettings().nnInferenceVerbosity; + (clusterer.nnInternals)->nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + (clusterer.nnInternals)->nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + if (nn_settings.nnClusterizerVerbosity < 0){ + (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { - (clusterer.nnInternals)->nnClusterizerVerbosity = GetProcessingSettings().nnClusterizerVerbosity; + (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } // Settings for the NN evaluation - (clusterer.nnInternals)->nnClassThreshold = GetProcessingSettings().nnClassThreshold; - (clusterer.nnInternals)->nnSigmoidTrafoClassThreshold = GetProcessingSettings().nnSigmoidTrafoClassThreshold; + (clusterer.nnInternals)->nnClassThreshold = nn_settings.nnClassThreshold; + (clusterer.nnInternals)->nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; // Settings for the neural network evaluation - clusterer.OrtOptions = { - {"model-path", GetProcessingSettings().nnClassificationPath}, - {"device", GetProcessingSettings().nnInferenceDevice}, - {"device-id", std::to_string(GetProcessingSettings().nnInferenceDeviceId)}, - {"allocate-device-memory", std::to_string(GetProcessingSettings().nnInferenceAllocateDevMem)}, - {"dtype", GetProcessingSettings().nnInferenceDtype}, - {"intra-op-num-threads", std::to_string(GetProcessingSettings().nnInferenceThreadsPerNN)}, - {"enable-optimizations", std::to_string(GetProcessingSettings().nnInferenceEnableOrtOptimization)}, - {"enable-profiling", std::to_string(GetProcessingSettings().nnInferenceOrtProfiling)}, - {"profiling-output-path", GetProcessingSettings().nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(GetProcessingSettings().nnInferenceVerbosity)}}; - clusterer.model_class.init(clusterer.OrtOptions); - std::vector reg_model_paths = o2::utils::Str::tokenize(GetProcessingSettings().nnRegressionPath, ':'); + (clusterer.nnInternals)->OrtOptions = { + {"model-path", nn_settings.nnClassificationPath}, + {"device", nn_settings.nnInferenceDevice}, + {"device-id", std::to_string(nn_settings.nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)}, + {"dtype", nn_settings.nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)}, + {"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)}, + {"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}}; + (clusterer.nnInternals)->model_class.init((clusterer.nnInternals)->OrtOptions); + std::vector reg_model_paths = o2::utils::Str::tokenize(nn_settings.nnRegressionPath, ':'); if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { - if (clusterer.model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { - clusterer.OrtOptions["model-path"] = reg_model_paths[0]; - clusterer.model_reg_1.init(clusterer.OrtOptions); + if 
((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; + (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); } else { - clusterer.OrtOptions["model-path"] = reg_model_paths[0]; - clusterer.model_reg_1.init(clusterer.OrtOptions); - clusterer.OrtOptions["model-path"] = reg_model_paths[1]; - clusterer.model_reg_2.init(clusterer.OrtOptions); + (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; + (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); + (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[1]; + (clusterer.nnInternals)->model_reg_2.init((clusterer.nnInternals)->OrtOptions); } } - if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(GetProcessingSettings().nnClusterizerApplyCfDeconvolution)) { + if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } @@ -945,20 +941,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } float time_clusterizer = 0, time_fill = 0; - int evalDtype = clusterer.OrtOptions["dtype"].find("32") != std::string::npos; - clusterer.outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); + int evalDtype = (clusterer.nnInternals)->OrtOptions["dtype"].find("32") != std::string::npos; + (clusterer.nnInternals)->outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) { uint batchStart = batch * (clusterer.nnInternals)->nnClusterizerBatchedMode; uint iSize = CAMath::Min((uint)(clusterer.nnInternals)->nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - clusterer.peakPositions.resize(iSize); - clusterer.centralCharges.resize(iSize); + (clusterer.nnInternals)->clusterFlags.clear(); + (clusterer.nnInternals)->peakPositions.clear(); + (clusterer.nnInternals)->centralCharges.clear(); + + (clusterer.nnInternals)->clusterFlags.resize(iSize, {0,0}); + (clusterer.nnInternals)->peakPositions.resize(iSize); + (clusterer.nnInternals)->centralCharges.resize(iSize); if (evalDtype == 1) { - clusterer.inputData32.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (float)((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); + (clusterer.nnInternals)->inputData32.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (float)((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData16.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (OrtDataType::Float16_t)((float)(clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); + (clusterer.nnInternals)->inputData16.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (OrtDataType::Float16_t)((float)(clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } auto start0 = std::chrono::high_resolution_clock::now(); @@ -966,18 +967,18 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - 
GPUTTPCNNClusterizerInference::inferenceNetworkClass(clusterer, evalDtype); - if (clusterer.model_class.getNumOutputNodes()[0][1] == 1){ + GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype); + if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1){ runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { - GPUTTPCNNClusterizerInference::inferenceNetworkReg1(clusterer, evalDtype); + GPUTPCNNClusterizer::inferenceNetworkReg1(clusterer, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 - if (clusterer.model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { - GPUTTPCNNClusterizerInference::inferenceNetworkReg2(clusterer, evalDtype); + if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { + GPUTPCNNClusterizer::inferenceNetworkReg2(clusterer, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } @@ -999,9 +1000,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { +#endif DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); + +#ifdef GPUCA_HAS_ONNX } +#endif if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 32ca5cf992f2f..f7a5bf8990b8a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,9 +19,10 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "GPUTPCNNClusterizerInternals.h" -using namespace o2::ml; +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizerInternals.h" +#endif namespace o2 { @@ -146,7 +147,9 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; +#ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizerInternals* nnInternals; +#endif #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 87c8257f15b04..d4e2f3125592d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -31,7 +31,7 @@ template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, 
processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + if ((clusterer.nnInternals)->outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices return; } Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -51,16 +51,25 @@ template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); + (clusterer.nnInternals)->outputDataClass[glo_idx + batchStart] = (int)((clusterer.nnInternals)->modelProbabilities[glo_idx] > (clusterer.nnInternals)->nnClassThreshold); } template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - auto elem_iterator = clusterer.modelProbabilities.begin() + (unsigned int)(glo_idx * clusterer.model_class.getNumOutputNodes()[0][1]); - uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.model_class.getNumOutputNodes()[0][1])); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; + uint elem_iterator = glo_idx * (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for(float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; pIdx++) { + if(pIdx == elem_iterator) { + current_max_prob = (clusterer.nnInternals)->modelProbabilities[pIdx]; + } else { + class_label = ((clusterer.nnInternals)->modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1])); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + (clusterer.nnInternals)->outputDataClass[glo_idx + batchStart] = class_label; } template <> @@ -83,6 +92,31 @@ GPUdii() void GPUTPCNNClusterizer::ThreadmodelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData16); + } else { + (clusterer.nnInternals)->modelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData32); + } +} + +void GPUTPCNNClusterizer::inferenceNetworkReg1(processorType& clusterer, int8_t dtype) { + if(dtype == 0){ + (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData16); + } else { + (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData32); + } +} + +void GPUTPCNNClusterizer::inferenceNetworkReg2(processorType& clusterer, int8_t dtype) { + if(dtype == 0){ + (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData16); + } else { + (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData32); + } +} + // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) { @@ -114,54 +148,59 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); uint glo_idx = get_global_id(0); - uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + uint write_idx = glo_idx * (clusterer.nnInternals)->nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - clusterer.peakPositions[glo_idx] = peak; - clusterer.centralCharges[glo_idx] = central_charge; + (clusterer.nnInternals)->peakPositions[glo_idx] = peak; + (clusterer.nnInternals)->centralCharges[glo_idx] = central_charge; - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, (clusterer.nnInternals)->nnClusterizerSizeInputRow); GPUCA_UNROLL(U(), U()); - for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { + for (int r = -(clusterer.nnInternals)->nnClusterizerSizeInputRow; r <= (clusterer.nnInternals)->nnClusterizerSizeInputRow; r++) { bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); int pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { + for (int p = -(clusterer.nnInternals)->nnClusterizerSizeInputPad + pad_offset; p <= (clusterer.nnInternals)->nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, (clusterer.nnInternals)->nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -(clusterer.nnInternals)->nnClusterizerSizeInputTime; t <= (clusterer.nnInternals)->nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); + if (r == 0 && !(clusterer.nnInternals)->clusterFlags[glo_idx][0] && std::abs(p) < 3 && std::abs(t) < 3 && p!=0 && t!=0) { // ordering is done for short circuit optimization + (clusterer.nnInternals)->clusterFlags[glo_idx][0] = CfUtils::isPeak(isPeakMap[tmp_pos]); + (clusterer.nnInternals)->clusterFlags[glo_idx][1] = (clusterer.nnInternals)->clusterFlags[glo_idx][0]; + } if(dtype == 0){ - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else { - clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + (clusterer.nnInternals)->inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if(dtype == 0){ - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); + (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } else { - clusterer.inputData32[write_idx] = static_cast(clusterer.nnClusterizerBoundaryFillValue); + (clusterer.nnInternals)->inputData32[write_idx] = static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue); } } write_idx++; } } } - if (clusterer.nnClusterizerAddIndexData) { + if ((clusterer.nnInternals)->nnClusterizerAddIndexData) { if(dtype == 0){ - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); - clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); + (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); + (clusterer.nnInternals)->inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + (clusterer.nnInternals)->inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); } else { - clusterer.inputData32[write_idx] = clusterer.mISlice / 36.f; - clusterer.inputData32[write_idx + 1] = row / 152.f; - clusterer.inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); + (clusterer.nnInternals)->inputData32[write_idx] = 
clusterer.mISlice / 36.f; + (clusterer.nnInternals)->inputData32[write_idx + 1] = row / 152.f; + (clusterer.nnInternals)->inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); } } } @@ -173,22 +212,22 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * clusterer.model_reg_1.getNumOutputNodes()[0][1]; + int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1]; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.model_reg_1.getNumOutputNodes()[0][1] << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1] << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); - if (clusterer.outputDataClass[full_glo_idx] == 1) { + if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] == 1) { ClusterAccumulator pc; // Publishing logic is taken from default clusterizer if (onlyMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect((clusterer.nnInternals)->peakPositions[glo_idx], chargeMap[(clusterer.nnInternals)->peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clusterer.peakPositions[glo_idx], + (clusterer.nnInternals)->peakPositions[glo_idx], smem.posBcast, smem.buf, smem.innerAboveThreshold, @@ -196,17 +235,17 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap((clusterer.nnInternals)->peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index], clusterer.outputDataReg1[model_output_index + 2], static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1], clusterer.outputDataReg1[model_output_index + 3], 0, 0); + pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg1[model_output_index + 4], static_cast((clusterer.nnInternals)->peakPositions[glo_idx].pad()) + (clusterer.nnInternals)->outputDataReg1[model_output_index], (clusterer.nnInternals)->outputDataReg1[model_output_index + 2], static_cast((clusterer.mPmemory->fragment).start) + static_cast((clusterer.nnInternals)->peakPositions[glo_idx].time()) + (clusterer.nnInternals)->outputDataReg1[model_output_index + 1], (clusterer.nnInternals)->outputDataReg1[model_output_index + 3], 0, 0); tpc::ClusterNative myCluster; - bool rejectCluster = 
!pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -219,7 +258,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clusterer.peakPositions[glo_idx].row(), + (clusterer.nnInternals)->peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -229,7 +268,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -245,21 +284,21 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * clusterer.model_reg_2.getNumOutputNodes()[0][1]; + int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1]; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.model_reg_1.getNumOutputNodes()[0][1] << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1] << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); - if (clusterer.outputDataClass[full_glo_idx] > 0) { + if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; if (onlyMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect((clusterer.nnInternals)->peakPositions[glo_idx], chargeMap[(clusterer.nnInternals)->peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clusterer.peakPositions[glo_idx], + (clusterer.nnInternals)->peakPositions[glo_idx], smem.posBcast, smem.buf, smem.innerAboveThreshold, @@ -267,7 +306,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap((clusterer.nnInternals)->peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -275,12 +314,12 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } // Cluster 1 - 
pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 8], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg2[model_output_index], clusterer.outputDataReg2[model_output_index + 4], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg2[model_output_index + 2], clusterer.outputDataReg2[model_output_index + 6], 0, 0); + pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg2[model_output_index + 8], (clusterer.nnInternals)->peakPositions[glo_idx].pad() + (clusterer.nnInternals)->outputDataReg2[model_output_index], (clusterer.nnInternals)->outputDataReg2[model_output_index + 4], (clusterer.mPmemory->fragment).start + (clusterer.nnInternals)->peakPositions[glo_idx].time() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 2], (clusterer.nnInternals)->outputDataReg2[model_output_index + 6], 0, 0); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 2) { + if ((clusterer.nnInternals)->nnClusterizerVerbosity < 2) { LOG(warning) << "[NN, CF] Cluster rejected!"; } if (clusterer.mPclusterPosInRow) { @@ -294,7 +333,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clusterer.peakPositions[glo_idx].row(), + (clusterer.nnInternals)->peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -304,14 +343,14 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 9], clusterer.peakPositions[glo_idx].pad() + clusterer.outputDataReg2[model_output_index + 1], clusterer.outputDataReg2[model_output_index + 5], (clusterer.mPmemory->fragment).start + clusterer.peakPositions[glo_idx].time() + clusterer.outputDataReg2[model_output_index + 3], clusterer.outputDataReg2[model_output_index + 7], 0, 0); + pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg2[model_output_index + 9], (clusterer.nnInternals)->peakPositions[glo_idx].pad() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 1], (clusterer.nnInternals)->outputDataReg2[model_output_index + 5], (clusterer.mPmemory->fragment).start + (clusterer.nnInternals)->peakPositions[glo_idx].time() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 3], (clusterer.nnInternals)->outputDataReg2[model_output_index + 7], 0, 0); - rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); + rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, 
clusterer.Param()); if (rejectCluster) { - if (clusterer.nnClusterizerVerbosity < 2) { + if ((clusterer.nnInternals)->nnClusterizerVerbosity < 2) { LOG(warning) << "[NN, CF] Cluster rejected!"; } if (clusterer.mPclusterPosInRow) { @@ -324,7 +363,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clusterer.peakPositions[glo_idx].row(), + (clusterer.nnInternals)->peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -334,7 +373,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index b2b265706a4d5..fdebdda7a5f86 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -21,7 +21,6 @@ #include "GPUTPCClusterFinder.h" #include "Array2D.h" #include "PackedCharge.h" -#include "ML/3rdparty/GPUORTFloat16.h" namespace o2::tpc { @@ -72,6 +71,9 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); + static void inferenceNetworkClass(processorType&, int8_t = 0, uint = 0); + static void inferenceNetworkReg1(processorType&, int8_t = 0); + static void inferenceNetworkReg2(processorType&, int8_t = 0); private: diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h new file mode 100644 index 0000000000000..288a1fe7223ab --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h @@ -0,0 +1,57 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+
+/// \file GPUTPCNNClusterizerInternals.h
+/// \author Christian Sonnabend
+
+#include "ML/3rdparty/GPUORTFloat16.h"
+#include "ML/OrtInterface.h"
+#include "ChargePos.h"
+
+#ifndef O2_GPU_NN_CLUSTERIZER_INTERNALS_H
+#define O2_GPU_NN_CLUSTERIZER_INTERNALS_H
+
+namespace o2::gpu
+{
+
+class GPUTPCNNClusterizerInternals {
+ public:
+  int nnClusterizerSizeInputRow = 3;
+  int nnClusterizerSizeInputPad = 3;
+  int nnClusterizerSizeInputTime = 3;
+  int nnClusterizerElementSize = -1;
+  bool nnClusterizerAddIndexData = true;
+  float nnClassThreshold = 0.16;
+  bool nnSigmoidTrafoClassThreshold = 1;
+  int nnClusterizerUseCfRegression = 0;
+  int nnClusterizerBatchedMode = 1;
+  int nnClusterizerVerbosity = 0;
+  int nnClusterizerBoundaryFillValue = -1;
+  int nnClusterizerDumpDigits = 0;
+  int nnClusterizerApplyCfDeconvolution = 0;
+
+  // Memory allocation for neural network
+  uint class2_elements = 0;
+  std::vector inputData32;
+  std::vector inputData16;
+  std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2;
+
+  std::vector peakPositions;
+  std::vector> clusterFlags; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cxx
+  std::vector centralCharges;
+
+  std::unordered_map OrtOptions;
+  o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters
+}; // class GPUTPCNNClusterizerInternals
+
+} // namespace o2::gpu
+
+#endif
\ No newline at end of file

From db0c83650d218a1cbb504bd2e1c4ace71cc6c203 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot
Date: Fri, 7 Mar 2025 09:37:48 +0000
Subject: [PATCH 40/77] Please consider the following formatting changes

---
 .../Global/GPUChainTrackingClusterizer.cxx   | 12 ++--
 .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 27 +++++----
 .../TPCClusterFinder/GPUTPCNNClusterizer.h   |  9 ++-
 .../GPUTPCNNClusterizerInternals.h           | 57 ++++++++++---------
 4 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index a7a4cd55f8b18..08ea0d5e2dd0f 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       (clusterer.nnInternals)->nnClusterizerElementSize = ((2 * (clusterer.nnInternals)->nnClusterizerSizeInputRow + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputPad + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputTime + 1)) + ((clusterer.nnInternals)->nnClusterizerAddIndexData ?
3 : 0); (clusterer.nnInternals)->nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; (clusterer.nnInternals)->nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - if (nn_settings.nnClusterizerVerbosity < 0){ + if (nn_settings.nnClusterizerVerbosity < 0) { (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; @@ -929,7 +929,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (clusterer.nnInternals)->model_reg_2.init((clusterer.nnInternals)->OrtOptions); } } - + if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); @@ -944,7 +944,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int evalDtype = (clusterer.nnInternals)->OrtOptions["dtype"].find("32") != std::string::npos; (clusterer.nnInternals)->outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); - for(int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) { + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) { uint batchStart = batch * (clusterer.nnInternals)->nnClusterizerBatchedMode; uint iSize = CAMath::Min((uint)(clusterer.nnInternals)->nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); @@ -952,7 +952,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (clusterer.nnInternals)->peakPositions.clear(); (clusterer.nnInternals)->centralCharges.clear(); - (clusterer.nnInternals)->clusterFlags.resize(iSize, {0,0}); + (clusterer.nnInternals)->clusterFlags.resize(iSize, {0, 0}); (clusterer.nnInternals)->peakPositions.resize(iSize); (clusterer.nnInternals)->centralCharges.resize(iSize); @@ -968,7 +968,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype); - if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1){ + if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels @@ -989,7 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); - if((clusterer.nnInternals)->nnClusterizerUseCfRegression) { + if ((clusterer.nnInternals)->nnClusterizerUseCfRegression) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d4e2f3125592d..1f139b32c10f5 100644 
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -61,8 +61,8 @@ GPUdii() void GPUTPCNNClusterizer::Threadmodel_class.getNumOutputNodes()[0][1]; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for(float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; pIdx++) { - if(pIdx == elem_iterator) { + for (float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; pIdx++) { + if (pIdx == elem_iterator) { current_max_prob = (clusterer.nnInternals)->modelProbabilities[pIdx]; } else { class_label = ((clusterer.nnInternals)->modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); @@ -93,24 +93,27 @@ GPUdii() void GPUTPCNNClusterizer::ThreadmodelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData16); } else { (clusterer.nnInternals)->modelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData32); } } -void GPUTPCNNClusterizer::inferenceNetworkReg1(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ +void GPUTPCNNClusterizer::inferenceNetworkReg1(processorType& clusterer, int8_t dtype) +{ + if (dtype == 0) { (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData16); } else { (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData32); } } -void GPUTPCNNClusterizer::inferenceNetworkReg2(processorType& clusterer, int8_t dtype) { - if(dtype == 0){ +void GPUTPCNNClusterizer::inferenceNetworkReg2(processorType& clusterer, int8_t dtype) +{ + if (dtype == 0) { (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData16); } else { (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData32); @@ -171,18 +174,18 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads for (int t = -(clusterer.nnInternals)->nnClusterizerSizeInputTime; t <= (clusterer.nnInternals)->nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !(clusterer.nnInternals)->clusterFlags[glo_idx][0] && std::abs(p) < 3 && std::abs(t) < 3 && p!=0 && t!=0) { // ordering is done for short circuit optimization + if (r == 0 && !(clusterer.nnInternals)->clusterFlags[glo_idx][0] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization (clusterer.nnInternals)->clusterFlags[glo_idx][0] = CfUtils::isPeak(isPeakMap[tmp_pos]); (clusterer.nnInternals)->clusterFlags[glo_idx][1] = (clusterer.nnInternals)->clusterFlags[glo_idx][0]; } - if(dtype == 0){ + if (dtype == 0) { (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else { (clusterer.nnInternals)->inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally - if(dtype == 0){ + if (dtype == 0) { (clusterer.nnInternals)->inputData16[write_idx] = 
(OrtDataType::Float16_t)(static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); } else { (clusterer.nnInternals)->inputData32[write_idx] = static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue); @@ -193,7 +196,7 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads } } if ((clusterer.nnInternals)->nnClusterizerAddIndexData) { - if(dtype == 0){ + if (dtype == 0) { (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); (clusterer.nnInternals)->inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); (clusterer.nnInternals)->inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index d534cab8aba85..1f16428ab39fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -74,12 +74,11 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static void inferenceNetworkClass(processorType&, int8_t = 0, uint = 0); static void inferenceNetworkReg1(processorType&, int8_t = 0); static void inferenceNetworkReg2(processorType&, int8_t = 0); - - private: - static int padOffset(int, int, const GPUTPCGeometry&); - static int rowOffset(int, int); - static bool isBoundary(int, int, int, const GPUTPCGeometry&); + private: + static int padOffset(int, int, const GPUTPCGeometry&); + static int rowOffset(int, int); + static bool isBoundary(int, int, int, const GPUTPCGeometry&); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h index 288a1fe7223ab..03916d055703b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h @@ -22,34 +22,35 @@ namespace o2::gpu { -class GPUTPCNNClusterizerInternals { - public: - int nnClusterizerSizeInputRow = 3; - int nnClusterizerSizeInputPad = 3; - int nnClusterizerSizeInputTime = 3; - int nnClusterizerElementSize = -1; - bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; - bool nnSigmoidTrafoClassThreshold = 1; - int nnClusterizerUseCfRegression = 0; - int nnClusterizerBatchedMode = 1; - int nnClusterizerVerbosity = 0; - int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; - - // Memory allocation for neural network - uint class2_elements = 0; - std::vector inputData32; - std::vector inputData16; - std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2; - - std::vector peakPositions; - std::vector> clusterFlags; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cxx - std::vector centralCharges; - - std::unordered_map OrtOptions; - o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters +class GPUTPCNNClusterizerInternals +{ + public: + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCfRegression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; + int nnClusterizerDumpDigits = 0; + int nnClusterizerApplyCfDeconvolution = 0; + + // Memory allocation for neural network + uint class2_elements = 0; + std::vector inputData32; + std::vector inputData16; + std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2; + + std::vector peakPositions; + std::vector> clusterFlags; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cxx + std::vector centralCharges; + + std::unordered_map OrtOptions; + o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters }; // class GPUTPCNNClusterizerInternals } // namespace o2::gpu From 6c6cb9589888e288ff43021c3e1d66f565816621 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Mar 2025 11:20:58 +0100 Subject: [PATCH 41/77] iSlice -> iSector --- .../Global/GPUChainTrackingClusterizer.cxx | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 88e8f398d68a9..c8aacfdf179ad 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -926,7 +926,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } @@ -958,23 +958,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype); if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Assigning class labels + 
runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels } if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { GPUTPCNNClusterizer::inferenceNetworkReg1(clusterer, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { GPUTPCNNClusterizer::inferenceNetworkReg2(clusterer, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -985,18 +985,17 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); if ((clusterer.nnInternals)->nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if ((clusterer.nnInternals)->nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSlice << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } } else { #endif - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 0); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 0); #ifdef GPUCA_HAS_ONNX } @@ -1007,7 +1006,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { SynchronizeStream(lane); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); // Computes MC labels + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels } if (GetProcessingSettings().debugLevel >= 3) { From 
490170e2c6caaece754dc0102c8dd1c2c544c8df Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Mar 2025 11:38:09 +0100 Subject: [PATCH 42/77] mISlice -> mISector --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 1f139b32c10f5..131673e59564b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -197,11 +197,11 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads } if ((clusterer.nnInternals)->nnClusterizerAddIndexData) { if (dtype == 0) { - (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISlice / 36.f); + (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); (clusterer.nnInternals)->inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); (clusterer.nnInternals)->inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); } else { - (clusterer.nnInternals)->inputData32[write_idx] = clusterer.mISlice / 36.f; + (clusterer.nnInternals)->inputData32[write_idx] = clusterer.mISector / 36.f; (clusterer.nnInternals)->inputData32[write_idx + 1] = row / 152.f; (clusterer.nnInternals)->inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); } From bca1014e457c2ef818f399cb85ba42a94e928ba9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 7 Mar 2025 14:54:55 +0100 Subject: [PATCH 43/77] Minor bug-fixes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c8aacfdf179ad..9f811a8c8e604 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -873,9 +873,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) return; } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - #ifdef GPUCA_HAS_ONNX GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; if (nn_settings.applyNNclusterizer) { @@ -960,7 +957,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype); if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1) { @@ -995,6 +991,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } else { #endif + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, 
GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 0); #ifdef GPUCA_HAS_ONNX From b68796749dd9490f63f70fbc4eb90446ed505eb2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 8 Mar 2025 11:50:38 +0100 Subject: [PATCH 44/77] Adjusting for comments --- GPU/GPUTracking/CMakeLists.txt | 112 ++++++------------ .../Global/GPUChainTrackingClusterizer.cxx | 18 ++- .../TPCClusterFinder/GPUTPCClusterFinder.h | 8 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 1 + .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerInternals.cxx | 15 +++ .../GPUTPCNNClusterizerInternals.h | 6 +- 7 files changed, 70 insertions(+), 91 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c5714eedf72a8..5aaa8c44bc760 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -161,69 +161,40 @@ set(HDRS_INSTALL set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) +set(SRCS_NO_CINT ${SRCS_NO_CINT} + Global/GPUChainITS.cxx + ITS/GPUITSFitter.cxx + ITS/GPUITSFitterKernels.cxx + dEdx/GPUdEdx.cxx + TPCConvert/GPUTPCConvert.cxx + TPCConvert/GPUTPCConvertKernel.cxx + DataCompression/GPUTPCCompression.cxx + DataCompression/GPUTPCCompressionTrackModel.cxx + DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx + DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCClusterStatistics.cxx + TPCClusterFinder/GPUTPCClusterFinder.cxx + TPCClusterFinder/ClusterAccumulator.cxx + TPCClusterFinder/MCLabelAccumulator.cxx + TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx + TPCClusterFinder/GPUTPCCFStreamCompaction.cxx + TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx + TPCClusterFinder/GPUTPCCFPeakFinder.cxx + TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx + TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCCFDeconvolution.cxx + TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx + TPCClusterFinder/GPUTPCCFDecodeZS.cxx + TPCClusterFinder/GPUTPCCFGather.cxx + Refit/GPUTrackingRefit.cxx + Refit/GPUTrackingRefitKernel.cxx + Merger/GPUTPCGMO2Output.cxx) + if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") - set(SRCS_NO_CINT ${SRCS_NO_CINT} - Global/GPUChainITS.cxx - ITS/GPUITSFitter.cxx - ITS/GPUITSFitterKernels.cxx - dEdx/GPUdEdx.cxx - TPCConvert/GPUTPCConvert.cxx - TPCConvert/GPUTPCConvertKernel.cxx - DataCompression/GPUTPCCompression.cxx - DataCompression/GPUTPCCompressionTrackModel.cxx - DataCompression/GPUTPCCompressionKernels.cxx - DataCompression/GPUTPCDecompression.cxx - DataCompression/GPUTPCDecompressionKernels.cxx - DataCompression/TPCClusterDecompressor.cxx - DataCompression/GPUTPCClusterStatistics.cxx - TPCClusterFinder/GPUTPCClusterFinder.cxx - TPCClusterFinder/ClusterAccumulator.cxx - TPCClusterFinder/MCLabelAccumulator.cxx - TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx - TPCClusterFinder/GPUTPCCFStreamCompaction.cxx - TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx - TPCClusterFinder/GPUTPCCFPeakFinder.cxx - TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx - TPCClusterFinder/GPUTPCCFClusterizer.cxx - TPCClusterFinder/GPUTPCNNClusterizer.cxx - TPCClusterFinder/GPUTPCCFDeconvolution.cxx - TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx - TPCClusterFinder/GPUTPCCFDecodeZS.cxx - TPCClusterFinder/GPUTPCCFGather.cxx - Refit/GPUTrackingRefit.cxx - Refit/GPUTrackingRefitKernel.cxx - Merger/GPUTPCGMO2Output.cxx) -else() - set(SRCS_NO_CINT ${SRCS_NO_CINT} 
- Global/GPUChainITS.cxx - ITS/GPUITSFitter.cxx - ITS/GPUITSFitterKernels.cxx - dEdx/GPUdEdx.cxx - TPCConvert/GPUTPCConvert.cxx - TPCConvert/GPUTPCConvertKernel.cxx - DataCompression/GPUTPCCompression.cxx - DataCompression/GPUTPCCompressionTrackModel.cxx - DataCompression/GPUTPCCompressionKernels.cxx - DataCompression/GPUTPCDecompression.cxx - DataCompression/GPUTPCDecompressionKernels.cxx - DataCompression/TPCClusterDecompressor.cxx - DataCompression/GPUTPCClusterStatistics.cxx - TPCClusterFinder/GPUTPCClusterFinder.cxx - TPCClusterFinder/ClusterAccumulator.cxx - TPCClusterFinder/MCLabelAccumulator.cxx - TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx - TPCClusterFinder/GPUTPCCFStreamCompaction.cxx - TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx - TPCClusterFinder/GPUTPCCFPeakFinder.cxx - TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx - TPCClusterFinder/GPUTPCCFClusterizer.cxx - TPCClusterFinder/GPUTPCCFDeconvolution.cxx - TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx - TPCClusterFinder/GPUTPCCFDecodeZS.cxx - TPCClusterFinder/GPUTPCCFGather.cxx - Refit/GPUTrackingRefit.cxx - Refit/GPUTrackingRefitKernel.cxx - Merger/GPUTPCGMO2Output.cxx) + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizer.cxx) + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx) endif() set(SRCS_DATATYPES @@ -297,8 +268,7 @@ unset(HDRS_TMP) # Main CMake part for O2 if(ALIGPU_BUILD_TYPE STREQUAL "O2") - if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") - o2_add_library(GPUDataTypes + o2_add_library(GPUDataTypes TARGETVARNAME targetName PUBLIC_INCLUDE_DIRECTORIES . Definitions @@ -307,23 +277,13 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation - O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) + if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") add_compile_definitions(GPUCA_HAS_ONNX=1) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX) + target_link_libraries(${targetName} PUBLIC O2::ML) else() - o2_add_library(GPUDataTypes - TARGETVARNAME targetName - PUBLIC_INCLUDE_DIRECTORIES . 
- Definitions - DataTypes - PUBLIC_LINK_LIBRARIES O2::GPUUtils - O2::GPUCommon - O2::ReconstructionDataFormats - O2::TPCFastTransformation - PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC - SOURCES ${SRCS_DATATYPES}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) endif() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 9f811a8c8e604..806eb3d36ee51 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -41,6 +41,11 @@ #include #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizer.h" +#include "GPUTPCNNClusterizerInternals.h" +#endif + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -873,10 +878,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) return; } + if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; - if (nn_settings.applyNNclusterizer) { // Settings for the clusterizer + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerInternals nnSettingsInternal; + clusterer.nnInternals = &nnSettingsInternal; (clusterer.nnInternals)->nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; (clusterer.nnInternals)->nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; (clusterer.nnInternals)->nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -989,15 +996,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if ((clusterer.nnInternals)->nnClusterizerVerbosity < 3) { LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } - } else { +#else + GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif + } else { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 0); - -#ifdef GPUCA_HAS_ONNX } -#endif if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 329929aa23885..353b4abb51597 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -20,10 +20,6 @@ #include "GPUDataTypes.h" #include "CfFragment.h" -#ifdef GPUCA_HAS_ONNX -#include "GPUTPCNNClusterizerInternals.h" -#endif - namespace o2 { @@ -147,9 +143,7 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; -#ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizerInternals* nnInternals; -#endif + GPUTPCNNClusterizerInternals* nnInternals = nullptr; #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 131673e59564b..bfe0cbdb5b8b4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -13,6 +13,7 @@ /// \author Christian Sonnabend #include "GPUTPCNNClusterizer.h" +#include "GPUTPCNNClusterizerInternals.h" #include "GPUTPCCFClusterizer.h" #include "CfConsts.h" diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 1f16428ab39fa..6a5b9b749213b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -32,6 +32,7 @@ namespace o2::gpu class ClusterAccumulator; class MCLabelAccumulator; +class GPUTPCNNClusterizerInternals; class GPUTPCNNClusterizer : public GPUKernelTemplate { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx new file mode 100644 index 0000000000000..886f300c13544 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx @@ -0,0 +1,15 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerInternals.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerInternals.h" diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h index 03916d055703b..aebbb38ed3820 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h @@ -12,12 +12,14 @@ /// \file GPUTPCNNClusterizerInternals.h /// \author Christian Sonnabend +#ifndef O2_GPUTPCNNCLUSTERIZERINTERNALS_H +#define O2_GPUTPCNNCLUSTERIZERINTERNALS_H + #include "ML/3rdparty/GPUORTFloat16.h" #include "ML/OrtInterface.h" #include "ChargePos.h" -#ifndef O2_GPU_NN_CLUSTERIZER_INTERNALS_H -#define O2_GPU_NN_CLUSTERIZER_INTERNALS_H +using namespace o2::ml; namespace o2::gpu { From 70adf1ef9849d26da2c61092a0bdd808cd467b68 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 8 Mar 2025 17:42:21 +0100 Subject: [PATCH 45/77] Bug-fix for fullCI build --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- GPU/GPUTracking/kernels.cmake | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index bfe0cbdb5b8b4..9a461c5cd5627 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -213,7 +213,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); - CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1]; @@ -285,7 +285,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo { Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); - CPU_ONLY(MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index e71fa504792ab..35a9a1fefd748 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -23,7 +23,7 @@ o2_gpu_kernel_file_list(TPCMERGER ERRORS GPUTPCGMMerger.cxx GPUTPCGMSectorTrack. 
o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizationWithError.cxx Propagator.cxx TrackLTIntegral.cxx) o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) -o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx) +o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizer.cxx) o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) From 06e26a87087e3660b1247134f7ba1bba323ebb54 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 8 Mar 2025 18:12:10 +0100 Subject: [PATCH 46/77] Adding GPUd() for on-device functions --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 6 +++--- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 9a461c5cd5627..df07827abdb90 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -122,17 +122,17 @@ void GPUTPCNNClusterizer::inferenceNetworkReg2(processorType& clusterer, int8_t } // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +GPUd() int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) { return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); } -int GPUTPCNNClusterizer::rowOffset(int row, int global_shift) +GPUd() int GPUTPCNNClusterizer::rowOffset(int row, int global_shift) { return (row > 62 ? 
global_shift : 0); } -bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +GPUd() bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) { if (pad < 0 || row < 0) { // Faster short-circuit return true; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 6a5b9b749213b..01f4ddb8bf346 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -77,9 +77,9 @@ class GPUTPCNNClusterizer : public GPUKernelTemplate static void inferenceNetworkReg2(processorType&, int8_t = 0); private: - static int padOffset(int, int, const GPUTPCGeometry&); - static int rowOffset(int, int); - static bool isBoundary(int, int, int, const GPUTPCGeometry&); + static GPUd() int padOffset(int, int, const GPUTPCGeometry&); + static GPUd() int rowOffset(int, int); + static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); }; } // namespace o2::gpu From bedb592e6a8a59917b76ac87c431cdd316688dda Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 10 Mar 2025 09:27:38 +0100 Subject: [PATCH 47/77] Fixing compile issues, only thing mssing: conversion of float to float16 --- .../Global/GPUChainTrackingClusterizer.cxx | 3 +++ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 20 +++++++------------ .../GPUTPCNNClusterizerInternals.h | 3 +++ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 806eb3d36ee51..da345802a6e8d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -921,11 +921,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); + (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes = (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; } else { (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); + (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes = (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1]; (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[1]; (clusterer.nnInternals)->model_reg_2.init((clusterer.nnInternals)->OrtOptions); + (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes = (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1]; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index df07827abdb90..607e8fb2c1a90 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -59,17 +59,17 @@ template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; + uint 
elem_iterator = glo_idx * (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; pIdx++) { + for (float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { current_max_prob = (clusterer.nnInternals)->modelProbabilities[pIdx]; } else { class_label = ((clusterer.nnInternals)->modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1])); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" (clusterer.nnInternals)->outputDataClass[glo_idx + batchStart] = class_label; } @@ -216,9 +216,9 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1]; + int model_output_index = glo_idx * (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1] << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] == 1) { @@ -288,9 +288,9 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1]; + int model_output_index = glo_idx * (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1] << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] > 0) { @@ -323,9 +323,6 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { - if ((clusterer.nnInternals)->nnClusterizerVerbosity < 2) { - LOG(warning) << "[NN, CF] Cluster rejected!"; - } if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -354,9 +351,6 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { - if ((clusterer.nnInternals)->nnClusterizerVerbosity < 2) { - LOG(warning) << "[NN, CF] Cluster rejected!"; - } if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h index aebbb38ed3820..0e8d337a781d4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h @@ -40,6 +40,9 @@ class GPUTPCNNClusterizerInternals int nnClusterizerBoundaryFillValue = -1; int nnClusterizerDumpDigits = 0; int nnClusterizerApplyCfDeconvolution = 0; + int nnClusterizerModelClassNumOutputNodes = -1; + int nnClusterizerModelReg1NumOutputNodes = -1; + int nnClusterizerModelReg2NumOutputNodes = -1; // Memory allocation for neural network uint class2_elements = 0; From e88829890e13cef14a16fd279c1cca11f988c617 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 10 Mar 2025 10:45:50 +0100 Subject: [PATCH 48/77] Let's see if this does the trick --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 15 ++++++++------- GPU/GPUTracking/CMakeLists.txt | 3 +-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index db65328409d3c..cf604fef44404 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -9,6 +9,7 @@ #include #include #include +#include "GPUCommonDef.h" namespace o2 { @@ -43,7 +44,7 @@ static_assert( /// Shared implementation between public and internal classes. CRTP pattern. 
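The patch-47 hunks above stop calling model_*.getNumOutputNodes() from the kernels and instead read cached plain-integer counts (nnClusterizerModelClassNumOutputNodes, nnClusterizerModelReg1NumOutputNodes, nnClusterizerModelReg2NumOutputNodes) that are filled once in GPUChainTrackingClusterizer.cxx, so device-side code only performs index arithmetic on the flat network output. For reference, a standalone sketch of the intended argmax over one peak's block of class outputs (an illustrative helper, not the kernel's exact loop):

    // Each candidate peak owns a contiguous block of nOut values in the flat model output;
    // its class label is the index of the largest value inside that block.
    int argmaxLabelSketch(const float* modelProbabilities, unsigned int gloIdx, int nOut)
    {
      const float* block = modelProbabilities + gloIdx * nOut;
      int label = 0;
      for (int i = 1; i < nOut; i++) {
        if (block[i] > block[label]) {
          label = i;
        }
      }
      return label;
    }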
/// template -struct Float16Impl { +GPUd() struct Float16Impl { protected: /// /// Converts from float to uint16_t float16 representation @@ -267,7 +268,7 @@ union float32_bits { }; // namespace detail template -inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +GPUd() inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept { detail::float32_bits f{}; f.f = v; @@ -316,7 +317,7 @@ inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept } template -inline float Float16Impl::ToFloatImpl() const noexcept +GPUd() inline float Float16Impl::ToFloatImpl() const noexcept { constexpr detail::float32_bits magic = {113 << 23}; constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift @@ -349,7 +350,7 @@ inline float Float16Impl::ToFloatImpl() const noexcept /// Shared implementation between public and internal classes. CRTP pattern. template -struct BFloat16Impl { +GPUd() struct BFloat16Impl { protected: /// /// Converts from float to uint16_t float16 representation @@ -520,7 +521,7 @@ struct BFloat16Impl { }; template -inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept { uint16_t result; if (std::isnan(v)) { @@ -595,7 +596,7 @@ inline float BFloat16Impl::ToFloatImpl() const noexcept * * \endcode */ -struct Float16_t : OrtDataType::Float16Impl { +GPUd() struct Float16_t : OrtDataType::Float16Impl { private: /// /// Constructor from a 16-bit representation of a float16 value @@ -737,7 +738,7 @@ static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); * * \endcode */ -struct BFloat16_t : OrtDataType::BFloat16Impl { +GPUd() struct BFloat16_t : OrtDataType::BFloat16Impl { private: /// /// Constructor from a uint16_t representation of bfloat16 diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 5aaa8c44bc760..7d65978ab2e60 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -193,8 +193,7 @@ set(SRCS_NO_CINT ${SRCS_NO_CINT} Merger/GPUTPCGMO2Output.cxx) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") - list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizer.cxx) - list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx) + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx) endif() set(SRCS_DATATYPES From 21f56947f76bd10b7525390ac16b4ebf03b660d0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 10 Mar 2025 11:10:47 +0100 Subject: [PATCH 49/77] Making functions (constructors) GPUd() (GPUdDefault()) --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 100 +++++++++--------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index cf604fef44404..1ada9c7ffc426 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -44,26 +44,26 @@ static_assert( /// Shared implementation between public and internal classes. CRTP pattern. 
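Patch 49 moves the qualifier from the struct declarations of patch 48 onto the individual member functions and constructors. A short placement sketch, assuming that GPUd() and GPUdDefault() from GPUCommonDef.h expand to function execution-space qualifiers (for example __host__ __device__ when building for CUDA/HIP) and to nothing in host-only builds, which is why they belong on functions rather than on the type itself:

    #include <cstdint>
    #include "GPUCommonDef.h" // provides GPUd() / GPUdDefault()

    struct Half16Sketch {
      uint16_t val{0};
      GPUdDefault() Half16Sketch() = default;                  // defaulted constructor: GPUdDefault()
      GPUd() float ToFloat() const noexcept { return 0.f; }    // member function: GPUd() (real conversion omitted)
      GPUd() static Half16Sketch FromBits(uint16_t v) noexcept // static helpers as well
      {
        Half16Sketch h;
        h.val = v;
        return h;
      }
    };
    // GPUd() struct Half16Sketch { ... }; // the patch-48 form: an execution-space qualifier
    //                                     // carries no meaning on a type, hence the revert below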
/// template -GPUd() struct Float16Impl { +struct Float16Impl { protected: /// /// Converts from float to uint16_t float16 representation /// /// /// - constexpr static uint16_t ToUint16Impl(float v) noexcept; + GPUd() constexpr static uint16_t ToUint16Impl(float v) noexcept; /// /// Converts float16 to float /// /// float representation of float16 value - float ToFloatImpl() const noexcept; + GPUd() float ToFloatImpl() const noexcept; /// /// Creates an instance that represents absolute value. /// /// Absolute value - uint16_t AbsImpl() const noexcept + GPUd() uint16_t AbsImpl() const noexcept { return static_cast(val & ~kSignMask); } @@ -72,7 +72,7 @@ GPUd() struct Float16Impl { /// Creates a new instance with the sign flipped. /// /// Flipped sign instance - uint16_t NegateImpl() const noexcept + GPUd() uint16_t NegateImpl() const noexcept { return IsNaN() ? val : static_cast(val ^ kSignMask); } @@ -93,13 +93,13 @@ GPUd() struct Float16Impl { uint16_t val{0}; - Float16Impl() = default; + GPUdDefault() Float16Impl() = default; /// /// Checks if the value is negative /// /// true if negative - bool IsNegative() const noexcept + GPUd() bool IsNegative() const noexcept { return static_cast(val) < 0; } @@ -108,7 +108,7 @@ GPUd() struct Float16Impl { /// Tests if the value is NaN /// /// true if NaN - bool IsNaN() const noexcept + GPUd() bool IsNaN() const noexcept { return AbsImpl() > kPositiveInfinityBits; } @@ -117,7 +117,7 @@ GPUd() struct Float16Impl { /// Tests if the value is finite /// /// true if finite - bool IsFinite() const noexcept + GPUd() bool IsFinite() const noexcept { return AbsImpl() < kPositiveInfinityBits; } @@ -126,7 +126,7 @@ GPUd() struct Float16Impl { /// Tests if the value represents positive infinity. /// /// true if positive infinity - bool IsPositiveInfinity() const noexcept + GPUd() bool IsPositiveInfinity() const noexcept { return val == kPositiveInfinityBits; } @@ -135,7 +135,7 @@ GPUd() struct Float16Impl { /// Tests if the value represents negative infinity /// /// true if negative infinity - bool IsNegativeInfinity() const noexcept + GPUd() bool IsNegativeInfinity() const noexcept { return val == kNegativeInfinityBits; } @@ -144,7 +144,7 @@ GPUd() struct Float16Impl { /// Tests if the value is either positive or negative infinity. /// /// True if absolute value is infinity - bool IsInfinity() const noexcept + GPUd() bool IsInfinity() const noexcept { return AbsImpl() == kPositiveInfinityBits; } @@ -153,7 +153,7 @@ GPUd() struct Float16Impl { /// Tests if the value is NaN or zero. Useful for comparisons. /// /// True if NaN or zero. - bool IsNaNOrZero() const noexcept + GPUd() bool IsNaNOrZero() const noexcept { auto abs = AbsImpl(); return (abs == 0 || abs > kPositiveInfinityBits); @@ -163,7 +163,7 @@ GPUd() struct Float16Impl { /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). /// /// True if so - bool IsNormal() const noexcept + GPUd() bool IsNormal() const noexcept { auto abs = AbsImpl(); return (abs < kPositiveInfinityBits) // is finite @@ -175,7 +175,7 @@ GPUd() struct Float16Impl { /// Tests if the value is subnormal (denormal). /// /// True if so - bool IsSubnormal() const noexcept + GPUd() bool IsSubnormal() const noexcept { auto abs = AbsImpl(); return (abs < kPositiveInfinityBits) // is finite @@ -187,13 +187,13 @@ GPUd() struct Float16Impl { /// Creates an instance that represents absolute value. 
/// /// Absolute value - Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + GPUd() Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } /// /// Creates a new instance with the sign flipped. /// /// Flipped sign instance - Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + GPUd() Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } /// /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check @@ -203,12 +203,12 @@ GPUd() struct Float16Impl { /// first value /// second value /// True if both arguments represent zero - static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + GPUd() static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept { return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; } - bool operator==(const Float16Impl& rhs) const noexcept + GPUd() bool operator==(const Float16Impl& rhs) const noexcept { if (IsNaN() || rhs.IsNaN()) { // IEEE defines that NaN is not equal to anything, including itself. @@ -217,9 +217,9 @@ GPUd() struct Float16Impl { return val == rhs.val; } - bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + GPUd() bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } - bool operator<(const Float16Impl& rhs) const noexcept + GPUd() bool operator<(const Float16Impl& rhs) const noexcept { if (IsNaN() || rhs.IsNaN()) { // IEEE defines that NaN is unordered with respect to everything, including itself. @@ -350,26 +350,26 @@ GPUd() inline float Float16Impl::ToFloatImpl() const noexcept /// Shared implementation between public and internal classes. CRTP pattern. template -GPUd() struct BFloat16Impl { +struct BFloat16Impl { protected: /// /// Converts from float to uint16_t float16 representation /// /// /// - static uint16_t ToUint16Impl(float v) noexcept; + GPUd() static uint16_t ToUint16Impl(float v) noexcept; /// /// Converts bfloat16 to float /// /// float representation of bfloat16 value - float ToFloatImpl() const noexcept; + GPUd() float ToFloatImpl() const noexcept; /// /// Creates an instance that represents absolute value. /// /// Absolute value - uint16_t AbsImpl() const noexcept + GPUd() uint16_t AbsImpl() const noexcept { return static_cast(val & ~kSignMask); } @@ -378,7 +378,7 @@ GPUd() struct BFloat16Impl { /// Creates a new instance with the sign flipped. /// /// Flipped sign instance - uint16_t NegateImpl() const noexcept + GPUd() uint16_t NegateImpl() const noexcept { return IsNaN() ? val : static_cast(val ^ kSignMask); } @@ -401,13 +401,13 @@ GPUd() struct BFloat16Impl { uint16_t val{0}; - BFloat16Impl() = default; + GPUdDefault() BFloat16Impl() = default; /// /// Checks if the value is negative /// /// true if negative - bool IsNegative() const noexcept + GPUd() bool IsNegative() const noexcept { return static_cast(val) < 0; } @@ -416,7 +416,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is NaN /// /// true if NaN - bool IsNaN() const noexcept + GPUd() bool IsNaN() const noexcept { return AbsImpl() > kPositiveInfinityBits; } @@ -425,7 +425,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is finite /// /// true if finite - bool IsFinite() const noexcept + GPUd() bool IsFinite() const noexcept { return AbsImpl() < kPositiveInfinityBits; } @@ -434,7 +434,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value represents positive infinity. 
/// /// true if positive infinity - bool IsPositiveInfinity() const noexcept + GPUd() bool IsPositiveInfinity() const noexcept { return val == kPositiveInfinityBits; } @@ -443,7 +443,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value represents negative infinity /// /// true if negative infinity - bool IsNegativeInfinity() const noexcept + GPUd() bool IsNegativeInfinity() const noexcept { return val == kNegativeInfinityBits; } @@ -452,7 +452,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is either positive or negative infinity. /// /// True if absolute value is infinity - bool IsInfinity() const noexcept + GPUd() bool IsInfinity() const noexcept { return AbsImpl() == kPositiveInfinityBits; } @@ -461,7 +461,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is NaN or zero. Useful for comparisons. /// /// True if NaN or zero. - bool IsNaNOrZero() const noexcept + GPUd() bool IsNaNOrZero() const noexcept { auto abs = AbsImpl(); return (abs == 0 || abs > kPositiveInfinityBits); @@ -471,7 +471,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). /// /// True if so - bool IsNormal() const noexcept + GPUd() bool IsNormal() const noexcept { auto abs = AbsImpl(); return (abs < kPositiveInfinityBits) // is finite @@ -483,7 +483,7 @@ GPUd() struct BFloat16Impl { /// Tests if the value is subnormal (denormal). /// /// True if so - bool IsSubnormal() const noexcept + GPUd() bool IsSubnormal() const noexcept { auto abs = AbsImpl(); return (abs < kPositiveInfinityBits) // is finite @@ -495,13 +495,13 @@ GPUd() struct BFloat16Impl { /// Creates an instance that represents absolute value. /// /// Absolute value - Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + GPUd() Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } /// /// Creates a new instance with the sign flipped. /// /// Flipped sign instance - Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + GPUd() Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } /// /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check @@ -511,7 +511,7 @@ GPUd() struct BFloat16Impl { /// first value /// second value /// True if both arguments represent zero - static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + GPUd() static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept { // IEEE defines that positive and negative zero are equal, this gives us a quick equality check // for two values by or'ing the private bits together and stripping the sign. They are both zero, @@ -555,7 +555,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept } template -inline float BFloat16Impl::ToFloatImpl() const noexcept +GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept { if (IsNaN()) { return std::numeric_limits::quiet_NaN(); @@ -596,7 +596,7 @@ inline float BFloat16Impl::ToFloatImpl() const noexcept * * \endcode */ -GPUd() struct Float16_t : OrtDataType::Float16Impl { +struct Float16_t : OrtDataType::Float16Impl { private: /// /// Constructor from a 16-bit representation of a float16 value @@ -611,26 +611,26 @@ GPUd() struct Float16_t : OrtDataType::Float16Impl { /// /// Default constructor /// - Float16_t() = default; + GPUdDefault() Float16_t() = default; /// /// Explicit conversion to uint16_t representation of float16. 
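With the qualifiers on the member functions, host-side usage of the public wrapper stays unchanged. A small usage sketch (the o2::OrtDataType namespace and the numeric values are assumptions for illustration; the members themselves appear in the hunks above):

    #include "ML/3rdparty/GPUORTFloat16.h"

    void float16RoundTripSketch()
    {
      o2::OrtDataType::Float16_t h(0.5f);                      // explicit construction from float
      float back = h.ToFloat();                                // back to float32, yields 0.5f
      auto raw = o2::OrtDataType::Float16_t::FromBits(0x3800); // 0x3800 is 0.5 in IEEE half precision
      (void)back;
      (void)raw;
    }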
/// /// uint16_t bit representation of float16 /// new instance of Float16_t - constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } /// /// __ctor from float. Float is converted into float16 16-bit representation. /// /// float value - explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } /// /// Converts float16 to float /// /// float representation of float16 value - float ToFloat() const noexcept { return Base::ToFloatImpl(); } + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } /// /// Checks if the value is negative @@ -738,7 +738,7 @@ static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); * * \endcode */ -GPUd() struct BFloat16_t : OrtDataType::BFloat16Impl { +struct BFloat16_t : OrtDataType::BFloat16Impl { private: /// /// Constructor from a uint16_t representation of bfloat16 @@ -752,26 +752,26 @@ GPUd() struct BFloat16_t : OrtDataType::BFloat16Impl { public: using Base = OrtDataType::BFloat16Impl; - BFloat16_t() = default; + GPUdDefault() BFloat16_t() = default; /// /// Explicit conversion to uint16_t representation of bfloat16. /// /// uint16_t bit representation of bfloat16 /// new instance of BFloat16_t - static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } /// /// __ctor from float. Float is converted into bfloat16 16-bit representation. /// /// float value - explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } /// /// Converts bfloat16 to float /// /// float representation of bfloat16 value - float ToFloat() const noexcept { return Base::ToFloatImpl(); } + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } /// /// Checks if the value is negative From 66da84e9f3b25aaa94321d551c369dfe54ac6baf Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 10 Mar 2025 12:58:55 +0100 Subject: [PATCH 50/77] GPU kernels should now be findable --- GPU/GPUTracking/Definitions/GPUDefGPUParameters.h | 6 ++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 4d7f00ee1e21d..19ad0ace94a66 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -513,6 +513,12 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression +#define GPUCA_LB_GPUTPCNNClusterizer_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizer_fillInputNN GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizer_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizer_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizer_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizer_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN #define 
GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_THREAD_COUNT_SCAN diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 14280d3f879e3..50b3accab5337 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -242,7 +242,7 @@ AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDecon AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") -AddOption(nnClusterizerApplyNoiseSupression, int, 1, "", 0, "Applies the NoiseSupression kernel before the digits to the network are filled") +AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") From e8af1c29eef6da654931671f1699b659cef42fa6 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 10 Mar 2025 14:07:25 +0100 Subject: [PATCH 51/77] Adding ifdefs for standalone build and header exclusions in GPUORTFloat16 --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 3 +++ .../Definitions/GPUDefGPUParameters.h | 4 ++++ GPU/GPUTracking/kernels.cmake | 17 +++++++++++------ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 1ada9c7ffc426..b343416f5380c 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -5,10 +5,13 @@ // - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h // - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h +#ifndef GPUCA_GPUCODE_DEVICE #include #include #include #include +#endif + #include "GPUCommonDef.h" namespace o2 diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 19ad0ace94a66..d7d5ea4e02de7 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -513,12 +513,16 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression + +#ifdef GPUCA_HAS_ONNX #define GPUCA_LB_GPUTPCNNClusterizer_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCNNClusterizer_fillInputNN GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCNNClusterizer_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCNNClusterizer_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCNNClusterizer_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizer #define GPUCA_LB_GPUTPCNNClusterizer_publishClass2Regression 
GPUCA_LB_GPUTPCNNClusterizer +#endif + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_THREAD_COUNT_SCAN diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 35a9a1fefd748..29d90908afa2f 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -24,6 +24,9 @@ o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizati o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizer.cxx) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizer.cxx) +endif() o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) @@ -110,12 +113,14 @@ o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) -o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression" "= TPCCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +endif() o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO single GPUTPCLinearLabels* out) From 08753ddc3373ec618450075e830b23537d9120fb Mon Sep 17 00:00:00 
2001 From: Christian Sonnabend Date: Tue, 11 Mar 2025 00:51:40 +0100 Subject: [PATCH 52/77] Modifying the approach to not use std:: types. Still needs to be tested and need to do proper memory allocation --- Common/ML/include/ML/OrtInterface.h | 6 + Common/ML/src/OrtInterface.cxx | 77 +++++++++ GPU/GPUTracking/Base/GPUMemoryResource.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 105 ++++-------- .../TPCClusterFinder/GPUTPCClusterFinder.h | 35 +++- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 157 ++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 - .../GPUTPCNNClusterizerInternals.cxx | 88 ++++++++++ .../GPUTPCNNClusterizerInternals.h | 61 +++---- 9 files changed, 341 insertions(+), 190 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 9c1ca3250187f..9384689103f0f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -53,6 +53,12 @@ class OrtModel template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h std::vector inference(std::vector&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + O* inference(I*, size_t); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + void inference(I*, size_t, O*); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector inference(std::vector>&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index e5c784a31f6de..1aec841c16656 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -198,6 +198,29 @@ std::vector OrtModel::inference(std::vector& input) return outputValuesVec; } +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +O* OrtModel::inference(I* input, size_t input_size) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + return outputValues; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +void OrtModel::inference(I* input, size_t input_size, O* output) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + output = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); +} + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector OrtModel::inference(std::vector>& input) { @@ -280,6 +303,60 @@ std::vector OrtModel::inference// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +float* OrtModel::inference(float* input, size_t input_size) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + return outputValues; +} + +template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +float* OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + return outputValues; +} + +template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +void OrtModel::inference(float* input, size_t input_size, float* output) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + Ort::Value inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + + std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; + size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]); + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + + (pImplOrt->session)->Run(pImplOrt->runOptions, + inputNamesChar.data(), &inputTensor, 1, + outputNamesChar.data(), &outputTensor, 1); +} + +template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +void OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size, float* output) +{ + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + Ort::Value inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + + std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; + size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]); + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + + (pImplOrt->session)->Run(pImplOrt->runOptions, + inputNamesChar.data(), &inputTensor, 1, + outputNamesChar.data(), &outputTensor, 1); +} + template <> std::vector OrtModel::inference(std::vector>& input) { diff --git a/GPU/GPUTracking/Base/GPUMemoryResource.h b/GPU/GPUTracking/Base/GPUMemoryResource.h index 3bb2c363db2a9..06e350db0bfc7 100644 --- a/GPU/GPUTracking/Base/GPUMemoryResource.h +++ b/GPU/GPUTracking/Base/GPUMemoryResource.h @@ -28,6 +28,7 @@ struct GPUMemoryReuse { }; enum Group : uint16_t { ClustererScratch, + NNClusterer, ClustererZS, TrackerScratch, TrackerDataLinks, diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index da345802a6e8d..b5656269f8129 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -12,8 +12,6 @@ /// \file GPUChainTrackingClusterizer.cxx /// \author David Rohr -#include - #include "GPUChainTracking.h" #include "GPUChainTrackingDefs.h" #include "GPULogging.h" @@ -882,104 +880,59 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX // Settings for the clusterizer GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerInternals nnSettingsInternal; - clusterer.nnInternals = &nnSettingsInternal; - (clusterer.nnInternals)->nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - (clusterer.nnInternals)->nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - (clusterer.nnInternals)->nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - (clusterer.nnInternals)->nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - (clusterer.nnInternals)->nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - (clusterer.nnInternals)->nnClusterizerElementSize = ((2 * 
(clusterer.nnInternals)->nnClusterizerSizeInputRow + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputPad + 1) * (2 * (clusterer.nnInternals)->nnClusterizerSizeInputTime + 1)) + ((clusterer.nnInternals)->nnClusterizerAddIndexData ? 3 : 0); - (clusterer.nnInternals)->nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - (clusterer.nnInternals)->nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clusterer.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clusterer.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clusterer.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clusterer.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clusterer.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); + clusterer.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clusterer.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; if (nn_settings.nnClusterizerVerbosity < 0) { - (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + clusterer.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { - (clusterer.nnInternals)->nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + clusterer.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } // Settings for the NN evaluation - (clusterer.nnInternals)->nnClassThreshold = nn_settings.nnClassThreshold; - (clusterer.nnInternals)->nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - - // Settings for the neural network evaluation - (clusterer.nnInternals)->OrtOptions = { - {"model-path", nn_settings.nnClassificationPath}, - {"device", nn_settings.nnInferenceDevice}, - {"device-id", std::to_string(nn_settings.nnInferenceDeviceId)}, - {"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)}, - {"dtype", nn_settings.nnInferenceDtype}, - {"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)}, - {"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)}, - {"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)}, - {"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}}; - (clusterer.nnInternals)->model_class.init((clusterer.nnInternals)->OrtOptions); - std::vector reg_model_paths = o2::utils::Str::tokenize(nn_settings.nnRegressionPath, ':'); - - if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { - if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { - (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; - (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); - (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes = (clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1]; - } else { - (clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[0]; - (clusterer.nnInternals)->model_reg_1.init((clusterer.nnInternals)->OrtOptions); - (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes = (clusterer.nnInternals)->model_reg_1.getNumOutputNodes()[0][1]; - 
(clusterer.nnInternals)->OrtOptions["model-path"] = reg_model_paths[1]; - (clusterer.nnInternals)->model_reg_2.init((clusterer.nnInternals)->OrtOptions); - (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes = (clusterer.nnInternals)->model_reg_2.getNumOutputNodes()[0][1]; - } - } + clusterer.nnClassThreshold = nn_settings.nnClassThreshold; + clusterer.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + + GPUTPCNNClusterizerInternals nnApplication(GetProcessingSettings(), clusterer); - if ((clusterer.nnInternals)->nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clusterer.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if ((clusterer.nnInternals)->nnSigmoidTrafoClassThreshold) { + if (clusterer.nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - (clusterer.nnInternals)->nnClassThreshold = (float)std::log((clusterer.nnInternals)->nnClassThreshold / (1.f - (clusterer.nnInternals)->nnClassThreshold)); + clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); } float time_clusterizer = 0, time_fill = 0; - int evalDtype = (clusterer.nnInternals)->OrtOptions["dtype"].find("32") != std::string::npos; - (clusterer.nnInternals)->outputDataClass.resize(clusterer.mPmemory->counters.nClusters, -1); + int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / (clusterer.nnInternals)->nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * (clusterer.nnInternals)->nnClusterizerBatchedMode; - uint iSize = CAMath::Min((uint)(clusterer.nnInternals)->nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - - (clusterer.nnInternals)->clusterFlags.clear(); - (clusterer.nnInternals)->peakPositions.clear(); - (clusterer.nnInternals)->centralCharges.clear(); - - (clusterer.nnInternals)->clusterFlags.resize(iSize, {0, 0}); - (clusterer.nnInternals)->peakPositions.resize(iSize); - (clusterer.nnInternals)->centralCharges.resize(iSize); - - if (evalDtype == 1) { - (clusterer.nnInternals)->inputData32.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (float)((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); - } else { - (clusterer.nnInternals)->inputData16.resize(iSize * (clusterer.nnInternals)->nnClusterizerElementSize, (OrtDataType::Float16_t)((float)(clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); - } + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clusterer.nnClusterizerBatchedMode; + uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - GPUTPCNNClusterizer::inferenceNetworkClass(clusterer, evalDtype); - if 
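The inverse-sigmoid step above moves the probability cut into logit space, so the raw classifier output can be thresholded without applying a sigmoid per candidate. A small numeric sketch using the default threshold from this series:

  // logit(p) = log(p / (1 - p)); comparing the raw output x > logit(p) is equivalent to sigmoid(x) > p.
  #include <cmath>
  #include <cstdio>

  int main()
  {
    const float p = 0.16f;                       // default nnClassThreshold
    const float logit = std::log(p / (1.f - p)); // roughly -1.66
    std::printf("class threshold in logit space: %.3f\n", logit);
    return 0;
  }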
((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] == 1) { + nnApplication.inferenceNetworkClass(clusterer, evalDtype, batchStart); + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels } else { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels } - if (!(clusterer.nnInternals)->nnClusterizerUseCfRegression) { - GPUTPCNNClusterizer::inferenceNetworkReg1(clusterer, evalDtype); + if (!clusterer.nnClusterizerUseCfRegression) { + nnApplication.inferenceNetworkReg1(clusterer, evalDtype, batchStart); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 - if ((clusterer.nnInternals)->model_class.getNumOutputNodes()[0][1] > 1 && reg_model_paths.size() > 1) { - GPUTPCNNClusterizer::inferenceNetworkReg2(clusterer, evalDtype); + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + nnApplication.inferenceNetworkReg2(clusterer, evalDtype, batchStart); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } @@ -990,13 +943,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); - if ((clusterer.nnInternals)->nnClusterizerUseCfRegression) { + if (clusterer.nnClusterizerUseCfRegression) { runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if ((clusterer.nnInternals)->nnClusterizerVerbosity < 3) { + if (clusterer.nnClusterizerVerbosity < 3) { LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } #else diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 353b4abb51597..615880494d4a7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,6 +19,7 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" +#include "ML/3rdparty/GPUORTFloat16.h" namespace o2 { @@ -143,7 +144,39 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; - GPUTPCNNClusterizerInternals* nnInternals = nullptr; + // Neural network clusterization + + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCfRegression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; + 
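The surrounding batch loop chops the candidate list into chunks of nnClusterizerBatchedMode and shortens the last chunk with a min(). A standalone sketch of that index arithmetic (plain STL in place of CAMath):

  // Sketch of the batchStart / iSize arithmetic used by the NN clusterizer loop.
  #include <algorithm>
  #include <cmath>
  #include <cstdio>

  int main()
  {
    const unsigned int nClusters = 1000; // stands in for clusterer.mPmemory->counters.nClusters
    const unsigned int batchSize = 64;   // stands in for nnClusterizerBatchedMode
    const int nBatches = static_cast<int>(std::ceil(static_cast<float>(nClusters) / batchSize));
    for (int batch = 0; batch < nBatches; ++batch) {
      const unsigned int batchStart = batch * batchSize;
      const unsigned int iSize = std::min(batchSize, nClusters - batchStart); // last batch may be shorter
      std::printf("batch %d: start %u, size %u\n", batch, batchStart, iSize);
    }
    return 0;
  }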
int nnClusterizerDumpDigits = 0; + int nnClusterizerApplyCfDeconvolution = 0; + int nnClusterizerModelClassNumOutputNodes = -1; + int nnClusterizerModelReg1NumOutputNodes = -1; + int nnClusterizerModelReg2NumOutputNodes = -1; + uint nnClusterizerCurrentSize = -1; // This variable determines the size of the memory pointers. It will be set at runtime. + int nnClusterizerDtype = 0; // 0: float16, 1: float32 + + // Memory allocation for neural network + uint class2_elements = 0; + float* inputData32=nullptr; + OrtDataType::Float16_t* inputData16=nullptr; + float* outputDataClass=nullptr; + float* modelProbabilities=nullptr; + float* outputDataReg1=nullptr; + float* outputDataReg2=nullptr; + + ChargePos* peakPositions=nullptr; + bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx + float* centralCharges=nullptr; #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 607e8fb2c1a90..d4b7ba03bdd5a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -13,7 +13,6 @@ /// \author Christian Sonnabend #include "GPUTPCNNClusterizer.h" -#include "GPUTPCNNClusterizerInternals.h" #include "GPUTPCCFClusterizer.h" #include "CfConsts.h" @@ -32,7 +31,7 @@ template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if ((clusterer.nnInternals)->outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices return; } Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -52,25 +51,25 @@ template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - (clusterer.nnInternals)->outputDataClass[glo_idx + batchStart] = (int)((clusterer.nnInternals)->modelProbabilities[glo_idx] > (clusterer.nnInternals)->nnClassThreshold); + clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); } template <> GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes; + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (float pIdx = elem_iterator; pIdx < elem_iterator + (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes; pIdx++) { + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { - current_max_prob = 
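With the switch from std::vector members to raw pointers above, the per-candidate layout becomes implicit: inputData uses a stride of nnClusterizerElementSize and clusterFlags keeps the two split flags at 2*i and 2*i+1 (split-in-time, split-in-pad, following the member comment). A small indexing sketch under those assumptions; the view struct is illustrative and not part of the patch:

  // Indexing convention for the flat NN clusterizer buffers (illustrative helper, not the allocation code).
  #include <cstddef>

  struct NNBufferView {
    float* inputData32; // nCandidates * elementSize entries
    bool* clusterFlags; // 2 * nCandidates entries
    int elementSize;
  };

  inline float* inputOfCandidate(const NNBufferView& b, std::size_t i) { return b.inputData32 + i * b.elementSize; }
  inline bool& splitInTime(const NNBufferView& b, std::size_t i) { return b.clusterFlags[2 * i]; }
  inline bool& splitInPad(const NNBufferView& b, std::size_t i) { return b.clusterFlags[2 * i + 1]; }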
(clusterer.nnInternals)->modelProbabilities[pIdx]; + current_max_prob = clusterer.modelProbabilities[pIdx]; } else { - class_label = ((clusterer.nnInternals)->modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + (clusterer.nnInternals)->nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - (clusterer.nnInternals)->outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; } template <> @@ -93,34 +92,6 @@ GPUdii() void GPUTPCNNClusterizer::ThreadmodelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData16); - } else { - (clusterer.nnInternals)->modelProbabilities = (clusterer.nnInternals)->model_class.inference((clusterer.nnInternals)->inputData32); - } -} - -void GPUTPCNNClusterizer::inferenceNetworkReg1(processorType& clusterer, int8_t dtype) -{ - if (dtype == 0) { - (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData16); - } else { - (clusterer.nnInternals)->outputDataReg1 = (clusterer.nnInternals)->model_reg_1.inference((clusterer.nnInternals)->inputData32); - } -} - -void GPUTPCNNClusterizer::inferenceNetworkReg2(processorType& clusterer, int8_t dtype) -{ - if (dtype == 0) { - (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData16); - } else { - (clusterer.nnInternals)->outputDataReg2 = (clusterer.nnInternals)->model_reg_2.inference((clusterer.nnInternals)->inputData32); - } -} - // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary GPUd() int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) { @@ -156,55 +127,55 @@ GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads uint glo_idx = get_global_id(0); - uint write_idx = glo_idx * (clusterer.nnInternals)->nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - (clusterer.nnInternals)->peakPositions[glo_idx] = peak; - (clusterer.nnInternals)->centralCharges[glo_idx] = central_charge; + clusterer.peakPositions[glo_idx] = peak; + clusterer.centralCharges[glo_idx] = central_charge; - int row_offset = GPUTPCNNClusterizer::rowOffset(row, (clusterer.nnInternals)->nnClusterizerSizeInputRow); + int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); 
GPUCA_UNROLL(U(), U()); - for (int r = -(clusterer.nnInternals)->nnClusterizerSizeInputRow; r <= (clusterer.nnInternals)->nnClusterizerSizeInputRow; r++) { + for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - for (int p = -(clusterer.nnInternals)->nnClusterizerSizeInputPad + pad_offset; p <= (clusterer.nnInternals)->nnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, (clusterer.nnInternals)->nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for (int t = -(clusterer.nnInternals)->nnClusterizerSizeInputTime; t <= (clusterer.nnInternals)->nnClusterizerSizeInputTime; t++) { + for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !(clusterer.nnInternals)->clusterFlags[glo_idx][0] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - (clusterer.nnInternals)->clusterFlags[glo_idx][0] = CfUtils::isPeak(isPeakMap[tmp_pos]); - (clusterer.nnInternals)->clusterFlags[glo_idx][1] = (clusterer.nnInternals)->clusterFlags[glo_idx][0]; + if (r == 0 && !clusterer.clusterFlags[2*glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clusterer.clusterFlags[2*glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clusterer.clusterFlags[2*glo_idx + 1] = clusterer.clusterFlags[2*glo_idx]; } if (dtype == 0) { - (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else { - (clusterer.nnInternals)->inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue)); + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); } else { - (clusterer.nnInternals)->inputData32[write_idx] = static_cast((clusterer.nnInternals)->nnClusterizerBoundaryFillValue); + clusterer.inputData32[write_idx] = static_cast(clusterer.nnClusterizerBoundaryFillValue); } } write_idx++; } } } - if ((clusterer.nnInternals)->nnClusterizerAddIndexData) { + if (clusterer.nnClusterizerAddIndexData) { if (dtype == 0) { - (clusterer.nnInternals)->inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); - (clusterer.nnInternals)->inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - 
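Each window entry written by the loop above is either the charge normalized to the central (peak) charge, or the boundary fill value when the position falls outside the detector, so the tensor always has the full element size. The per-entry rule condensed into one function (charge corresponds to the charge-map value at the window position):

  // Per-entry fill rule of the NN input window (sketch; the boundary decision itself follows the loop above).
  float windowEntry(bool isBoundary, float charge, float centralCharge, int boundaryFillValue)
  {
    return isBoundary ? static_cast<float>(boundaryFillValue) : charge / centralCharge;
  }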
(clusterer.nnInternals)->inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); + clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); } else { - (clusterer.nnInternals)->inputData32[write_idx] = clusterer.mISector / 36.f; - (clusterer.nnInternals)->inputData32[write_idx + 1] = row / 152.f; - (clusterer.nnInternals)->inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); + clusterer.inputData32[write_idx] = clusterer.mISector / 36.f; + clusterer.inputData32[write_idx + 1] = row / 152.f; + clusterer.inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); } } } @@ -216,22 +187,22 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes; + int model_output_index = glo_idx * clusterer.nnClusterizerModelReg1NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->nnClusterizerModelReg1NumOutputNodes << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] == 1) { + if (clusterer.outputDataClass[full_glo_idx] == 1) { ClusterAccumulator pc; // Publishing logic is taken from default clusterizer if (onlyMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect((clusterer.nnInternals)->peakPositions[glo_idx], chargeMap[(clusterer.nnInternals)->peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - (clusterer.nnInternals)->peakPositions[glo_idx], + clusterer.peakPositions[glo_idx], smem.posBcast, smem.buf, smem.innerAboveThreshold, @@ -239,17 +210,23 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap((clusterer.nnInternals)->peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg1[model_output_index + 4], static_cast((clusterer.nnInternals)->peakPositions[glo_idx].pad()) + (clusterer.nnInternals)->outputDataReg1[model_output_index], (clusterer.nnInternals)->outputDataReg1[model_output_index + 2], static_cast((clusterer.mPmemory->fragment).start) + 
static_cast((clusterer.nnInternals)->peakPositions[glo_idx].time()) + (clusterer.nnInternals)->outputDataReg1[model_output_index + 1], (clusterer.nnInternals)->outputDataReg1[model_output_index + 3], 0, 0); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], + static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index], + clusterer.outputDataReg1[model_output_index + 2], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1], + clusterer.outputDataReg1[model_output_index + 3], + clusterer.clusterFlags[2*glo_idx], + clusterer.clusterFlags[2*glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -262,7 +239,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - (clusterer.nnInternals)->peakPositions[glo_idx].row(), + clusterer.peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -272,7 +249,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -288,21 +265,21 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (onlyMC) ? 
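Reading the new setFull(...) call above positionally, the five regression outputs per candidate act as pad and time corrections, two widths and a charge factor. This assumes the accumulator arguments are ordered (charge, pad, pad width, time, time width, split flags), which is how they appear here; treat the sketch as a reading aid rather than a definition:

  // Assumed layout of the 5 regression-1 outputs per candidate, read off the setFull call above.
  struct Reg1Output {
    float dPad;      // out[0], added to the peak pad position
    float dTime;     // out[1], added to fragment start + peak time
    float sigmaPad;  // out[2] (assuming the third setFull argument is the pad width)
    float sigmaTime; // out[3] (assuming the fifth setFull argument is the time width)
    float qFactor;   // out[4], multiplied by the central charge
  };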
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes; + int model_output_index = glo_idx * clusterer.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << (clusterer.nnInternals)->outputDataReg1.size() << " / " << (clusterer.nnInternals)->nnClusterizerModelReg2NumOutputNodes << " -- " << (clusterer.nnInternals)->peakPositions.size() << " -- " << (clusterer.nnInternals)->centralCharges.size(); + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.nnClusterizerModelReg2NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - if ((clusterer.nnInternals)->outputDataClass[full_glo_idx] > 0) { + if (clusterer.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; if (onlyMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect((clusterer.nnInternals)->peakPositions[glo_idx], chargeMap[(clusterer.nnInternals)->peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - (clusterer.nnInternals)->peakPositions[glo_idx], + clusterer.peakPositions[glo_idx], smem.posBcast, smem.buf, smem.innerAboveThreshold, @@ -310,7 +287,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap((clusterer.nnInternals)->peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -318,10 +295,16 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } // Cluster 1 - pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg2[model_output_index + 8], (clusterer.nnInternals)->peakPositions[glo_idx].pad() + (clusterer.nnInternals)->outputDataReg2[model_output_index], (clusterer.nnInternals)->outputDataReg2[model_output_index + 4], (clusterer.mPmemory->fragment).start + (clusterer.nnInternals)->peakPositions[glo_idx].time() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 2], (clusterer.nnInternals)->outputDataReg2[model_output_index + 6], 0, 0); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 8], + static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index], + clusterer.outputDataReg2[model_output_index + 4], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 2], + clusterer.outputDataReg2[model_output_index + 6], + clusterer.clusterFlags[2*glo_idx], + clusterer.clusterFlags[2*glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = 
clusterer.mNMaxClusterPerRow; @@ -334,7 +317,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - (clusterer.nnInternals)->peakPositions[glo_idx].row(), + clusterer.peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -344,12 +327,18 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull((clusterer.nnInternals)->centralCharges[glo_idx] * (clusterer.nnInternals)->outputDataReg2[model_output_index + 9], (clusterer.nnInternals)->peakPositions[glo_idx].pad() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 1], (clusterer.nnInternals)->outputDataReg2[model_output_index + 5], (clusterer.mPmemory->fragment).start + (clusterer.nnInternals)->peakPositions[glo_idx].time() + (clusterer.nnInternals)->outputDataReg2[model_output_index + 3], (clusterer.nnInternals)->outputDataReg2[model_output_index + 7], 0, 0); - - rejectCluster = !pc.toNative((clusterer.nnInternals)->peakPositions[glo_idx], (clusterer.nnInternals)->centralCharges[glo_idx], myCluster, clusterer.Param()); + pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 9], + static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index + 1], + clusterer.outputDataReg2[model_output_index + 5], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 3], + clusterer.outputDataReg2[model_output_index + 7], + clusterer.clusterFlags[2*glo_idx], + clusterer.clusterFlags[2*glo_idx + 1]); + + rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -361,7 +350,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - (clusterer.nnInternals)->peakPositions[glo_idx].row(), + clusterer.peakPositions[glo_idx].row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -371,7 +360,7 @@ GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemo } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - // CPU_ONLY(labelAcc->commit((clusterer.nnInternals)->peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? 
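For the split (two-cluster) regression the ten outputs are grouped per quantity rather than per cluster: the two setFull(...) calls above use the even index for the first and the odd index for the second cluster within each pair. Again assuming the same setFull argument order as in the single-cluster case, the layout reads as follows:

  // Assumed layout of the 10 regression-2 outputs per candidate (cluster 1 / cluster 2):
  //   pad offset   : out[0] / out[1]
  //   time offset  : out[2] / out[3]
  //   pad width    : out[4] / out[5]
  //   time width   : out[6] / out[7]
  //   charge factor: out[8] / out[9]
  inline const float* reg2OutputsOfCandidate(const float* outputDataReg2, int candidate, int nOutputNodes)
  {
    return outputDataReg2 + candidate * nOutputNodes; // model_output_index in the kernel above
  }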
} else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 01f4ddb8bf346..534446af67828 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -32,7 +32,6 @@ namespace o2::gpu class ClusterAccumulator; class MCLabelAccumulator; -class GPUTPCNNClusterizerInternals; class GPUTPCNNClusterizer : public GPUKernelTemplate { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx index 886f300c13544..59c59a26a1d10 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx @@ -13,3 +13,91 @@ /// \author Christian Sonnabend #include "GPUTPCNNClusterizerInternals.h" + +using namespace o2::gpu; + +GPUTPCNNClusterizerInternals::GPUTPCNNClusterizerInternals(GPUSettingsProcessing settings, processorType& clusterer) { + clusterer_internal = &clusterer; + GPUSettingsProcessingNNclusterizer nn_settings = settings.nn; + OrtOptions = {{"model-path", nn_settings.nnClassificationPath}, + {"device", nn_settings.nnInferenceDevice}, + {"device-id", std::to_string(nn_settings.nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)}, + {"dtype", nn_settings.nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)}, + {"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)}, + {"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}}; + sector = clusterer.mISector; + + + model_class.init(OrtOptions); + reg_model_paths = splitString(nn_settings.nnRegressionPath, ":"); + + if (!nn_settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + } else { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["model-path"] = reg_model_paths[1]; + model_reg_2.init(OrtOptions); + clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +void* GPUTPCNNClusterizerInternals::setIOPointers(void* mem) { + if (clusterer_internal->nnClusterizerDtype == 0){ + computePointerWithAlignment(mem, clusterer_internal->inputData16, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize); + } else if (clusterer_internal->nnClusterizerDtype == 1){ + computePointerWithAlignment(mem, clusterer_internal->inputData32, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize); + } + computePointerWithAlignment(mem, clusterer_internal->outputDataClass, clusterer_internal->nnClusterizerCurrentSize); + computePointerWithAlignment(mem, clusterer_internal->modelProbabilities, clusterer_internal->nnClusterizerCurrentSize * 
clusterer_internal->nnClusterizerModelClassNumOutputNodes); + computePointerWithAlignment(mem, clusterer_internal->outputDataReg1, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg1NumOutputNodes); + computePointerWithAlignment(mem, clusterer_internal->outputDataReg2, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, clusterer_internal->peakPositions, clusterer_internal->nnClusterizerCurrentSize); + computePointerWithAlignment(mem, clusterer_internal->clusterFlags, 2*clusterer_internal->nnClusterizerCurrentSize); + computePointerWithAlignment(mem, clusterer_internal->centralCharges, clusterer_internal->nnClusterizerCurrentSize); + + return mem; +} + +void GPUTPCNNClusterizerInternals::RegisterMemoryAllocation() { + AllocateAndInitializeLate(); + int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; + mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizerInternals::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(sector % mRec->GetProcessingSettings().nTPCClustererLanes)}); +} + +// Apply the neural network to the input data. Note: These are not GPU kernels. We let ONNX take care of that +void GPUTPCNNClusterizerInternals::inferenceNetworkClass(processorType& clusterer, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_class.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); + } else { + model_class.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); + } +} + +void GPUTPCNNClusterizerInternals::inferenceNetworkReg1(processorType& clusterer, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_reg_1.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); + } else { + model_reg_1.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); + } +} + +void GPUTPCNNClusterizerInternals::inferenceNetworkReg2(processorType& clusterer, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_reg_2.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); + } else { + model_reg_2.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); + } +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h index 0e8d337a781d4..ef027e536969d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h @@ -15,47 +15,52 @@ #ifndef O2_GPUTPCNNCLUSTERIZERINTERNALS_H #define O2_GPUTPCNNCLUSTERIZERINTERNALS_H -#include "ML/3rdparty/GPUORTFloat16.h" #include "ML/OrtInterface.h" #include "ChargePos.h" +#include "GPUReconstruction.h" +#include "GPUProcessor.h" +#include "GPUTPCClusterFinder.h" +#include "GPUHostDataTypes.h" using namespace o2::ml; namespace o2::gpu { -class GPUTPCNNClusterizerInternals +class 
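The regression models are configured through a single nnRegressionPath string holding one or two ONNX files separated by a colon; the second file is only used when the classification network distinguishes more than one cluster class. A small illustration using the same tokenization as the splitString helper declared in GPUTPCNNClusterizerInternals.h below (file names are placeholders):

  // Colon-separated regression model paths, tokenized as in this patch's splitString helper.
  #include <cstddef>
  #include <string>
  #include <vector>

  std::vector<std::string> splitString(const std::string& input, const std::string& delimiter)
  {
    std::vector<std::string> tokens;
    std::size_t pos = 0, found;
    while ((found = input.find(delimiter, pos)) != std::string::npos) {
      tokens.push_back(input.substr(pos, found - pos));
      pos = found + delimiter.length();
    }
    tokens.push_back(input.substr(pos));
    return tokens;
  }

  // splitString("reg_single.onnx:reg_split.onnx", ":") -> {"reg_single.onnx", "reg_split.onnx"}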
GPUTPCNNClusterizerInternals : public GPUProcessor { public: - int nnClusterizerSizeInputRow = 3; - int nnClusterizerSizeInputPad = 3; - int nnClusterizerSizeInputTime = 3; - int nnClusterizerElementSize = -1; - bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; - bool nnSigmoidTrafoClassThreshold = 1; - int nnClusterizerUseCfRegression = 0; - int nnClusterizerBatchedMode = 1; - int nnClusterizerVerbosity = 0; - int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; - int nnClusterizerModelClassNumOutputNodes = -1; - int nnClusterizerModelReg1NumOutputNodes = -1; - int nnClusterizerModelReg2NumOutputNodes = -1; - - // Memory allocation for neural network - uint class2_elements = 0; - std::vector inputData32; - std::vector inputData16; - std::vector outputDataClass, modelProbabilities, outputDataReg1, outputDataReg2; - - std::vector peakPositions; - std::vector> clusterFlags; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cxx - std::vector centralCharges; + typedef GPUTPCClusterFinder processorType; + GPUTPCNNClusterizerInternals() = default; + GPUTPCNNClusterizerInternals(GPUSettingsProcessing, processorType&); + void* setIOPointers(void*); + void RegisterMemoryAllocation(); + void inferenceNetworkClass(processorType&, int8_t, uint); + void inferenceNetworkReg1(processorType&, int8_t, uint); + void inferenceNetworkReg2(processorType&, int8_t, uint); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector reg_model_paths; + private: + processorType* clusterer_internal; + int sector = -1; + int16_t mMemoryId = -1; + + // Avoid including CommonUtils/StringUtils.h + std::vector splitString(const std::string& input, const std::string& delimiter) { + std::vector tokens; + std::size_t pos = 0; + std::size_t found; + + while ((found = input.find(delimiter, pos)) != std::string::npos) { + tokens.push_back(input.substr(pos, found - pos)); + pos = found + delimiter.length(); + } + tokens.push_back(input.substr(pos)); + + return tokens; + } }; // class GPUTPCNNClusterizerInternals } // namespace o2::gpu From 9155cca526e412e822bdaa30917390d9bf6817fc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 12 Mar 2025 00:09:24 +0100 Subject: [PATCH 53/77] New version of clusterizer. Compiles locally, but segfaults in fillInput kernel. Testing with the CI now. 
--- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 7 +- Common/ML/include/ML/OrtInterface.h | 7 +- Common/ML/src/OrtInterface.cxx | 182 ++------- GPU/GPUTracking/Base/GPUConstantMem.h | 7 + GPU/GPUTracking/Base/GPUReconstruction.cxx | 3 + GPU/GPUTracking/CMakeLists.txt | 2 +- .../Definitions/GPUDefGPUParameters.h | 22 +- GPU/GPUTracking/Global/GPUChainTracking.cxx | 6 + .../Global/GPUChainTrackingClusterizer.cxx | 78 ++-- .../TPCClusterFinder/GPUTPCClusterFinder.h | 36 -- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 368 ++--------------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 108 +++-- .../GPUTPCNNClusterizerHost.cxx | 83 ++++ ...rInternals.h => GPUTPCNNClusterizerHost.h} | 42 +- .../GPUTPCNNClusterizerInternals.cxx | 103 ----- .../GPUTPCNNClusterizerKernels.cxx | 376 ++++++++++++++++++ .../GPUTPCNNClusterizerKernels.h | 77 ++++ GPU/GPUTracking/kernels.cmake | 16 +- 18 files changed, 758 insertions(+), 765 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx rename GPU/GPUTracking/TPCClusterFinder/{GPUTPCNNClusterizerInternals.h => GPUTPCNNClusterizerHost.h} (61%) delete mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index b343416f5380c..2fcc09375cef2 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -5,6 +5,9 @@ // - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h // - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h +#ifndef GPUORTFLOAT16_H +#define GPUORTFLOAT16_H + #ifndef GPUCA_GPUCODE_DEVICE #include #include @@ -868,4 +871,6 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType -} // namespace o2 \ No newline at end of file +} // namespace o2 + +#endif \ No newline at end of file diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 9384689103f0f..368754aee0f92 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -53,15 +53,12 @@ class OrtModel template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - O* inference(I*, size_t); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h void inference(I*, size_t, O*); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type // std::vector inference(std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 1aec841c16656..933cd861ff950 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -162,6 +162,9 @@ void OrtModel::reset(std::unordered_map optionsMap) std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); } + if (loggingLevel < 2) { + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + } } void OrtModel::resetSession() @@ -184,59 +187,6 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -O* OrtModel::inference(I* input, size_t input_size) -{ - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - return outputValues; -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -void OrtModel::inference(I* input, size_t input_size, O* output) -{ - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - output = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); - } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); @@ -247,128 +197,72 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} -template <> -std::vector OrtModel::inference(std::vector& input) -{ +template +std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + O* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), 
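The consolidated template above replaces the per-type specializations with an if constexpr dispatch: Float16_t buffers are handed to the tensor factory through a reinterpret_cast to a 16-bit type, while float buffers pass through unchanged (the exact cast target is not visible in this extract, so treat it as an assumption). A reduced sketch of the pattern with a placeholder tensor factory:

  // Compile-time dispatch sketch; makeTensor stands in for Ort::Value::CreateTensor and the
  // uint16_t target of the reinterpret_cast is an assumption about the bit-compatible storage.
  #include <cstddef>
  #include <cstdint>
  #include <type_traits>
  #include "ML/3rdparty/GPUORTFloat16.h" // o2::OrtDataType::Float16_t

  template <class T>
  void makeTensor(T* /*data*/, std::size_t /*n*/) {}

  template <class I>
  void createInputTensor(I* data, std::size_t n)
  {
    if constexpr (std::is_same_v<I, o2::OrtDataType::Float16_t>) {
      makeTensor(reinterpret_cast<uint16_t*>(data), n); // without if constexpr this branch would not compile for float
    } else {
      makeTensor(data, n); // e.g. plain float passes through unchanged
    }
  }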
inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -float* OrtModel::inference(float* input, size_t input_size) -{ - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - return outputValues; -} +template std::vector OrtModel::inference(std::vector&); -template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -float* OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size) -{ - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - return outputValues; -} -template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -void OrtModel::inference(float* input, size_t input_size, float* output) +template +void OrtModel::inference(I* input, size_t input_size, O* output) { std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - Ort::Value inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + Ort::Value inputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + } else { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + } std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + size_t outputSize = (int64_t)(inputShape[0] * mOutputShapes[0][1]); + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1); } -template <>// class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -void OrtModel::inference(OrtDataType::Float16_t* input, size_t input_size, float* output) -{ - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - Ort::Value inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); - - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)((input_size / mInputShapes[0][1]) * outputShape[1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); - - (pImplOrt->session)->Run(pImplOrt->runOptions, - inputNamesChar.data(), &inputTensor, 1, - outputNamesChar.data(), &outputTensor, 1); -} +template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); + +template void OrtModel::inference(float*, size_t, float*); -template <> -std::vector OrtModel::inference(std::vector>& input) + +template +std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + } } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = 
reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index 4f83fa48a64e0..65aa0dcddf8e4 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -34,6 +34,10 @@ #include "GPUKernelDebugOutput.h" #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizer.h" +#endif + namespace o2::gpu { struct GPUConstantMem { @@ -55,6 +59,9 @@ struct GPUConstantMem { #ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT GPUKernelDebugOutput debugOutput; #endif +#ifdef GPUCA_HAS_ONNX + GPUTPCNNClusterizer tpcNNClusterer[GPUCA_NSECTORS]; +#endif template GPUd() auto& getTRDTracker(); diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index a7e0c2cb827f1..32e3d4ba05acd 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -93,6 +93,9 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos for (uint32_t i = 0; i < NSECTORS; i++) { processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place processors()->tpcClusterer[i].mISector = i; +#ifdef GPUCA_HAS_ONNX + processors()->tpcNNClusterer[i].mISector = i; +#endif } #ifndef GPUCA_NO_ROOT mROOTDump = GPUROOTDumpCore::getAndCreate(); diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 7d65978ab2e60..df0a621a49235 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -193,7 +193,7 @@ set(SRCS_NO_CINT ${SRCS_NO_CINT} Merger/GPUTPCGMO2Output.cxx) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") - list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx) + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx) endif() set(SRCS_DATATYPES diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index d7d5ea4e02de7..e3a2528f375e0 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -81,7 +81,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 448 - #define GPUCA_LB_GPUTPCNNClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizerKernels 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -148,7 +148,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 512 - #define GPUCA_LB_GPUTPCNNClusterizer 512 + #define GPUCA_LB_GPUTPCNNClusterizerKernels 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -215,7 +215,7 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 448 #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 - #define 
GPUCA_LB_GPUTPCNNClusterizer 448 + #define GPUCA_LB_GPUTPCNNClusterizerKernels 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -492,8 +492,8 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif - #ifndef GPUCA_LB_GPUTPCNNClusterizer - #define GPUCA_LB_GPUTPCNNClusterizer 512 + #ifndef GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels 512 #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 @@ -515,12 +515,12 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression #ifdef GPUCA_HAS_ONNX -#define GPUCA_LB_GPUTPCNNClusterizer_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizer -#define GPUCA_LB_GPUTPCNNClusterizer_fillInputNN GPUCA_LB_GPUTPCNNClusterizer -#define GPUCA_LB_GPUTPCNNClusterizer_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizer -#define GPUCA_LB_GPUTPCNNClusterizer_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizer -#define GPUCA_LB_GPUTPCNNClusterizer_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizer -#define GPUCA_LB_GPUTPCNNClusterizer_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizer +#define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels #endif #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index a63886b93ccf9..ed6522f8dadbd 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -104,6 +104,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUProcessor(&processors()->tpcNNClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#endif } } if (GetRecoSteps() & RecoStep::Refit) { @@ -149,6 +152,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcNNClusterer[i], &processors()->tpcNNClusterer[i]); +#endif } } if (GetRecoStepsGPU() & RecoStep::Refit) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b5656269f8129..b0b4aae4970b3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,8 +40,8 @@ #endif #ifdef GPUCA_HAS_ONNX -#include "GPUTPCNNClusterizer.h" -#include "GPUTPCNNClusterizerInternals.h" 
+#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCNNClusterizerHost.h" #endif using namespace o2::gpu; @@ -858,7 +858,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { uint32_t iSector = iSectorBase + lane; GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + if (doGPU) { SynchronizeStream(lane); } @@ -878,62 +880,68 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - // Settings for the clusterizer - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; - clusterer.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clusterer.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clusterer.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clusterer.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clusterer.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clusterer.nnClusterizerElementSize = ((2 * clusterer.nnClusterizerSizeInputRow + 1) * (2 * clusterer.nnClusterizerSizeInputPad + 1) * (2 * clusterer.nnClusterizerSizeInputTime + 1)) + (clusterer.nnClusterizerAddIndexData ? 3 : 0); - clusterer.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clusterer.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + + // Setting some initial sizes, important for memory allocation + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; if (nn_settings.nnClusterizerVerbosity < 0) { - clusterer.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { - clusterer.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } // Settings for the NN evaluation - clusterer.nnClassThreshold = nn_settings.nnClassThreshold; - clusterer.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - GPUTPCNNClusterizerInternals nnApplication(GetProcessingSettings(), clusterer); + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); - if (clusterer.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if(fragment.index == 0){ + AllocateRegisteredMemory(clustererNN.mMemoryId); + } + + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - if (clusterer.nnSigmoidTrafoClassThreshold) { + if (clustererNN.nnSigmoidTrafoClassThreshold) { // Inverse sigmoid transformation - clusterer.nnClassThreshold = (float)std::log(clusterer.nnClassThreshold / (1.f - clusterer.nnClassThreshold)); + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); } float time_clusterizer = 0, time_fill = 0; int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clusterer.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clusterer.nnClusterizerBatchedMode; - uint iSize = CAMath::Min((uint)clusterer.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Filling the data + auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.inferenceNetworkClass(clusterer, evalDtype, batchStart); + nnApplication.inferenceNetworkClass(clustererNN, iSize, evalDtype, batchStart); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane, 
GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels } - if (!clusterer.nnClusterizerUseCfRegression) { - nnApplication.inferenceNetworkReg1(clusterer, evalDtype, batchStart); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 1 + if (!clustererNN.nnClusterizerUseCfRegression) { + nnApplication.inferenceNetworkReg1(clustererNN, iSize, evalDtype, batchStart); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.inferenceNetworkReg2(clusterer, evalDtype, batchStart); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, batchStart); // Running the NN for regression class 2 + nnApplication.inferenceNetworkReg2(clustererNN, iSize, evalDtype, batchStart); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -943,13 +951,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); - if (clusterer.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + if (clustererNN.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clusterer.nnClusterizerVerbosity < 3) { + if (clustererNN.nnClusterizerVerbosity < 3) { LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } #else diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 615880494d4a7..994cd4a66e83f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -51,8 +51,6 @@ struct ChargePos; class GPUTPCGeometry; -class GPUTPCNNClusterizerInternals; - class GPUTPCClusterFinder : public GPUProcessor { public: @@ -144,40 +142,6 @@ class GPUTPCClusterFinder : public GPUProcessor int16_t mZSOffsetId = -1; int16_t mOutputId = -1; - // Neural network clusterization - - int nnClusterizerSizeInputRow = 3; - int 
nnClusterizerSizeInputPad = 3; - int nnClusterizerSizeInputTime = 3; - int nnClusterizerElementSize = -1; - bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; - bool nnSigmoidTrafoClassThreshold = 1; - int nnClusterizerUseCfRegression = 0; - int nnClusterizerBatchedMode = 1; - int nnClusterizerVerbosity = 0; - int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; - int nnClusterizerModelClassNumOutputNodes = -1; - int nnClusterizerModelReg1NumOutputNodes = -1; - int nnClusterizerModelReg2NumOutputNodes = -1; - uint nnClusterizerCurrentSize = -1; // This variable determines the size of the memory pointers. It will be set at runtime. - int nnClusterizerDtype = 0; // 0: float16, 1: float32 - - // Memory allocation for neural network - uint class2_elements = 0; - float* inputData32=nullptr; - OrtDataType::Float16_t* inputData16=nullptr; - float* outputDataClass=nullptr; - float* modelProbabilities=nullptr; - float* outputDataReg1=nullptr; - float* outputDataReg2=nullptr; - - ChargePos* peakPositions=nullptr; - bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges=nullptr; - #ifndef GPUCA_GPUCODE void DumpDigits(std::ostream& out); void DumpChargeMap(std::ostream& out, std::string_view); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index d4b7ba03bdd5a..102567bbe5439 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -12,359 +12,45 @@ /// \file GPUTPCNNClusterizer.cxx /// \author Christian Sonnabend +#include "GPUReconstruction.h" +#include "ML/3rdparty/GPUORTFloat16.h" #include "GPUTPCNNClusterizer.h" -#include "GPUTPCCFClusterizer.h" - -#include "CfConsts.h" -#include "CfUtils.h" -#include "ClusterAccumulator.h" -#if !defined(GPUCA_GPUCODE) -#include "GPUHostDataTypes.h" -#include "MCLabelAccumulator.h" -#endif using namespace o2::gpu; -using namespace o2::gpu::tpccf; - -// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (clusterer.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices - return; - } - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; - o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; - GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); -} -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizer::fillInputData(nBlocks, nThreads, iBlock, iThread, clusterer, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - clusterer.outputDataClass[glo_idx + batchStart] = (int)(clusterer.modelProbabilities[glo_idx] > clusterer.nnClassThreshold); -} +void GPUTPCNNClusterizer::InitializeProcessor(){} -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} +void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io){} -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= clusterer.mPmemory->counters.nClusters) { - return; +void* GPUTPCNNClusterizer::setIOPointers(void* mem) { + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0){ + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0){ + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - GPUTPCNNClusterizer::publishClustersReg1(glo_idx, smem, clusterer, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizer::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= clusterer.mPmemory->counters.nClusters) { - return; + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2*nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, outputDataClass, nnClusterizerBatchedMode); + if(nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } - GPUTPCNNClusterizer::publishClustersReg2(glo_idx, smem, clusterer, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -GPUd() int GPUTPCNNClusterizer::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) -{ - return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizer::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizer::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row <= o2::tpc::constants::MAXGLOBALPADROW - 1 + global_shift) { - return (pad >= static_cast(geo.NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizer::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& clusterer, int8_t dtype, uint batchStart) -{ - - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - - uint glo_idx = get_global_id(0); - - uint write_idx = glo_idx * clusterer.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId - - ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; - int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors - float central_charge = static_cast(chargeMap[peak].unpack()); - - clusterer.peakPositions[glo_idx] = peak; - clusterer.centralCharges[glo_idx] = central_charge; - - int row_offset = GPUTPCNNClusterizer::rowOffset(row, clusterer.nnClusterizerSizeInputRow); - GPUCA_UNROLL(U(), U()); - for (int r = -clusterer.nnClusterizerSizeInputRow; r <= clusterer.nnClusterizerSizeInputRow; r++) { - bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizer::padOffset(row, row + r, clusterer.Param().tpcGeometry); - for (int p = -clusterer.nnClusterizerSizeInputPad + pad_offset; p <= clusterer.nnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizer::isBoundary(row + r + row_offset, pad + p, clusterer.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); - for (int t = -clusterer.nnClusterizerSizeInputTime; t <= clusterer.nnClusterizerSizeInputTime; t++) { - if (!is_boundary) { - ChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clusterer.clusterFlags[2*glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clusterer.clusterFlags[2*glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); - clusterer.clusterFlags[2*glo_idx + 1] = clusterer.clusterFlags[2*glo_idx]; - } - if (dtype == 0) { - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else { - clusterer.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; - } - } else { - // Filling boundary just to make sure that no values are left unintentionally - if (dtype == 0) { - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clusterer.nnClusterizerBoundaryFillValue)); - } else { - clusterer.inputData32[write_idx] = static_cast(clusterer.nnClusterizerBoundaryFillValue); - } - } - write_idx++; - } + if (!nnClusterizerUseCfRegression) { + if(nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); } - } - if (clusterer.nnClusterizerAddIndexData) { - if (dtype == 0) { - clusterer.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); - clusterer.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - 
clusterer.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); - } else { - clusterer.inputData32[write_idx] = clusterer.mISector / 36.f; - clusterer.inputData32[write_idx + 1] = row / 152.f; - clusterer.inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); + if(nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); } } -} - -GPUd() void GPUTPCNNClusterizer::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); - MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * clusterer.nnClusterizerModelReg1NumOutputNodes; - - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - if (clusterer.outputDataClass[full_glo_idx] == 1) { + LOG(info) << "Alloc mem: " << nnClusterizerBatchedMode * nnClusterizerElementSize << " " << nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes << " " << mem; - ClusterAccumulator pc; - - // Publishing logic is taken from default clusterizer - if (onlyMC) { - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); - GPUTPCCFClusterizer::buildCluster( - clusterer.Param().rec, - chargeMap, - clusterer.peakPositions[glo_idx], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - } - - if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg1[model_output_index + 4], - static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg1[model_output_index], - clusterer.outputDataReg1[model_output_index + 2], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg1[model_output_index + 1], - clusterer.outputDataReg1[model_output_index + 3], - clusterer.clusterFlags[2*glo_idx], - clusterer.clusterFlags[2*glo_idx + 1]); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { - rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( - clusterer, - myCluster, - clusterer.peakPositions[glo_idx].row(), - clusterer.mNMaxClusterPerRow, - clusterer.mPclusterInRow, - clusterOut); - if (clusterer.mPclusterPosInRow != nullptr) { - clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; - } - } else if (clusterer.mPclusterPosInRow) { - rowIndex = 
clusterer.mPclusterPosInRow[full_glo_idx]; - } - CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); - } else { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } + return mem; } -GPUd() void GPUTPCNNClusterizer::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& clusterer, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); - MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; - uint full_glo_idx = glo_idx + batchStart; - int model_output_index = glo_idx * clusterer.nnClusterizerModelReg2NumOutputNodes; - - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clusterer.outputDataReg1.size() << " / " << clusterer.nnClusterizerModelReg2NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - - if (clusterer.outputDataClass[full_glo_idx] > 0) { - - ClusterAccumulator pc; - - if (onlyMC) { - ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clusterer.peakPositions[glo_idx], chargeMap[clusterer.peakPositions[glo_idx]].unpack())); - GPUTPCCFClusterizer::buildCluster( - clusterer.Param().rec, - chargeMap, - clusterer.peakPositions[glo_idx], - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &dummy_pc, - labelAcc); - } - - if ((clusterer.mPmemory->fragment).isOverlap(clusterer.peakPositions[glo_idx].time())) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - // Cluster 1 - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 8], - static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index], - clusterer.outputDataReg2[model_output_index + 4], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 2], - clusterer.outputDataReg2[model_output_index + 6], - clusterer.clusterFlags[2*glo_idx], - clusterer.clusterFlags[2*glo_idx + 1]); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { - rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( - clusterer, - myCluster, - clusterer.peakPositions[glo_idx].row(), - clusterer.mNMaxClusterPerRow, - clusterer.mPclusterInRow, - clusterOut); - if (clusterer.mPclusterPosInRow != nullptr) { - clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; - } - } else if (clusterer.mPclusterPosInRow) { - rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; - } - CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); - - // Cluster 2 - pc.setFull(clusterer.centralCharges[glo_idx] * clusterer.outputDataReg2[model_output_index + 9], - static_cast(clusterer.peakPositions[glo_idx].pad()) + clusterer.outputDataReg2[model_output_index + 1], - clusterer.outputDataReg2[model_output_index + 5], - 
static_cast((clusterer.mPmemory->fragment).start) + static_cast(clusterer.peakPositions[glo_idx].time()) + clusterer.outputDataReg2[model_output_index + 3], - clusterer.outputDataReg2[model_output_index + 7], - clusterer.clusterFlags[2*glo_idx], - clusterer.clusterFlags[2*glo_idx + 1]); - - rejectCluster = !pc.toNative(clusterer.peakPositions[glo_idx], clusterer.centralCharges[glo_idx], myCluster, clusterer.Param()); - if (rejectCluster) { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } - - if (clusterer.mPclusterByRow != nullptr) { - rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( - clusterer, - myCluster, - clusterer.peakPositions[glo_idx].row(), - clusterer.mNMaxClusterPerRow, - clusterer.mPclusterInRow, - clusterOut); - if (clusterer.mPclusterPosInRow != nullptr) { - clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; - } - } else if (clusterer.mPclusterPosInRow) { - rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; - } - // CPU_ONLY(labelAcc->commit(clusterer.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? - } else { - if (clusterer.mPclusterPosInRow) { - clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; - } - return; - } -} \ No newline at end of file +void GPUTPCNNClusterizer::RegisterMemoryAllocation() { + AllocateAndInitializeLate(); + int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; + mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizer::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(mISector % mRec->GetProcessingSettings().nTPCClustererLanes)}); +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 534446af67828..3d2dd31cc2beb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -12,74 +12,64 @@ /// \file GPUTPCNNClusterizer.h /// \author Christian Sonnabend -#ifndef O2_GPU_NN_CLUSTERIZER_H -#define O2_GPU_NN_CLUSTERIZER_H +#ifndef O2_GPUTPCNNCLUSTERIZER_H +#define O2_GPUTPCNNCLUSTERIZER_H -#include "clusterFinderDefs.h" -#include "GPUGeneralKernels.h" -#include "GPUConstantMem.h" -#include "GPUTPCClusterFinder.h" -#include "Array2D.h" -#include "PackedCharge.h" +#include "ChargePos.h" +#include "GPUProcessor.h" -namespace o2::tpc +namespace o2::OrtDataType { -struct ClusterNative; -} // namespace o2::tpc + struct Float16_t; +} namespace o2::gpu { -class ClusterAccumulator; -class MCLabelAccumulator; - -class GPUTPCNNClusterizer : public GPUKernelTemplate +class GPUTPCNNClusterizer : public GPUProcessor { public: - static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizer); - struct GPUSharedMemory { - // Regular cluster finder - ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; - PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; - uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; - }; - - typedef GPUTPCClusterFinder processorType; - GPUhdi() static processorType* Processor(GPUConstantMem& processors) - { - return processors.tpcClusterer; - } - - GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() - { - return GPUDataTypes::RecoStep::TPCClusterFinding; - } - - enum K : int32_t { - runCfClusterizer = 0, - 
fillInputNN = 1, - determineClass1Labels = 2, - determineClass2Labels = 3, - publishClass1Regression = 4, - publishClass2Regression = 5, - }; - - template - GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, int8_t = 0, int8_t = 0, uint = 0, Args...); - - static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, int8_t, uint); - static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); - static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, int8_t, int8_t, uint); - - static void inferenceNetworkClass(processorType&, int8_t = 0, uint = 0); - static void inferenceNetworkReg1(processorType&, int8_t = 0); - static void inferenceNetworkReg2(processorType&, int8_t = 0); - - private: - static GPUd() int padOffset(int, int, const GPUTPCGeometry&); - static GPUd() int rowOffset(int, int); - static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); -}; + GPUTPCNNClusterizer() = default; + void* setIOPointers(void*); + void RegisterMemoryAllocation(); + void InitializeProcessor(); + void SetMaxData(const GPUTrackingInOutPointers&); + + // Neural network clusterization + + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCfRegression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; + int nnClusterizerDumpDigits = 0; + int nnClusterizerApplyCfDeconvolution = 0; + int nnClusterizerModelClassNumOutputNodes = -1; + int nnClusterizerModelReg1NumOutputNodes = -1; + int nnClusterizerModelReg2NumOutputNodes = -1; + int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int mISector = -1; + + // Memory allocation for neural network + uint class2_elements = 0; + float* inputData32=nullptr; + OrtDataType::Float16_t* inputData16=nullptr; + float* outputDataClass=nullptr; + float* modelProbabilities=nullptr; + float* outputDataReg1=nullptr; + float* outputDataReg2=nullptr; + + ChargePos* peakPositions=nullptr; + bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx + float* centralCharges=nullptr; + int16_t mMemoryId = -1; +}; // class GPUTPCNNClusterizer } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx new file mode 100644 index 0000000000000..fe53f42dbbe8d --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -0,0 +1,83 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
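For orientation, the per-cluster input length that nnClusterizerElementSize ends up holding follows directly from the window half-sizes declared in GPUTPCNNClusterizer.h above. The snippet below is a standalone illustration only, not part of the patch; elementSize is a hypothetical helper mirroring the formula used in GPUChainTrackingClusterizer.cxx.

#include <cstdio>

// Hypothetical helper mirroring the element-size formula from GPUChainTrackingClusterizer.cxx:
// a full (row, pad, time) charge window around the peak plus 3 optional index features.
static int elementSize(int rowHalf, int padHalf, int timeHalf, bool addIndexData)
{
  return (2 * rowHalf + 1) * (2 * padHalf + 1) * (2 * timeHalf + 1) + (addIndexData ? 3 : 0);
}

int main()
{
  // With the defaults above (3, 3, 3, addIndexData = true): 7 * 7 * 7 + 3 = 346 values per candidate cluster.
  std::printf("elementSize = %d\n", elementSize(3, 3, 3, true));
  return 0;
}

The batched input buffer then needs nnClusterizerBatchedMode * nnClusterizerElementSize entries, which is exactly what setIOPointers above reserves via computePointerWithAlignment.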
+ +/// \file GPUTPCNNClusterizerHost.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerHost.h" +#include "GPUTPCNNClusterizer.h" +#include "GPUSettings.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +using namespace o2::gpu; + +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) { + OrtOptions = { + {"model-path", settings.nnClassificationPath}, + {"device", settings.nnInferenceDevice}, + {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, + {"dtype", settings.nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(settings.nnInferenceThreadsPerNN)}, + {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, + {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(settings.nnInferenceVerbosity)} + }; + + model_class.init(OrtOptions); + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + + reg_model_paths = splitString(settings.nnRegressionPath, ":"); + + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["model-path"] = reg_model_paths[1]; + model_reg_2.init(OrtOptions); + clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +// Apply the neural network to the input data. Note: These are not GPU kernels. 
We let ONNX take care of that +void GPUTPCNNClusterizerHost::inferenceNetworkClass(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_class.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); + } else { + model_class.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); + } +} + +void GPUTPCNNClusterizerHost::inferenceNetworkReg1(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_reg_1.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); + } else { + model_reg_1.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); + } +} + +void GPUTPCNNClusterizerHost::inferenceNetworkReg2(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) +{ + if (dtype == 0) { + model_reg_2.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); + } else { + model_reg_2.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); + } +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h similarity index 61% rename from GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h rename to GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index ef027e536969d..de0118c26c8db 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -9,43 +9,43 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. 
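As a rough host-side usage sketch (not part of the patch; the model path and option values are placeholders, and only a subset of the options listed in the constructor is shown), the wrappers above drive o2::ml::OrtModel with a preallocated output buffer sized from the model's output nodes:

#include <string>
#include <unordered_map>
#include <vector>
#include "ML/OrtInterface.h"

// Sketch: run the classification network over a batch packed as nClusters * elementSize floats.
// "my_class_net.onnx" and the option values are placeholders, not values taken from the patch.
std::vector<float> runClassSketch(std::vector<float>& in, size_t elementSize)
{
  std::unordered_map<std::string, std::string> opts{
    {"model-path", "my_class_net.onnx"},
    {"device", "CPU"},               // the patch takes this from nnInferenceDevice
    {"intra-op-num-threads", "1"}};  // the patch takes this from nnInferenceThreadsPerNN
  o2::ml::OrtModel model;
  model.init(opts);
  size_t nClusters = in.size() / elementSize;
  std::vector<float> out(nClusters * model.getNumOutputNodes()[0][1]); // one output row per cluster
  model.inference(in.data(), in.size(), out.data());                   // same call pattern as inferenceNetworkClass above
  return out;
}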
-/// \file GPUTPCNNClusterizerInternals.h +/// \file GPUTPCNNClusterizerHost.h /// \author Christian Sonnabend -#ifndef O2_GPUTPCNNCLUSTERIZERINTERNALS_H -#define O2_GPUTPCNNCLUSTERIZERINTERNALS_H +#ifndef O2_GPUTPCNNCLUSTERIZERHOST_H +#define O2_GPUTPCNNCLUSTERIZERHOST_H +#include +#include +#include #include "ML/OrtInterface.h" -#include "ChargePos.h" -#include "GPUReconstruction.h" -#include "GPUProcessor.h" -#include "GPUTPCClusterFinder.h" -#include "GPUHostDataTypes.h" using namespace o2::ml; +namespace o2::OrtDataType +{ + struct Float16_t; +} + namespace o2::gpu { -class GPUTPCNNClusterizerInternals : public GPUProcessor +class GPUTPCNNClusterizer; +struct GPUSettingsProcessingNNclusterizer; + +class GPUTPCNNClusterizerHost { public: - typedef GPUTPCClusterFinder processorType; - GPUTPCNNClusterizerInternals() = default; - GPUTPCNNClusterizerInternals(GPUSettingsProcessing, processorType&); - void* setIOPointers(void*); - void RegisterMemoryAllocation(); - void inferenceNetworkClass(processorType&, int8_t, uint); - void inferenceNetworkReg1(processorType&, int8_t, uint); - void inferenceNetworkReg2(processorType&, int8_t, uint); + GPUTPCNNClusterizerHost() = default; + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void inferenceNetworkClass(GPUTPCNNClusterizer&, size_t, int8_t, uint); + void inferenceNetworkReg1(GPUTPCNNClusterizer&, size_t, int8_t, uint); + void inferenceNetworkReg2(GPUTPCNNClusterizer&, size_t, int8_t, uint); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; private: - processorType* clusterer_internal; - int sector = -1; - int16_t mMemoryId = -1; // Avoid including CommonUtils/StringUtils.h std::vector splitString(const std::string& input, const std::string& delimiter) { @@ -61,7 +61,7 @@ class GPUTPCNNClusterizerInternals : public GPUProcessor return tokens; } -}; // class GPUTPCNNClusterizerInternals +}; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx deleted file mode 100644 index 59c59a26a1d10..0000000000000 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerInternals.cxx +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. 
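The regression model list handed to the constructor is a single ':'-separated string (nnRegressionPath). A standalone equivalent of the splitString helper declared above, with hypothetical file names, shows how it decomposes into one or two model paths:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone stand-in for the splitString helper above (whose body the diff elides).
static std::vector<std::string> splitColonList(const std::string& input, char delimiter)
{
  std::vector<std::string> tokens;
  std::stringstream ss(input);
  std::string token;
  while (std::getline(ss, token, delimiter)) {
    tokens.push_back(token);
  }
  return tokens;
}

int main()
{
  // Hypothetical nnRegressionPath: one network for single-cluster peaks, one for two-cluster peaks.
  for (const auto& path : splitColonList("reg_c1.onnx:reg_c2.onnx", ':')) {
    std::cout << path << "\n";
  }
  return 0;
}

With two entries (and a multi-class classifier) both model_reg_1 and model_reg_2 are initialized, matching the branch in the GPUTPCNNClusterizerHost constructor.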
- -/// \file GPUTPCNNClusterizerInternals.cxx -/// \author Christian Sonnabend - -#include "GPUTPCNNClusterizerInternals.h" - -using namespace o2::gpu; - -GPUTPCNNClusterizerInternals::GPUTPCNNClusterizerInternals(GPUSettingsProcessing settings, processorType& clusterer) { - clusterer_internal = &clusterer; - GPUSettingsProcessingNNclusterizer nn_settings = settings.nn; - OrtOptions = {{"model-path", nn_settings.nnClassificationPath}, - {"device", nn_settings.nnInferenceDevice}, - {"device-id", std::to_string(nn_settings.nnInferenceDeviceId)}, - {"allocate-device-memory", std::to_string(nn_settings.nnInferenceAllocateDevMem)}, - {"dtype", nn_settings.nnInferenceDtype}, - {"intra-op-num-threads", std::to_string(nn_settings.nnInferenceThreadsPerNN)}, - {"enable-optimizations", std::to_string(nn_settings.nnInferenceEnableOrtOptimization)}, - {"enable-profiling", std::to_string(nn_settings.nnInferenceOrtProfiling)}, - {"profiling-output-path", nn_settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(nn_settings.nnInferenceVerbosity)}}; - sector = clusterer.mISector; - - - model_class.init(OrtOptions); - reg_model_paths = splitString(nn_settings.nnRegressionPath, ":"); - - if (!nn_settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { - OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - } else { - OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; - OrtOptions["model-path"] = reg_model_paths[1]; - model_reg_2.init(OrtOptions); - clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; - } - } -} - -void* GPUTPCNNClusterizerInternals::setIOPointers(void* mem) { - if (clusterer_internal->nnClusterizerDtype == 0){ - computePointerWithAlignment(mem, clusterer_internal->inputData16, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize); - } else if (clusterer_internal->nnClusterizerDtype == 1){ - computePointerWithAlignment(mem, clusterer_internal->inputData32, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerElementSize); - } - computePointerWithAlignment(mem, clusterer_internal->outputDataClass, clusterer_internal->nnClusterizerCurrentSize); - computePointerWithAlignment(mem, clusterer_internal->modelProbabilities, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelClassNumOutputNodes); - computePointerWithAlignment(mem, clusterer_internal->outputDataReg1, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg1NumOutputNodes); - computePointerWithAlignment(mem, clusterer_internal->outputDataReg2, clusterer_internal->nnClusterizerCurrentSize * clusterer_internal->nnClusterizerModelReg2NumOutputNodes); - computePointerWithAlignment(mem, clusterer_internal->peakPositions, clusterer_internal->nnClusterizerCurrentSize); - computePointerWithAlignment(mem, clusterer_internal->clusterFlags, 2*clusterer_internal->nnClusterizerCurrentSize); - computePointerWithAlignment(mem, clusterer_internal->centralCharges, clusterer_internal->nnClusterizerCurrentSize); - - return mem; -} - -void GPUTPCNNClusterizerInternals::RegisterMemoryAllocation() { - AllocateAndInitializeLate(); - int32_t memType = 
GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; - mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizerInternals::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(sector % mRec->GetProcessingSettings().nTPCClustererLanes)}); -} - -// Apply the neural network to the input data. Note: These are not GPU kernels. We let ONNX take care of that -void GPUTPCNNClusterizerInternals::inferenceNetworkClass(processorType& clusterer, int8_t dtype, uint batch_idx) -{ - if (dtype == 0) { - model_class.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); - } else { - model_class.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); - } -} - -void GPUTPCNNClusterizerInternals::inferenceNetworkReg1(processorType& clusterer, int8_t dtype, uint batch_idx) -{ - if (dtype == 0) { - model_reg_1.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); - } else { - model_reg_1.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); - } -} - -void GPUTPCNNClusterizerInternals::inferenceNetworkReg2(processorType& clusterer, int8_t dtype, uint batch_idx) -{ - if (dtype == 0) { - model_reg_2.inference(clusterer.inputData16 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); - } else { - model_reg_2.inference(clusterer.inputData32 + batch_idx, clusterer.nnClusterizerCurrentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); - } -} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx new file mode 100644 index 0000000000000..0e6739168151d --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -0,0 +1,376 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
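One detail worth keeping in mind for the kernels that follow: determineClass1Labels compares the raw network output against nnClassThreshold, which the chain has already mapped through the inverse sigmoid (logit) when nnSigmoidTrafoClassThreshold is set. The standalone check below (not part of the patch) shows why thresholding the raw output at logit(t) is equivalent to thresholding the sigmoid-activated probability at t:

#include <cassert>
#include <cmath>

int main()
{
  const float threshold = 0.16f;                                        // default nnClassThreshold
  const float logitThreshold = std::log(threshold / (1.f - threshold)); // transform applied in the chain
  for (float raw = -6.f; raw <= 6.f; raw += 0.25f) {                    // raw (pre-sigmoid) network output
    const float prob = 1.f / (1.f + std::exp(-raw));                    // what a final sigmoid layer would return
    // The sigmoid is monotonic, so both comparisons always pick the same label;
    // the model can therefore omit the sigmoid and the kernel still thresholds correctly.
    assert((raw > logitThreshold) == (prob > threshold));
  }
  return 0;
}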
+ +/// \file GPUTPCNNClusterizerKernels.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCCFClusterizer.h" + +#include "CfConsts.h" +#include "CfUtils.h" +#include "ClusterAccumulator.h" +#if !defined(GPUCA_GPUCODE) +#include "GPUHostDataTypes.h" +#include "MCLabelAccumulator.h" +#endif + +using namespace o2::gpu; +using namespace o2::gpu::tpccf; + +// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + if (processors.tpcNNClusterer[sector].outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + return; + } + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? 
pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +// The following arithmetic is needed because the network is trained with a split at the IROC/OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC.
Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} + +// Filling the input data for the neural network where there is no boundary +GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + + uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + float central_charge = static_cast(chargeMap[peak].unpack()); + + clustererNN.peakPositions[glo_idx] = peak; + clustererNN.centralCharges[glo_idx] = central_charge; + + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + GPUCA_UNROLL(U(), U()); + for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) { + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r, clusterer.Param().tpcGeometry); + for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p, time + t); + if (r == 0 && !clustererNN.clusterFlags[2*glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.clusterFlags[2*glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2*glo_idx + 1] = clustererNN.clusterFlags[2*glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else { + clustererNN.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + // Filling boundary just to make sure that no values are left unintentionally + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } + write_idx++; + } + } + } + if (clustererNN.nnClusterizerAddIndexData) { + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + 
clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row)); + } else { + clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData32[write_idx + 1] = row / 152.f; + clustererNN.inputData32[write_idx + 2] = static_cast(pad) / clusterer.Param().tpcGeometry.NPads(row); + } + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] == 1) { + + ClusterAccumulator pc; + + // Publishing logic is taken from default clusterizer + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + clustererNN.outputDataReg1[model_output_index + 2], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + clustererNN.outputDataReg1[model_output_index + 3], + clustererNN.clusterFlags[2*glo_idx], + clustererNN.clusterFlags[2*glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + } else { 
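+ // Peak rejected by the classification network (outputDataClass != 1): invalidate its slot so downstream code skips it, mirroring the rejected-cluster handling above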
+ if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] > 0) { + + ClusterAccumulator pc; + + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + // Cluster 1 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + clustererNN.outputDataReg2[model_output_index + 4], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + clustererNN.outputDataReg2[model_output_index + 6], + clustererNN.clusterFlags[2*glo_idx], + clustererNN.clusterFlags[2*glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + + // Cluster 2 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + clustererNN.outputDataReg2[model_output_index + 5], + 
static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + clustererNN.outputDataReg2[model_output_index + 7], + clustererNN.clusterFlags[2*glo_idx], + clustererNN.clusterFlags[2*glo_idx + 1]); + + rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h new file mode 100644 index 0000000000000..7c669b3b25c10 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -0,0 +1,77 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerKernels.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" +#include "GPUTPCNNClusterizer.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace o2::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizerKernels : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels); + struct GPUSharedMemory { + // Regular cluster finder + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + enum K : int32_t { + runCfClusterizer = 0, + fillInputNN = 1, + determineClass1Labels = 2, + determineClass2Labels = 3, + publishClass1Regression = 4, + publishClass2Regression = 5, + }; + + template + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + + private: + static GPUd() int padOffset(int, int, const GPUTPCGeometry&); + static GPUd() int rowOffset(int, int); + static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); +}; + +} // namespace o2::gpu + +#endif \ No newline at end of file diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 29d90908afa2f..e628586253e17 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -23,9 +23,9 @@ o2_gpu_kernel_file_list(TPCMERGER ERRORS GPUTPCGMMerger.cxx GPUTPCGMSectorTrack. 
o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizationWithError.cxx Propagator.cxx TrackLTIntegral.cxx) o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) -o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizer.cxx) +o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizer.cxx) +o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizerKernels.cxx) endif() o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) @@ -114,12 +114,12 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_add_kernel("GPUTPCNNClusterizer, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, fillInputNN" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizer, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB single int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) endif() o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) From 05bc4b8971a5c1ed1666058042e97cc583736b3f Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 11 Mar 2025 23:10:08 +0000 Subject: [PATCH 54/77] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 20 ++++----- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 22 +++++----- 
.../TPCClusterFinder/GPUTPCNNClusterizer.h | 20 ++++----- .../GPUTPCNNClusterizerHost.cxx | 6 +-- .../GPUTPCNNClusterizerHost.h | 11 ++--- .../GPUTPCNNClusterizerKernels.cxx | 42 +++++++++---------- 6 files changed, 60 insertions(+), 61 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 933cd861ff950..f052e8fddd3e1 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -163,7 +163,7 @@ void OrtModel::reset(std::unordered_map optionsMap) [&](const std::string& str) { return str.c_str(); }); } if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; } } @@ -197,9 +197,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } - -template -std::vector OrtModel::inference(std::vector& input) { +template +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; if constexpr (std::is_same_v) { @@ -221,7 +221,6 @@ template std::vector OrtModel::inference(s template std::vector OrtModel::inference(std::vector&); - template void OrtModel::inference(I* input, size_t input_size, O* output) { @@ -232,22 +231,19 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } else { inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); } - + std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; size_t outputSize = (int64_t)(inputShape[0] * mOutputShapes[0][1]); Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); - - (pImplOrt->session)->Run(pImplOrt->runOptions, - inputNamesChar.data(), &inputTensor, 1, - outputNamesChar.data(), &outputTensor, 1); + + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1); } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); template void OrtModel::inference(float*, size_t, float*); - -template +template std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 102567bbe5439..16120bced1917 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -18,28 +18,29 @@ using namespace o2::gpu; -void GPUTPCNNClusterizer::InitializeProcessor(){} +void GPUTPCNNClusterizer::InitializeProcessor() {} -void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io){} +void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} -void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0){ +void* GPUTPCNNClusterizer::setIOPointers(void* mem) +{ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0){ + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { 
computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2*nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); computePointerWithAlignment(mem, outputDataClass, nnClusterizerBatchedMode); - if(nnClusterizerModelClassNumOutputNodes > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } if (!nnClusterizerUseCfRegression) { - if(nnClusterizerModelReg1NumOutputNodes > 0) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); } - if(nnClusterizerModelReg2NumOutputNodes > 0) { + if (nnClusterizerModelReg2NumOutputNodes > 0) { computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); } } @@ -49,7 +50,8 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) { return mem; } -void GPUTPCNNClusterizer::RegisterMemoryAllocation() { +void GPUTPCNNClusterizer::RegisterMemoryAllocation() +{ AllocateAndInitializeLate(); int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizer::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(mISector % mRec->GetProcessingSettings().nTPCClustererLanes)}); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 3d2dd31cc2beb..d0f3da460fee0 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -20,7 +20,7 @@ namespace o2::OrtDataType { - struct Float16_t; +struct Float16_t; } namespace o2::gpu @@ -58,16 +58,16 @@ class GPUTPCNNClusterizer : public GPUProcessor // Memory allocation for neural network uint class2_elements = 0; - float* inputData32=nullptr; - OrtDataType::Float16_t* inputData16=nullptr; - float* outputDataClass=nullptr; - float* modelProbabilities=nullptr; - float* outputDataReg1=nullptr; - float* outputDataReg2=nullptr; + float* inputData32 = nullptr; + OrtDataType::Float16_t* inputData16 = nullptr; + float* outputDataClass = nullptr; + float* modelProbabilities = nullptr; + float* outputDataReg1 = nullptr; + float* outputDataReg2 = nullptr; - ChargePos* peakPositions=nullptr; - bool* clusterFlags=nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges=nullptr; + ChargePos* peakPositions = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx + float* centralCharges = nullptr; int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index fe53f42dbbe8d..131ce79cf0a45 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -19,7 +19,8 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) { +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ OrtOptions = { {"model-path", settings.nnClassificationPath}, {"device", settings.nnInferenceDevice}, @@ -30,8 +31,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(settings.nnInferenceVerbosity)} - }; + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; model_class.init(OrtOptions); clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index de0118c26c8db..b71e3816ca892 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -24,7 +24,7 @@ using namespace o2::ml; namespace o2::OrtDataType { - struct Float16_t; +struct Float16_t; } namespace o2::gpu @@ -45,17 +45,18 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - private: + private: // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) { + std::vector splitString(const std::string& input, const std::string& delimiter) + { std::vector tokens; std::size_t pos = 0; std::size_t found; while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); + tokens.push_back(input.substr(pos, found - pos)); + pos = found + delimiter.length(); } tokens.push_back(input.substr(pos)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 0e6739168151d..60d6dde759518 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -148,9 +148,9 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clustererNN.clusterFlags[2*glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.clusterFlags[2*glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); - 
clustererNN.clusterFlags[2*glo_idx + 1] = clustererNN.clusterFlags[2*glo_idx]; + if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } if (dtype == 0) { clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); @@ -222,12 +222,12 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], - clustererNN.outputDataReg1[model_output_index + 2], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], - clustererNN.outputDataReg1[model_output_index + 3], - clustererNN.clusterFlags[2*glo_idx], - clustererNN.clusterFlags[2*glo_idx + 1]); + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + clustererNN.outputDataReg1[model_output_index + 2], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + clustererNN.outputDataReg1[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); @@ -302,12 +302,12 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha // Cluster 1 pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], - clustererNN.outputDataReg2[model_output_index + 4], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], - clustererNN.outputDataReg2[model_output_index + 6], - clustererNN.clusterFlags[2*glo_idx], - clustererNN.clusterFlags[2*glo_idx + 1]); + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + clustererNN.outputDataReg2[model_output_index + 4], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + clustererNN.outputDataReg2[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); @@ -337,12 +337,12 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha // Cluster 2 pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], - clustererNN.outputDataReg2[model_output_index + 5], - 
static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], - clustererNN.outputDataReg2[model_output_index + 7], - clustererNN.clusterFlags[2*glo_idx], - clustererNN.clusterFlags[2*glo_idx + 1]); + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + clustererNN.outputDataReg2[model_output_index + 5], + static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + clustererNN.outputDataReg2[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); if (rejectCluster) { From ed323ec7d68f088c00cc5a2a5bdb15d3ad6f0996 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 12 Mar 2025 10:41:16 +0100 Subject: [PATCH 55/77] Adjust for comments --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 12 +- GPU/GPUTracking/CMakeLists.txt | 13 +- .../Definitions/GPUDefGPUParameters.h | 3 - .../Global/GPUChainTrackingClusterizer.cxx | 10 +- .../TPCClusterFinder/GPUTPCCFClusterizer.cxx | 235 +---------------- .../TPCClusterFinder/GPUTPCCFClusterizer.inc | 249 ++++++++++++++++++ .../TPCClusterFinder/GPUTPCClusterFinder.h | 1 - .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 3 - .../GPUTPCNNClusterizerHost.cxx | 25 +- .../GPUTPCNNClusterizerHost.h | 5 +- .../GPUTPCNNClusterizerKernels.cxx | 16 +- .../GPUTPCNNClusterizerKernels.h | 3 +- 12 files changed, 288 insertions(+), 287 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 2fcc09375cef2..819e6c8da7594 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -16,6 +16,7 @@ #endif #include "GPUCommonDef.h" +#include "GPUCommonMath.h" namespace o2 { @@ -530,11 +531,14 @@ template GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept { uint16_t result; - if (std::isnan(v)) { + if (o2::gpu::CAMath::IsNaN(v)) { result = kPositiveQNaNBits; } else { auto get_msb_half = [](float fl) { uint16_t result; +#ifdef GPUCA_GPUCODE + result = 0; +#else #ifdef __cpp_if_constexpr if constexpr (detail::endian::native == detail::endian::little) #else @@ -557,6 +561,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept U32 += (upper_bits & 1) + kRoundToNearest; result = get_msb_half(F32); } +#endif return result; } @@ -567,6 +572,9 @@ GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept return std::numeric_limits::quiet_NaN(); } float result; +#ifdef GPUCA_GPUCODE + result = 0; // Fixme: implement memcpy +#else char* const first = reinterpret_cast(&result); char* const second = first + sizeof(uint16_t); #ifdef __cpp_if_constexpr @@ -581,6 +589,7 @@ GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept std::memcpy(first, &val, sizeof(uint16_t)); std::memset(second, 0, sizeof(uint16_t)); } +#endif return result; } @@ -872,5 +881,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType } // namespace o2 - #endif \ No newline at end of file diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 
df0a621a49235..17b66c14c838d 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -276,15 +276,11 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation + O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) - if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") - add_compile_definitions(GPUCA_HAS_ONNX=1) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX) - target_link_libraries(${targetName} PUBLIC O2::ML) - else() - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) - endif() + add_compile_definitions(GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX) o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} @@ -350,7 +346,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") LABELS its COMPILE_ONLY) add_subdirectory(Interface) - endif() # Main CMake part for Standalone @@ -422,4 +417,4 @@ endif() if(${GPUCA_NO_FAST_MATH}) target_compile_definitions(${targetName} PUBLIC GPUCA_NO_FAST_MATH) -endif() +endif() \ No newline at end of file diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index e3a2528f375e0..b01377145e2e6 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -81,7 +81,6 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 448 - #define GPUCA_LB_GPUTPCNNClusterizerKernels 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -148,7 +147,6 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 512 #define GPUCA_LB_GPUTPCCFDeconvolution 512 #define GPUCA_LB_GPUTPCCFClusterizer 512 - #define GPUCA_LB_GPUTPCNNClusterizerKernels 512 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 5 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 @@ -215,7 +213,6 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression 448 #define GPUCA_LB_GPUTPCCFDeconvolution 384 #define GPUCA_LB_GPUTPCCFClusterizer 448 - #define GPUCA_LB_GPUTPCNNClusterizerKernels 448 #define GPUCA_LB_COMPRESSION_GATHER 1024 #define GPUCA_NEIGHBOURS_FINDER_MAX_NNEIGHUP 4 #define GPUCA_TRACKLET_SELECTOR_HITS_REG_SIZE 20 diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0b4aae4970b3..757b9c7b2982e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -858,7 +858,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { uint32_t iSector = iSectorBase + lane; GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; +#ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; +#endif GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; if (doGPU) { @@ -929,7 +931,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.inferenceNetworkClass(clustererNN, iSize, evalDtype, batchStart); + nnApplication.inferenceNetwork(clustererNN.model_class, clustererNN, iSize, clusterer.modelProbabilities); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels } else { @@ -937,10 +939,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.inferenceNetworkReg1(clustererNN, iSize, evalDtype, batchStart); + nnApplication.inferenceNetwork(clustererNN.model_reg_1, clustererNN, iSize, clusterer.outputDataReg1); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.inferenceNetworkReg2(clustererNN, iSize, evalDtype, batchStart); + nnApplication.inferenceNetwork(clustererNN.model_reg_2, clustererNN, iSize, clusterer.outputDataReg2); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } @@ -1168,4 +1170,4 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif return 0; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 1aeae812f5193..7bf53b4878233 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -25,6 +25,8 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; +#include "GPUTPCCFClusterizer.inc" + template <> GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { @@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); -} - -GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) -{ - uint32_t idx = get_global_id(0); - - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. - ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; - Charge charge = chargeMap[pos].unpack(); - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(pos, charge)); - - buildCluster( - calib, - chargeMap, - pos, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (idx >= clusternum) { - return; - } - if (fragment.isOverlap(pos.time())) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); - - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - - uint32_t rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterInner( - const GPUSettingsRec& calib, - uint16_t lid, - uint16_t N, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc, - uint8_t* innerAboveThreshold) -{ - uint8_t aboveThreshold = 0; - - GPUCA_UNROLL(U(), U()) - for (uint16_t i = 0; i < N; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - - PackedCharge p = buf[N * lid + i]; - - Charge q = cluster->updateInner(p, d); - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - - aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); - } - - innerAboveThreshold[lid] = aboveThreshold; - - GPUbarrier(); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( - uint16_t lid, - uint16_t N, - uint16_t M, - uint16_t offset, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc) -{ - GPUCA_UNROLL(U(), U()) - for (uint16_t i = offset; i < M + offset; i++) { - PackedCharge p = buf[N * lid + i]; - - Delta2 d = cfconsts::OuterNeighbors[i]; - - Charge q = cluster->updateOuter(p, d); - static_cast(q); // Avoid unused varible warning on GPU. 
- - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - } -} - -GPUdii() void GPUTPCCFClusterizer::buildCluster( - const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, - PackedCharge* buf, - uint8_t* innerAboveThreshold, - ClusterAccumulator* myCluster, - MCLabelAccumulator* labelAcc) -{ - uint16_t ll = get_local_id(0); - - posBcast[ll] = pos; - GPUbarrier(); - - CfUtils::blockLoad( - chargeMap, - SCRATCH_PAD_WORK_GROUP_SIZE, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 8, - cfconsts::InnerNeighbors, - posBcast, - buf); - updateClusterInner( - calib, - ll, - 8, - buf, - pos, - myCluster, - labelAcc, - innerAboveThreshold); - - uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; - - bool inGroup1 = ll < wgSizeHalf; - - uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); - - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast, - innerAboveThreshold, - buf); - - if (inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } - -#if defined(GPUCA_GPUCODE) - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast + wgSizeHalf, - innerAboveThreshold + wgSizeHalf, - buf); - if (!inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } -#endif -} - -GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) -{ - uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); - if (index < maxElemsPerBucket) { - buckets[maxElemsPerBucket * row + index] = cluster; - } else { - clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); - CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); - } - return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc new file mode 100644 index 0000000000000..443de3585a499 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -0,0 +1,249 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
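+// This include file carries the cluster-building implementation (computeClustersImpl, buildCluster,
+// updateClusterInner/Outer, sortIntoBuckets) that previously lived in GPUTPCCFClusterizer.cxx, so that
+// both the default and the NN clusterizer kernels can compile it:
+//   #include "GPUTPCCFClusterizer.inc"   // in GPUTPCCFClusterizer.cxx
+//   #ifdef GPUCA_GPUCODE
+//   #include "GPUTPCCFClusterizer.inc"   // in GPUTPCNNClusterizerKernels.cxx
+//   #endif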
+ +/// \file GPUTPCCFClusterizer.cxx +/// \author Felix Weiglhofer + +#ifndef O2_GPU_CLUSTERIZER_INC_H +#define O2_GPU_CLUSTERIZER_INC_H + +GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint32_t clusternum, + uint32_t maxClusterPerRow, + uint32_t* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint32_t* clusterPosInRow) +{ + uint32_t idx = get_global_id(0); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. + ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + Charge charge = chargeMap[pos].unpack(); + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(pos, charge)); + + buildCluster( + calib, + chargeMap, + pos, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (idx >= clusternum) { + return; + } + if (fragment.isOverlap(pos.time())) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + + uint32_t rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + pos.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; + } + + CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterInner( + const GPUSettingsRec& calib, + uint16_t lid, + uint16_t N, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc, + uint8_t* innerAboveThreshold) +{ + uint8_t aboveThreshold = 0; + + GPUCA_UNROLL(U(), U()) + for (uint16_t i = 0; i < N; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + + PackedCharge p = buf[N * lid + i]; + + Charge q = cluster->updateInner(p, d); + + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + + aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); + } + + innerAboveThreshold[lid] = aboveThreshold; + + GPUbarrier(); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( + uint16_t lid, + uint16_t N, + uint16_t M, + uint16_t offset, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc) +{ + GPUCA_UNROLL(U(), U()) + for (uint16_t i = offset; i < M + offset; i++) { + PackedCharge p = buf[N * lid + i]; + + Delta2 d = cfconsts::OuterNeighbors[i]; + + Charge q = cluster->updateOuter(p, d); + static_cast(q); // Avoid unused varible warning on GPU. 
+ + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCCFClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uint8_t* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + uint16_t ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} + +#endif // O2_GPU_CLUSTERIZER_INC_H \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h index 994cd4a66e83f..f59102aa6b5c3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCClusterFinder.h @@ -19,7 +19,6 @@ #include "GPUProcessor.h" #include "GPUDataTypes.h" #include "CfFragment.h" -#include "ML/3rdparty/GPUORTFloat16.h" namespace o2 { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 16120bced1917..1265f6821e1ef 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -44,9 +44,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); } } - - LOG(info) << "Alloc mem: " << nnClusterizerBatchedMode * nnClusterizerElementSize << " " << nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes << " " << mem; - return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 131ce79cf0a45..ed4607d412746 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ 
-54,30 +54,11 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -// Apply the neural network to the input data. Note: These are not GPU kernels. We let ONNX take care of that -void GPUTPCNNClusterizerHost::inferenceNetworkClass(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output) { if (dtype == 0) { - model_class.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); + model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); } else { - model_class.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.modelProbabilities); - } -} - -void GPUTPCNNClusterizerHost::inferenceNetworkReg1(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) -{ - if (dtype == 0) { - model_reg_1.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); - } else { - model_reg_1.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg1); - } -} - -void GPUTPCNNClusterizerHost::inferenceNetworkReg2(GPUTPCNNClusterizer& clusterer, size_t currentSize, int8_t dtype, uint batch_idx) -{ - if (dtype == 0) { - model_reg_2.inference(clusterer.inputData16, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); - } else { - model_reg_2.inference(clusterer.inputData32, currentSize * clusterer.nnClusterizerElementSize, clusterer.outputDataReg2); + model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); } } \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index b71e3816ca892..14a256b5fa95b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -38,9 +38,8 @@ class GPUTPCNNClusterizerHost public: GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void inferenceNetworkClass(GPUTPCNNClusterizer&, size_t, int8_t, uint); - void inferenceNetworkReg1(GPUTPCNNClusterizer&, size_t, int8_t, uint); - void inferenceNetworkReg2(GPUTPCNNClusterizer&, size_t, int8_t, uint); + + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 60d6dde759518..adcea7d4a1076 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -15,17 +15,17 @@ #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCCFClusterizer.h" +using namespace o2::gpu; +using namespace o2::gpu::tpccf; + #include "CfConsts.h" #include "CfUtils.h" #include "ClusterAccumulator.h" -#if !defined(GPUCA_GPUCODE) -#include "GPUHostDataTypes.h" -#include "MCLabelAccumulator.h" +#include "ML/3rdparty/GPUORTFloat16.h" +#ifdef GPUCA_GPUCODE +#include "GPUTPCCFClusterizer.inc" #endif -using namespace o2::gpu; -using 
namespace o2::gpu::tpccf; - // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) @@ -139,7 +139,9 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.centralCharges[glo_idx] = central_charge; int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); +#ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); +#endif for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) { bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r, clusterer.Param().tpcGeometry); @@ -148,7 +150,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && std::abs(p) < 3 && std::abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 7c669b3b25c10..c2eaaf6ee1f4e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -62,11 +62,12 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate template GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + private: + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); - private: static GPUd() int padOffset(int, int, const GPUTPCGeometry&); static GPUd() int rowOffset(int, int); static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); From 248f9c90aa0cea1d90a392350bffc97f534c2590 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 12 Mar 2025 09:42:10 +0000 Subject: [PATCH 56/77] Please consider the following formatting changes --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 590 +++++++++--------- .../GPUTPCNNClusterizerKernels.h | 1 - 2 files changed, 295 insertions(+), 296 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 819e6c8da7594..b66fde4628c2b 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -562,18 +562,18 @@ GPUd() inline uint16_t 
BFloat16Impl::ToUint16Impl(float v) noexcept result = get_msb_half(F32); } #endif - return result; -} + return result; + } -template -GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept -{ - if (IsNaN()) { - return std::numeric_limits::quiet_NaN(); - } - float result; + template + GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept + { + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; #ifdef GPUCA_GPUCODE - result = 0; // Fixme: implement memcpy + result = 0; // Fixme: implement memcpy #else char* const first = reinterpret_cast(&result); char* const second = first + sizeof(uint16_t); @@ -590,295 +590,295 @@ GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept std::memset(second, 0, sizeof(uint16_t)); } #endif - return result; -} - -/** \brief IEEE 754 half-precision floating point data type - * - * \details This struct is used for converting float to float16 and back - * so the user could feed inputs and fetch outputs using these type. - * - * The size of the structure should align with uint16_t and one can freely cast - * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. - * - * \code{.unparsed} - * // This example demonstrates converion from float to float16 - * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; - * std::vector fp16_values; - * fp16_values.reserve(std::size(values)); - * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), - * [](float value) { return Ort::Float16_t(value); }); - * - * \endcode - */ -struct Float16_t : OrtDataType::Float16Impl { - private: - /// - /// Constructor from a 16-bit representation of a float16 value - /// No conversion is done here. - /// - /// 16-bit representation - constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } - - public: - using Base = OrtDataType::Float16Impl; - - /// - /// Default constructor - /// - GPUdDefault() Float16_t() = default; - - /// - /// Explicit conversion to uint16_t representation of float16. - /// - /// uint16_t bit representation of float16 - /// new instance of Float16_t - GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } - - /// - /// __ctor from float. Float is converted into float16 16-bit representation. - /// - /// float value - GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } - - /// - /// Converts float16 to float - /// - /// float representation of float16 value - GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } - - /// - /// Checks if the value is negative - /// - /// true if negative - using Base::IsNegative; - - /// - /// Tests if the value is NaN - /// - /// true if NaN - using Base::IsNaN; - - /// - /// Tests if the value is finite - /// - /// true if finite - using Base::IsFinite; - - /// - /// Tests if the value represents positive infinity. - /// - /// true if positive infinity - using Base::IsPositiveInfinity; - - /// - /// Tests if the value represents negative infinity - /// - /// true if negative infinity - using Base::IsNegativeInfinity; - - /// - /// Tests if the value is either positive or negative infinity. - /// - /// True if absolute value is infinity - using Base::IsInfinity; - - /// - /// Tests if the value is NaN or zero. Useful for comparisons. - /// - /// True if NaN or zero. - using Base::IsNaNOrZero; - - /// - /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). 
- /// - /// True if so - using Base::IsNormal; - - /// - /// Tests if the value is subnormal (denormal). - /// - /// True if so - using Base::IsSubnormal; - - /// - /// Creates an instance that represents absolute value. - /// - /// Absolute value - using Base::Abs; - - /// - /// Creates a new instance with the sign flipped. - /// - /// Flipped sign instance - using Base::Negate; - - /// - /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check - /// for two values by or'ing the private bits together and stripping the sign. They are both zero, - /// and therefore equivalent, if the resulting value is still zero. - /// - /// first value - /// second value - /// True if both arguments represent zero - using Base::AreZero; - - /// - /// User defined conversion operator. Converts Float16_t to float. - /// - explicit operator float() const noexcept { return ToFloat(); } - - using Base::operator==; - using Base::operator!=; - using Base::operator<; -}; - -static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); - -/** \brief bfloat16 (Brain Floating Point) data type - * - * \details This struct is used for converting float to bfloat16 and back - * so the user could feed inputs and fetch outputs using these type. - * - * The size of the structure should align with uint16_t and one can freely cast - * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. - * - * \code{.unparsed} - * // This example demonstrates converion from float to float16 - * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; - * std::vector bfp16_values; - * bfp16_values.reserve(std::size(values)); - * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), - * [](float value) { return Ort::BFloat16_t(value); }); - * - * \endcode - */ -struct BFloat16_t : OrtDataType::BFloat16Impl { - private: - /// - /// Constructor from a uint16_t representation of bfloat16 - /// used in FromBits() to escape overload resolution issue with - /// constructor from float. - /// No conversion is done. - /// - /// 16-bit bfloat16 value - constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } - - public: - using Base = OrtDataType::BFloat16Impl; - - GPUdDefault() BFloat16_t() = default; - - /// - /// Explicit conversion to uint16_t representation of bfloat16. - /// - /// uint16_t bit representation of bfloat16 - /// new instance of BFloat16_t - GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } - - /// - /// __ctor from float. Float is converted into bfloat16 16-bit representation. - /// - /// float value - GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } - - /// - /// Converts bfloat16 to float - /// - /// float representation of bfloat16 value - GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } - - /// - /// Checks if the value is negative - /// - /// true if negative - using Base::IsNegative; - - /// - /// Tests if the value is NaN - /// - /// true if NaN - using Base::IsNaN; - - /// - /// Tests if the value is finite - /// - /// true if finite - using Base::IsFinite; - - /// - /// Tests if the value represents positive infinity. - /// - /// true if positive infinity - using Base::IsPositiveInfinity; - - /// - /// Tests if the value represents negative infinity - /// - /// true if negative infinity - using Base::IsNegativeInfinity; - - /// - /// Tests if the value is either positive or negative infinity. 
- /// - /// True if absolute value is infinity - using Base::IsInfinity; - - /// - /// Tests if the value is NaN or zero. Useful for comparisons. - /// - /// True if NaN or zero. - using Base::IsNaNOrZero; - - /// - /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). - /// - /// True if so - using Base::IsNormal; - - /// - /// Tests if the value is subnormal (denormal). - /// - /// True if so - using Base::IsSubnormal; - - /// - /// Creates an instance that represents absolute value. - /// - /// Absolute value - using Base::Abs; - - /// - /// Creates a new instance with the sign flipped. - /// - /// Flipped sign instance - using Base::Negate; - - /// - /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check - /// for two values by or'ing the private bits together and stripping the sign. They are both zero, - /// and therefore equivalent, if the resulting value is still zero. - /// - /// first value - /// second value - /// True if both arguments represent zero - using Base::AreZero; + return result; + } - /// - /// User defined conversion operator. Converts BFloat16_t to float. - /// - explicit operator float() const noexcept { return ToFloat(); } + /** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ + struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. + /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + GPUdDefault() Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. 
+ /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; + }; - // We do not have an inherited impl for the below operators - // as the internal class implements them a little differently - bool operator==(const BFloat16_t& rhs) const noexcept; - bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } - bool operator<(const BFloat16_t& rhs) const noexcept; -}; + static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + + /** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ + struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + GPUdDefault() BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. 
+ /// + /// float value + GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; + }; -static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); -} // namespace OrtDataType + } // namespace OrtDataType } // namespace o2 #endif \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index c2eaaf6ee1f4e..8ef41e35a7e21 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -63,7 +63,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); private: - static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); From bd3c8d1ea0a90558da11ccc81ef95e9e9a80ef8a Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 12 Mar 2025 11:10:57 +0100 Subject: [PATCH 57/77] Merging dev and adjusting build issues --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 6 +++--- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 ++++++ 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 757b9c7b2982e..106a74ef6d433 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -931,7 +931,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.inferenceNetwork(clustererNN.model_class, clustererNN, iSize, clusterer.modelProbabilities); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, evalDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels } else { @@ -939,10 +939,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.inferenceNetwork(clustererNN.model_reg_1, clustererNN, iSize, clusterer.outputDataReg1); + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1 if 
(nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.inferenceNetwork(clustererNN.model_reg_2, clustererNN, iSize, clusterer.outputDataReg2); + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, evalDtype); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2 } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ed4607d412746..e64336016021f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -54,7 +54,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) { if (dtype == 0) { model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 14a256b5fa95b..1ba34aa370330 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -39,7 +39,7 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output); + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index adcea7d4a1076..d23a1987215c9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -22,6 +22,12 @@ using namespace o2::gpu::tpccf; #include "CfUtils.h" #include "ClusterAccumulator.h" #include "ML/3rdparty/GPUORTFloat16.h" + +#if !defined(GPUCA_GPUCODE) +#include "GPUHostDataTypes.h" +#include "MCLabelAccumulator.h" +#endif + #ifdef GPUCA_GPUCODE #include "GPUTPCCFClusterizer.inc" #endif From cc6c05c81c12f6a301a2f2b885cb9d85dc139154 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 12 Mar 2025 11:47:20 +0100 Subject: [PATCH 58/77] Adjusting for comments --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 24 ++++++++++--------- .../Global/GPUChainTrackingClusterizer.cxx | 4 +++- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index b66fde4628c2b..e4969005ab23a 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -275,7 +275,7 @@ union float32_bits { }; // namespace detail template -GPUd() inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +GPUdi() constexpr uint16_t 
Float16Impl::ToUint16Impl(float v) noexcept { detail::float32_bits f{}; f.f = v; @@ -324,7 +324,7 @@ GPUd() inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noe } template -GPUd() inline float Float16Impl::ToFloatImpl() const noexcept +GPUdi() float Float16Impl::ToFloatImpl() const noexcept { constexpr detail::float32_bits magic = {113 << 23}; constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift @@ -528,7 +528,7 @@ struct BFloat16Impl { }; template -GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept { uint16_t result; if (o2::gpu::CAMath::IsNaN(v)) { @@ -537,7 +537,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept auto get_msb_half = [](float fl) { uint16_t result; #ifdef GPUCA_GPUCODE - result = 0; + o2::gpu::CAMath::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); #else #ifdef __cpp_if_constexpr if constexpr (detail::endian::native == detail::endian::little) @@ -547,6 +547,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept { std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); } else { + static_assert(false, "ERROR!!!"); std::memcpy(&result, &fl, sizeof(uint16_t)); } return result; @@ -566,17 +567,18 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept } template - GPUd() inline float BFloat16Impl::ToFloatImpl() const noexcept + GPUdi() float BFloat16Impl::ToFloatImpl() const noexcept { if (IsNaN()) { - return std::numeric_limits::quiet_NaN(); + return o2::gpu::CAMath::QuietNaN(); } float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); #ifdef GPUCA_GPUCODE - result = 0; // Fixme: implement memcpy + first[0] = first[1] = 0; + o2::gpu::CAMath::memcpy(second, &val, sizeof(uint16_t)); #else - char* const first = reinterpret_cast(&result); - char* const second = first + sizeof(uint16_t); #ifdef __cpp_if_constexpr if constexpr (detail::endian::native == detail::endian::little) #else @@ -726,7 +728,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept /// /// User defined conversion operator. Converts Float16_t to float. /// - explicit operator float() const noexcept { return ToFloat(); } + GPUdi() explicit operator float() const noexcept { return ToFloat(); } using Base::operator==; using Base::operator!=; @@ -867,7 +869,7 @@ GPUd() inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept /// /// User defined conversion operator. Converts BFloat16_t to float. 
/// - explicit operator float() const noexcept { return ToFloat(); } + GPUdi() explicit operator float() const noexcept { return ToFloat(); } // We do not have an inherited impl for the below operators // as the internal class implements them a little differently diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 106a74ef6d433..af571d5bdf0e1 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -899,6 +899,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } + int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + clustererNN.nnClusterizerDtype = evalDtype; + // Settings for the NN evaluation clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; @@ -920,7 +923,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } float time_clusterizer = 0, time_fill = 0; - int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; From 80f818dea14bc059c0189c29b341ba6d1dc8929a Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 12 Mar 2025 13:54:26 +0100 Subject: [PATCH 59/77] Fixing incorrect #endif --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 3 +-- GPU/GPUTracking/CMakeLists.txt | 24 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index e4969005ab23a..b09b0e965a213 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -547,9 +547,9 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept { std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); } else { - static_assert(false, "ERROR!!!"); std::memcpy(&result, &fl, sizeof(uint16_t)); } +#endif return result; }; @@ -562,7 +562,6 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept U32 += (upper_bits & 1) + kRoundToNearest; result = get_msb_half(F32); } -#endif return result; } diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 17b66c14c838d..5b73863da66b9 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -268,19 +268,19 @@ unset(HDRS_TMP) if(ALIGPU_BUILD_TYPE STREQUAL "O2") o2_add_library(GPUDataTypes - TARGETVARNAME targetName - PUBLIC_INCLUDE_DIRECTORIES . - Definitions - DataTypes - PUBLIC_LINK_LIBRARIES O2::GPUUtils - O2::GPUCommon - O2::ReconstructionDataFormats - O2::TPCFastTransformation - O2::ML - PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC - SOURCES ${SRCS_DATATYPES}) + TARGETVARNAME targetName + PUBLIC_INCLUDE_DIRECTORIES . 
+ Definitions + DataTypes + PUBLIC_LINK_LIBRARIES O2::GPUUtils + O2::GPUCommon + O2::ReconstructionDataFormats + O2::TPCFastTransformation + O2::ML + PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC + SOURCES ${SRCS_DATATYPES}) add_compile_definitions(GPUCA_HAS_ONNX=1) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} From ac6105237044d09f0a30a54b91719dd94e2a6e94 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 12 Mar 2025 12:55:12 +0000 Subject: [PATCH 60/77] Please consider the following formatting changes --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 596 +++++++++--------- 1 file changed, 298 insertions(+), 298 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index b09b0e965a213..76fd6734cf9db 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -562,21 +562,21 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept U32 += (upper_bits & 1) + kRoundToNearest; result = get_msb_half(F32); } - return result; - } + return result; +} - template - GPUdi() float BFloat16Impl::ToFloatImpl() const noexcept - { - if (IsNaN()) { - return o2::gpu::CAMath::QuietNaN(); - } - float result; - char* const first = reinterpret_cast(&result); - char* const second = first + sizeof(uint16_t); +template +GPUdi() float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return o2::gpu::CAMath::QuietNaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); #ifdef GPUCA_GPUCODE - first[0] = first[1] = 0; - o2::gpu::CAMath::memcpy(second, &val, sizeof(uint16_t)); + first[0] = first[1] = 0; + o2::gpu::CAMath::memcpy(second, &val, sizeof(uint16_t)); #else #ifdef __cpp_if_constexpr if constexpr (detail::endian::native == detail::endian::little) @@ -591,295 +591,295 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept std::memset(second, 0, sizeof(uint16_t)); } #endif - return result; - } + return result; +} - /** \brief IEEE 754 half-precision floating point data type - * - * \details This struct is used for converting float to float16 and back - * so the user could feed inputs and fetch outputs using these type. - * - * The size of the structure should align with uint16_t and one can freely cast - * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. - * - * \code{.unparsed} - * // This example demonstrates converion from float to float16 - * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; - * std::vector fp16_values; - * fp16_values.reserve(std::size(values)); - * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), - * [](float value) { return Ort::Float16_t(value); }); - * - * \endcode - */ - struct Float16_t : OrtDataType::Float16Impl { - private: - /// - /// Constructor from a 16-bit representation of a float16 value - /// No conversion is done here. - /// - /// 16-bit representation - constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } - - public: - using Base = OrtDataType::Float16Impl; - - /// - /// Default constructor - /// - GPUdDefault() Float16_t() = default; - - /// - /// Explicit conversion to uint16_t representation of float16. 
- /// - /// uint16_t bit representation of float16 - /// new instance of Float16_t - GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } - - /// - /// __ctor from float. Float is converted into float16 16-bit representation. - /// - /// float value - GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } - - /// - /// Converts float16 to float - /// - /// float representation of float16 value - GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } - - /// - /// Checks if the value is negative - /// - /// true if negative - using Base::IsNegative; - - /// - /// Tests if the value is NaN - /// - /// true if NaN - using Base::IsNaN; - - /// - /// Tests if the value is finite - /// - /// true if finite - using Base::IsFinite; - - /// - /// Tests if the value represents positive infinity. - /// - /// true if positive infinity - using Base::IsPositiveInfinity; - - /// - /// Tests if the value represents negative infinity - /// - /// true if negative infinity - using Base::IsNegativeInfinity; - - /// - /// Tests if the value is either positive or negative infinity. - /// - /// True if absolute value is infinity - using Base::IsInfinity; - - /// - /// Tests if the value is NaN or zero. Useful for comparisons. - /// - /// True if NaN or zero. - using Base::IsNaNOrZero; - - /// - /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). - /// - /// True if so - using Base::IsNormal; - - /// - /// Tests if the value is subnormal (denormal). - /// - /// True if so - using Base::IsSubnormal; - - /// - /// Creates an instance that represents absolute value. - /// - /// Absolute value - using Base::Abs; - - /// - /// Creates a new instance with the sign flipped. - /// - /// Flipped sign instance - using Base::Negate; - - /// - /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check - /// for two values by or'ing the private bits together and stripping the sign. They are both zero, - /// and therefore equivalent, if the resulting value is still zero. - /// - /// first value - /// second value - /// True if both arguments represent zero - using Base::AreZero; - - /// - /// User defined conversion operator. Converts Float16_t to float. - /// - GPUdi() explicit operator float() const noexcept { return ToFloat(); } - - using Base::operator==; - using Base::operator!=; - using Base::operator<; - }; +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. 
+ /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } - static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); - - /** \brief bfloat16 (Brain Floating Point) data type - * - * \details This struct is used for converting float to bfloat16 and back - * so the user could feed inputs and fetch outputs using these type. - * - * The size of the structure should align with uint16_t and one can freely cast - * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. - * - * \code{.unparsed} - * // This example demonstrates converion from float to float16 - * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; - * std::vector bfp16_values; - * bfp16_values.reserve(std::size(values)); - * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), - * [](float value) { return Ort::BFloat16_t(value); }); - * - * \endcode - */ - struct BFloat16_t : OrtDataType::BFloat16Impl { - private: - /// - /// Constructor from a uint16_t representation of bfloat16 - /// used in FromBits() to escape overload resolution issue with - /// constructor from float. - /// No conversion is done. - /// - /// 16-bit bfloat16 value - constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } - - public: - using Base = OrtDataType::BFloat16Impl; - - GPUdDefault() BFloat16_t() = default; - - /// - /// Explicit conversion to uint16_t representation of bfloat16. - /// - /// uint16_t bit representation of bfloat16 - /// new instance of BFloat16_t - GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } - - /// - /// __ctor from float. Float is converted into bfloat16 16-bit representation. - /// - /// float value - GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } - - /// - /// Converts bfloat16 to float - /// - /// float representation of bfloat16 value - GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } - - /// - /// Checks if the value is negative - /// - /// true if negative - using Base::IsNegative; - - /// - /// Tests if the value is NaN - /// - /// true if NaN - using Base::IsNaN; - - /// - /// Tests if the value is finite - /// - /// true if finite - using Base::IsFinite; - - /// - /// Tests if the value represents positive infinity. - /// - /// true if positive infinity - using Base::IsPositiveInfinity; - - /// - /// Tests if the value represents negative infinity - /// - /// true if negative infinity - using Base::IsNegativeInfinity; - - /// - /// Tests if the value is either positive or negative infinity. - /// - /// True if absolute value is infinity - using Base::IsInfinity; - - /// - /// Tests if the value is NaN or zero. Useful for comparisons. - /// - /// True if NaN or zero. - using Base::IsNaNOrZero; - - /// - /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). - /// - /// True if so - using Base::IsNormal; - - /// - /// Tests if the value is subnormal (denormal). - /// - /// True if so - using Base::IsSubnormal; - - /// - /// Creates an instance that represents absolute value. - /// - /// Absolute value - using Base::Abs; - - /// - /// Creates a new instance with the sign flipped. - /// - /// Flipped sign instance - using Base::Negate; - - /// - /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check - /// for two values by or'ing the private bits together and stripping the sign. 
They are both zero, - /// and therefore equivalent, if the resulting value is still zero. - /// - /// first value - /// second value - /// True if both arguments represent zero - using Base::AreZero; - - /// - /// User defined conversion operator. Converts BFloat16_t to float. - /// - GPUdi() explicit operator float() const noexcept { return ToFloat(); } - - // We do not have an inherited impl for the below operators - // as the internal class implements them a little differently - bool operator==(const BFloat16_t& rhs) const noexcept; - bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } - bool operator<(const BFloat16_t& rhs) const noexcept; - }; + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + GPUdDefault() Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + GPUd() constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + GPUd() explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. 
+ /// + GPUdi() explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + GPUdDefault() BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + GPUd() static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + GPUd() explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + GPUd() float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. 
They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. + /// + GPUdi() explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; - static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); - } // namespace OrtDataType +} // namespace OrtDataType } // namespace o2 #endif \ No newline at end of file From c03a60e1ab123900ccdef821b940b6cb4f602784 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 13 Mar 2025 09:27:12 +0100 Subject: [PATCH 61/77] Fix indentation, remove duplicate define --- GPU/GPUTracking/CMakeLists.txt | 66 +++++++++++++++++----------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 5b73863da66b9..ccb52408a4b89 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -162,35 +162,35 @@ set(HDRS_INSTALL set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) set(SRCS_NO_CINT ${SRCS_NO_CINT} - Global/GPUChainITS.cxx - ITS/GPUITSFitter.cxx - ITS/GPUITSFitterKernels.cxx - dEdx/GPUdEdx.cxx - TPCConvert/GPUTPCConvert.cxx - TPCConvert/GPUTPCConvertKernel.cxx - DataCompression/GPUTPCCompression.cxx - DataCompression/GPUTPCCompressionTrackModel.cxx - DataCompression/GPUTPCCompressionKernels.cxx - DataCompression/GPUTPCDecompression.cxx - DataCompression/GPUTPCDecompressionKernels.cxx - DataCompression/TPCClusterDecompressor.cxx - DataCompression/GPUTPCClusterStatistics.cxx - TPCClusterFinder/GPUTPCClusterFinder.cxx - TPCClusterFinder/ClusterAccumulator.cxx - TPCClusterFinder/MCLabelAccumulator.cxx - TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx - TPCClusterFinder/GPUTPCCFStreamCompaction.cxx - TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx - TPCClusterFinder/GPUTPCCFPeakFinder.cxx - TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx - TPCClusterFinder/GPUTPCCFClusterizer.cxx - TPCClusterFinder/GPUTPCCFDeconvolution.cxx - TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx - TPCClusterFinder/GPUTPCCFDecodeZS.cxx - TPCClusterFinder/GPUTPCCFGather.cxx - Refit/GPUTrackingRefit.cxx - Refit/GPUTrackingRefitKernel.cxx - Merger/GPUTPCGMO2Output.cxx) + Global/GPUChainITS.cxx + ITS/GPUITSFitter.cxx + ITS/GPUITSFitterKernels.cxx + dEdx/GPUdEdx.cxx + TPCConvert/GPUTPCConvert.cxx + TPCConvert/GPUTPCConvertKernel.cxx + DataCompression/GPUTPCCompression.cxx + DataCompression/GPUTPCCompressionTrackModel.cxx + DataCompression/GPUTPCCompressionKernels.cxx + DataCompression/GPUTPCDecompression.cxx + DataCompression/GPUTPCDecompressionKernels.cxx + DataCompression/TPCClusterDecompressor.cxx + DataCompression/GPUTPCClusterStatistics.cxx + TPCClusterFinder/GPUTPCClusterFinder.cxx + TPCClusterFinder/ClusterAccumulator.cxx + TPCClusterFinder/MCLabelAccumulator.cxx + TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx + TPCClusterFinder/GPUTPCCFStreamCompaction.cxx + TPCClusterFinder/GPUTPCCFChargeMapFiller.cxx + TPCClusterFinder/GPUTPCCFPeakFinder.cxx + 
TPCClusterFinder/GPUTPCCFNoiseSuppression.cxx + TPCClusterFinder/GPUTPCCFClusterizer.cxx + TPCClusterFinder/GPUTPCCFDeconvolution.cxx + TPCClusterFinder/GPUTPCCFMCLabelFlattener.cxx + TPCClusterFinder/GPUTPCCFDecodeZS.cxx + TPCClusterFinder/GPUTPCCFGather.cxx + Refit/GPUTrackingRefit.cxx + Refit/GPUTrackingRefitKernel.cxx + Merger/GPUTPCGMO2Output.cxx) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx) @@ -266,12 +266,11 @@ unset(HDRS_TMP) # Main CMake part for O2 if(ALIGPU_BUILD_TYPE STREQUAL "O2") - o2_add_library(GPUDataTypes TARGETVARNAME targetName PUBLIC_INCLUDE_DIRECTORIES . - Definitions - DataTypes + Definitions + DataTypes PUBLIC_LINK_LIBRARIES O2::GPUUtils O2::GPUCommon O2::ReconstructionDataFormats @@ -279,7 +278,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) - add_compile_definitions(GPUCA_HAS_ONNX=1) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(GPUDataTypes @@ -417,4 +415,4 @@ endif() if(${GPUCA_NO_FAST_MATH}) target_compile_definitions(${targetName} PUBLIC GPUCA_NO_FAST_MATH) -endif() \ No newline at end of file +endif() From 207ba9c1b0003e171030059ff1f3592a1b7b78e1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 11:03:41 +0100 Subject: [PATCH 62/77] Fixing one memory issue. Segfault / memory leak persists --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 3 ++- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index af571d5bdf0e1..d87716107c217 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -893,6 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = clusterer.mPmemory->counters.nClusters; if (nn_settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { @@ -962,7 +963,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if (clustererNN.nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", slice: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 1265f6821e1ef..6a9b6f546ae07 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -32,7 +32,7 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); if (nnClusterizerModelClassNumOutputNodes > 0) { computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index d0f3da460fee0..6b628132c17b5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -46,6 +46,7 @@ class GPUTPCNNClusterizer : public GPUProcessor bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; + int nnClusterizerTotalClusters = 1; int nnClusterizerVerbosity = 0; int nnClusterizerBoundaryFillValue = -1; int nnClusterizerDumpDigits = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index d23a1987215c9..3c713e0c9b9a8 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -143,6 +143,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.peakPositions[glo_idx] = peak; clustererNN.centralCharges[glo_idx] = central_charge; + clustererNN.outputDataClass[glo_idx + batchStart] = -1; int row_offset = 
GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE From 0978c1966bd161a50efdc0cc712204db2f60be06 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 11:41:36 +0100 Subject: [PATCH 63/77] Adjusting for new toNative function --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 3c713e0c9b9a8..fed0fc4d128bf 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -233,13 +233,13 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], clustererNN.outputDataReg1[model_output_index + 2], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], clustererNN.outputDataReg1[model_output_index + 3], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -313,13 +313,13 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], clustererNN.outputDataReg2[model_output_index + 4], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], clustererNN.outputDataReg2[model_output_index + 6], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -348,12 +348,12 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], static_cast(clustererNN.peakPositions[glo_idx].pad()) + 
clustererNN.outputDataReg2[model_output_index + 1], clustererNN.outputDataReg2[model_output_index + 5], - static_cast((clusterer.mPmemory->fragment).start) + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], clustererNN.outputDataReg2[model_output_index + 7], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param()); + rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; From ad9696e0bc73adbe50590baf03008768e2cd48d5 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 12:45:44 +0100 Subject: [PATCH 64/77] Fixing .finalize --- GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc index 443de3585a499..83dd3f73d8d0d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -59,10 +59,9 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t } return; } - pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), fragment.start, chargeMap); if (rejectCluster) { if (clusterPosInRow) { From 337743578f0a8535beb6d05808574f878d058b17 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 13:17:23 +0100 Subject: [PATCH 65/77] Adjusting CMakeLists and other bugs --- GPU/GPUTracking/CMakeLists.txt | 2 +- .../TPCClusterFinder/GPUTPCCFClusterizer.cxx | 233 ------------------ .../TPCClusterFinder/GPUTPCCFClusterizer.inc | 1 - .../GPUTPCNNClusterizerKernels.cxx | 3 +- 4 files changed, 3 insertions(+), 236 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index ccb52408a4b89..3408738ce89e7 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -324,7 +324,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 3787feabab2b1..2131347decec6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -36,237 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, tpc::ClusterNative* clusterOut = (onlyMC) ?
nullptr : clusterer.mPclusterByRow; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); -<<<<<<< HEAD } -======= -} - -GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) -{ - uint32_t idx = get_global_id(0); - - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. - ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; - Charge charge = chargeMap[pos].unpack(); - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(pos, charge)); - - buildCluster( - calib, - chargeMap, - pos, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (idx >= clusternum) { - return; - } - if (fragment.isOverlap(pos.time())) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), fragment.start, chargeMap); - - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - - uint32_t rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterInner( - const GPUSettingsRec& calib, - uint16_t lid, - uint16_t N, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc, - uint8_t* innerAboveThreshold) -{ - uint8_t aboveThreshold = 0; - - GPUCA_UNROLL(U(), U()) - for (uint16_t i = 0; i < N; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - - PackedCharge p = buf[N * lid + i]; - - Charge q = cluster->updateInner(p, d); - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - - aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); - } - - innerAboveThreshold[lid] = aboveThreshold; - - GPUbarrier(); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( - uint16_t lid, - uint16_t N, - uint16_t M, - uint16_t offset, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc) -{ - GPUCA_UNROLL(U(), U()) - for (uint16_t i = offset; i < M + offset; i++) { - PackedCharge p = buf[N * lid + i]; - - Delta2 d = cfconsts::OuterNeighbors[i]; - - Charge q = cluster->updateOuter(p, d); - static_cast(q); // Avoid unused varible warning on GPU. 
- - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - } -} - -GPUdii() void GPUTPCCFClusterizer::buildCluster( - const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, - PackedCharge* buf, - uint8_t* innerAboveThreshold, - ClusterAccumulator* myCluster, - MCLabelAccumulator* labelAcc) -{ - uint16_t ll = get_local_id(0); - - posBcast[ll] = pos; - GPUbarrier(); - - CfUtils::blockLoad( - chargeMap, - SCRATCH_PAD_WORK_GROUP_SIZE, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 8, - cfconsts::InnerNeighbors, - posBcast, - buf); - updateClusterInner( - calib, - ll, - 8, - buf, - pos, - myCluster, - labelAcc, - innerAboveThreshold); - - uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; - - bool inGroup1 = ll < wgSizeHalf; - - uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); - - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast, - innerAboveThreshold, - buf); - - if (inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } - -#if defined(GPUCA_GPUCODE) - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast + wgSizeHalf, - innerAboveThreshold + wgSizeHalf, - buf); - if (!inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } -#endif -} - -GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) -{ - uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); - if (index < maxElemsPerBucket) { - buckets[maxElemsPerBucket * row + index] = cluster; - } else { - clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); - CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); - } - return index; -} ->>>>>>> dev diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc index 83dd3f73d8d0d..ff07cecae241a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -59,7 +59,6 @@ GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t } return; } - tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param(), fragment.start, chargeMap); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index fed0fc4d128bf..8842b98b69c31 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -38,7 +38,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); From 9893b434b1134a408a5d7a0f7e03c0d03c1496de Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 13:53:46 +0100 Subject: [PATCH 66/77] Adding GPUCA_HAS_ONNX only to tracking --- GPU/GPUTracking/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 3408738ce89e7..7a7b211cdb406 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -275,10 
+275,9 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::GPUCommon O2::ReconstructionDataFormats O2::TPCFastTransformation - O2::ML PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} @@ -299,6 +298,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::TPCFastTransformation O2::DetectorsRaw O2::Steer + O2::ML PUBLIC_INCLUDE_DIRECTORIES . Definitions DataTypes From bce04bc7472e608bf448c94ef7be3e06e748ba1f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 15:09:30 +0100 Subject: [PATCH 67/77] Changing to fixed size for number of clusters --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index d87716107c217..483d40de6db96 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -893,7 +893,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = clusterer.mPmemory->counters.nClusters; + clustererNN.nnClusterizerTotalClusters = clusterer.mNMaxClusterPerRow; if (nn_settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { From 713dd64b1037bea80f68f31a8bb6eebfaba3a08c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 13 Mar 2025 22:46:49 +0100 Subject: [PATCH 68/77] Fixed segfault. Not producing the right number of clusters yet. 
--- GPU/GPUTracking/Base/GPUConstantMem.h | 3 +- .../Global/GPUChainTrackingClusterizer.cxx | 161 +++++++++--------- .../TPCClusterFinder/ClusterAccumulator.cxx | 49 ++++++ .../TPCClusterFinder/ClusterAccumulator.h | 1 + .../GPUTPCNNClusterizerKernels.cxx | 54 +++--- .../GPUTPCNNClusterizerKernels.h | 8 +- GPU/GPUTracking/kernels.cmake | 12 +- 7 files changed, 170 insertions(+), 118 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index 65aa0dcddf8e4..8f1cc90f5ae93 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -42,8 +42,7 @@ namespace o2::gpu { struct GPUConstantMem { GPUParam param; - GPUTPCTracker - tpcTrackers[GPUCA_NSECTORS]; + GPUTPCTracker tpcTrackers[GPUCA_NSECTORS]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; GPUTPCDecompression tpcDecompressor; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 483d40de6db96..501cfa5f2b57b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -523,7 +523,7 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec); } else { AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId); - AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); + AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); } } } else { @@ -611,6 +611,36 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } +#ifdef GPUCA_HAS_ONNX + uint32_t maxClusters = -1; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + } + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + AllocateRegisteredMemory(clustererNN.mMemoryId); + } +#endif + if (doGPU && mIOPtrs.tpcZS) { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); @@ -885,86 +915,59 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Setting some initial sizes, important for memory allocation const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = clusterer.mNMaxClusterPerRow; - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; - } - int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - clustererNN.nnClusterizerDtype = evalDtype; - - // Settings for the NN evaluation - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); - - if(fragment.index == 0){ - AllocateRegisteredMemory(clustererNN.mMemoryId); - } - - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - } - - if (clustererNN.nnSigmoidTrafoClassThreshold) { - // Inverse sigmoid transformation - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); - } - - float time_clusterizer = 0, time_fill = 0; - - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - - auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Filling the data - - auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, evalDtype); - if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels - } else { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Assigning class labels - } - if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, evalDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, batchStart); // Running the NN for regression class 2 - } - } - auto stop1 = std::chrono::high_resolution_clock::now(); - - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() 
/ 1e9; - time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; - } - - auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, iSector, evalDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - - if (clustererNN.nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; - } + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } + + if (clustererNN.nnSigmoidTrafoClassThreshold) { + // Inverse sigmoid transformation + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } + + float time_clusterizer = 0, time_fill = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + + auto start0 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + + auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } + + if (!clustererNN.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, 
clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + } + } + auto stop1 = std::chrono::high_resolution_clock::now(); + + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; + } + auto start1 = std::chrono::high_resolution_clock::now(); + if (clustererNN.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + } + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + if (clustererNN.nnClusterizerVerbosity < 3) { + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + } #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx index 77dc6e119df7d..2ddca10232b14 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.cxx @@ -119,3 +119,52 @@ GPUd() bool ClusterAccumulator::toNative(const ChargePos& pos, Charge q, tpc::Cl return true; } + +GPUd() bool ClusterAccumulator::toNativeSimple(const ChargePos& pos, Charge q, tpc::ClusterNative& cn, const GPUParam& param, const Array2D& chargeMap) +{ + cn.qTot = CAMath::Float2UIntRn(mQtot); + if (cn.qTot <= param.rec.tpc.cfQTotCutoff) { + return false; + } + cn.qMax = q; + if (cn.qMax <= param.rec.tpc.cfQMaxCutoff) { + return false; + } + if (mTimeMean < param.rec.tpc.clustersShiftTimebinsClusterizer) { + return false; + } + if (q <= param.rec.tpc.cfQMaxCutoffSingleTime && mTimeSigma == 0) { + return false; + } + if (q <= param.rec.tpc.cfQMaxCutoffSinglePad && mPadSigma == 0) { + return false; + } + + bool wasSplitInTime = mSplitInTime >= param.rec.tpc.cfMinSplitNum; + bool wasSplitInPad = mSplitInPad >= param.rec.tpc.cfMinSplitNum; + bool isSingleCluster = (mPadSigma == 0) || (mTimeSigma == 0); + + uint8_t flags = 0; + uint8_t pad = pos.pad(); + bool isEdgeCluster = pad < 2 || pad >= param.tpcGeometry.NPads(pos.row()) - 2; // Geometrical edge check, peak within 2 pads of sector edge + if (isEdgeCluster) { + bool leftEdge = (pad < 2); + if (leftEdge ? (pad == 1 && chargeMap[pos.delta({-1, 0})].unpack() < 1) : (pad == (param.tpcGeometry.NPads(pos.row()) - 2) && chargeMap[pos.delta({1, 0})].unpack() < 1)) { + isEdgeCluster = false; // No edge cluster if peak is close to edge but no charge at the edge. + } else if (leftEdge ? (pad < mPadMean) : (pad > mPadMean)) { + mPadMean = pad; // Correct to peak position if COG is close to middle of pad than peak + } + } + + flags |= (isEdgeCluster) ? tpc::ClusterNative::flagEdge : 0; + flags |= (wasSplitInTime) ? tpc::ClusterNative::flagSplitTime : 0; + flags |= (wasSplitInPad) ? 
tpc::ClusterNative::flagSplitPad : 0; + flags |= (isSingleCluster) ? tpc::ClusterNative::flagSingle : 0; + + cn.setTimeFlags(mTimeMean - param.rec.tpc.clustersShiftTimebinsClusterizer, flags); + cn.setPad(mPadMean); + cn.setSigmaTime(mTimeSigma); + cn.setSigmaPad(mPadSigma); + + return true; +} diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index 44b72ce986a1d..e66fe461d633c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -42,6 +42,7 @@ class ClusterAccumulator GPUd() tpccf::Charge updateOuter(PackedCharge, tpccf::Delta2); GPUd() bool toNative(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&, tpccf::TPCTime, const Array2D&); + GPUd() bool toNativeSimple(const ChargePos&, tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D&); GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInTime, uint8_t splitInPad) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8842b98b69c31..424166fdd161a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,11 +34,11 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - auto& clusterer = processors.tpcClusterer[sector]; - auto& clustererNN = processors.tpcNNClusterer[sector]; + auto& clusterer = processors->tpcClusterer[sector]; + auto& clustererNN = processors->tpcNNClusterer[sector]; if (clustererNN.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices return; } @@ -50,22 +50,22 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, 
GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + processors->tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors->tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors->tpcNNClusterer[sector].nnClassThreshold); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcNNClusterer[sector]; + auto& clusterer = processors->tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] @@ -82,20 +82,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + if (glo_idx >= processors->tpcClusterer[sector].mPmemory->counters.nClusters) { return; } GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + if (glo_idx >= processors->tpcClusterer[sector].mPmemory->counters.nClusters) { return; } GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); @@ -128,11 +128,11 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_ } // Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, 
int32_t iBlock, int32_t iThread, GPUConstantMem* processors, uint8_t sector, int8_t dtype, uint batchStart) { uint glo_idx = get_global_id(0); - auto& clusterer = processors.tpcClusterer[sector]; - auto& clustererNN = processors.tpcNNClusterer[sector]; + auto& clusterer = processors->tpcClusterer[sector]; + auto& clustererNN = processors->tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); Array2D isPeakMap(clusterer.mPpeakMap); @@ -192,10 +192,10 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcClusterer[sector]; - auto& clustererNN = processors.tpcNNClusterer[sector]; + auto& clusterer = processors->tpcClusterer[sector]; + auto& clustererNN = processors->tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); @@ -234,13 +234,13 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], clustererNN.outputDataReg1[model_output_index + 2], - static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], clustererNN.outputDataReg1[model_output_index + 3], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); + bool rejectCluster = !pc.toNativeSimple(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -272,10 +272,10 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcClusterer[sector]; - auto& clustererNN = processors.tpcNNClusterer[sector]; + auto& clusterer = processors->tpcClusterer[sector]; + auto& clustererNN = processors->tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); @@ -314,13 +314,13 @@ GPUd() void 
GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], clustererNN.outputDataReg2[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], clustererNN.outputDataReg2[model_output_index + 6], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); + bool rejectCluster = !pc.toNativeSimple(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -349,12 +349,12 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], clustererNN.outputDataReg2[model_output_index + 5], - static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], clustererNN.outputDataReg2[model_output_index + 7], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), (clusterer.mPmemory->fragment).start, chargeMap); + rejectCluster = !pc.toNativeSimple(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 8ef41e35a7e21..8c29c7b540ee2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -60,12 +60,12 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate }; template - GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, GPUConstantMem*, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); private: - static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); - static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); - static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + static GPUd() 
void fillInputData(int32_t, int32_t, int32_t, int32_t, GPUConstantMem*, uint8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, GPUConstantMem*, uint8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, GPUConstantMem*, uint8_t, int8_t, int8_t, uint); static GPUd() int padOffset(int, int, const GPUTPCGeometry&); static GPUd() int rowOffset(int, int); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index e628586253e17..198b48a560598 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -114,12 +114,12 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB single) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB single) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB single uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB single GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) endif() o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB single int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER" NO single) From e66efb1781924a1021f040423c0dd42f01d34a03 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 00:34:47 +0100 Subject: [PATCH 69/77] Network now accepts clusters over all sectors --- Common/ML/src/OrtInterface.cxx | 4 +- .../Global/GPUChainTrackingClusterizer.cxx | 109 ++++++++---------- 2 files changed, 53 insertions(+), 
60 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index f052e8fddd3e1..dc8d1a23b6569 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -233,10 +233,10 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(inputShape[0] * mOutputShapes[0][1]); + size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, 1); + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 501cfa5f2b57b..66ce8c49779f1 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -888,9 +888,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { uint32_t iSector = iSectorBase + lane; GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; -#ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; -#endif GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; if (doGPU) { @@ -912,62 +909,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - - // Setting some initial sizes, important for memory allocation + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - int evalDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); - - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - } - - if (clustererNN.nnSigmoidTrafoClassThreshold) { - // Inverse sigmoid transformation - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); - } - - float time_clusterizer = 0, time_fill = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - - auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data - - auto stop0 = 
std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); - if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels - } else { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels - } - - if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 - } - } - auto stop1 = std::chrono::high_resolution_clock::now(); - - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; - } - auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clustererNN.nnClusterizerVerbosity < 3) { - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; - } + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } + + float time_clusterizer = 0, time_fill = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + + auto start0 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(iSize, 
lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + + auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } + + if (!clustererNN.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + } + } + auto stop1 = std::chrono::high_resolution_clock::now(); + + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; + } + auto start1 = std::chrono::high_resolution_clock::now(); + if (clustererNN.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + } + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + if (clustererNN.nnClusterizerVerbosity < 3) { + int acceptedClusters = 0; + for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + acceptedClusters += clustererNN.outputDataClass[i]; + } + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + } #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif From 2b9b8daccdf1baa8b7902caba36a007274a77903 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 00:37:41 +0100 Subject: [PATCH 70/77] Whitespaces... 
--- .../Global/GPUChainTrackingClusterizer.cxx | 791 +++++++++++------- 1 file changed, 487 insertions(+), 304 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 66ce8c49779f1..5a7a6a9514e05 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -50,72 +50,88 @@ using namespace o2::tpc::constants; using namespace o2::dataformats; #ifdef GPUCA_TPC_GEOMETRY_O2 -std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment &fragment) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder::ZSOffset* o = processors()->tpcClusterer[iSector].mPzsOffsets; + GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder::ZSOffset *o = processors()->tpcClusterer[iSector].mPzsOffsets; uint32_t digits = 0; uint32_t pages = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) + { clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j]; - if (doGPU) { + if (doGPU) + { uint16_t posInEndpoint = 0; uint16_t pagesEndpoint = 0; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { + for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) + { const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? 
clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = pageFirst; l < pageLast; l++) { + for (uint32_t l = pageFirst; l < pageLast; l++) + { uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++]; - if (pageDigits) { + if (pageDigits) + { *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint}; digits += pageDigits; } pagesEndpoint++; } } - if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) { - if (GetProcessingSettings().ignoreNonFatalGPUErrors) { + if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) + { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) + { GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); return {0, 0}; - } else { + } + else + { GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); } } - } else { + } + else + { clusterer.mPzsOffsets[j] = GPUTPCClusterFinder::ZSOffset{digits, j, 0}; digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j]; pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j]; } } - if (doGPU) { + if (doGPU) + { pages = o - processors()->tpcClusterer[iSector].mPzsOffsets; } - if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) + { TPCClusterizerEnsureZSOffsets(iSector, fragment); } return {digits, pages}; } -void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment) +void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment &fragment) { - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; uint32_t nAdcs = 0; - for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) { - const auto& data = mCFContext->fragmentData[fragment.index]; + for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) + { + const auto &data = mCFContext->fragmentData[fragment.index]; uint32_t pagesEndpoint = 0; const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint]; const uint32_t nPagesExpected = data.nPages[iSector][endpoint]; uint32_t nAdcDecoded = 0; - const auto& zs = mIOPtrs.tpcZS->sector[iSector]; - for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) { + const auto &zs = mIOPtrs.tpcZS->sector[iSector]; + for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) + { const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0; const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? 
data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i]; - for (uint32_t j = pageFirst; j < pageLast; j++) { - const uint8_t* page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const header::RAWDataHeader* rawDataHeader = reinterpret_cast(page); - const TPCZSHDRV2* decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); + for (uint32_t j = pageFirst; j < pageLast; j++) + { + const uint8_t *page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const header::RAWDataHeader *rawDataHeader = reinterpret_cast(page); + const TPCZSHDRV2 *decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); const uint16_t nSamplesInPage = decHdr->nADCsamples; nAdcDecoded += nSamplesInPage; @@ -123,15 +139,18 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF } } - if (pagesEndpoint != nPagesExpected) { + if (pagesEndpoint != nPagesExpected) + { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %lu", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected); } - if (nAdcDecoded != nAdcsExpected) { + if (nAdcDecoded != nAdcsExpected) + { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded); } - if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) { + if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) + { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset); } @@ -141,12 +160,13 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF namespace { -struct TPCCFDecodeScanTmp { - int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; -}; + struct TPCCFDecodeScanTmp + { + int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; + }; } // namespace -std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment &fragment) { mRec->getGeneralStepTimer(GeneralStep::Prepare).Start(); uint32_t nDigits = 0; @@ -154,15 +174,20 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS]; memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples)); bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0; + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? 
o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) + : 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) + { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) { - Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) + { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) + { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) + { + Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } @@ -171,7 +196,8 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint std::vector> fragments; fragments.reserve(mCFContext->nFragments); fragments.emplace_back(std::pair{fragment, {0, 0, 0, 0, 0, -1}}); - for (uint32_t i = 1; i < mCFContext->nFragments; i++) { + for (uint32_t i = 1; i < mCFContext->nFragments; i++) + { fragments.emplace_back(std::pair{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}}); } std::vector fragmentExtends(mCFContext->nFragments, false); @@ -179,64 +205,82 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t firstPossibleFragment = 0; uint32_t pageCounter = 0; uint32_t emptyPages = 0; - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { - if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) + { + if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) + { break; } nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) + { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) { - Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) + { + Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } #endif - const uint8_t* const 
page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page; - if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) { + const uint8_t *const page = ((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader *rdh = (const o2::header::RAWDataHeader *)page; + if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) + { emptyPages++; continue; } pageCounter++; - const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); - if (mCFContext->zsVersion == -1) { + const TPCZSHDR *const hdr = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); + if (mCFContext->zsVersion == -1) + { mCFContext->zsVersion = hdr->version; - if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask + if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) + { // TODO: Move tpcTriggerHandling to recoSteps bitmask static bool errorShown = false; - if (errorShown == false) { + if (errorShown == false) + { GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion); } errorShown = true; } - } else if (mCFContext->zsVersion != (int32_t)hdr->version) { + } + else if (mCFContext->zsVersion != (int32_t)hdr->version) + { GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh)); constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1; char dumpBuffer[bufferSize]; - for (size_t i = 0; i < sizeof(*rdh); i++) { + for (size_t i = 0; i < sizeof(*rdh); i++) + { // "%02X " guaranteed to be 3 chars + ending 0. - snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)rdh)[i]); } GPUAlarm("RDH of page: %s", dumpBuffer); - for (size_t i = 0; i < sizeof(*hdr); i++) { + for (size_t i = 0; i < sizeof(*hdr); i++) + { // "%02X " guaranteed to be 3 chars + ending 0. 
- snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)hdr)[i]); } GPUAlarm("Metainfo of page: %s", dumpBuffer); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) + { mCFContext->abandonTimeframe = true; return {0, 0}; - } else { + } + else + { GPUFatal("Cannot process with invalid TPC ZS data, exiting"); } } - if (GetProcessingSettings().param.tpcTriggerHandling) { - const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) { - const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; + if (GetProcessingSettings().param.tpcTriggerHandling) + { + const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) + { + const char *triggerWord = (const char *)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; o2::tpc::TriggerInfoDLBZS tmp; - memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); + memcpy((void *)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh); - if (tmp.triggerWord.isValid(0)) { + if (tmp.triggerWord.isValid(0)) + { mTriggerBuffer->triggers.emplace(tmp); } } @@ -245,28 +289,37 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint endpointAdcSamples[j] += hdr->nADCsamples; uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN; uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { - const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) + { + const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) + { maxTimeBin += 256; } } - if (maxTimeBin > mCFContext->tpcMaxTimeBin) { + if (maxTimeBin > mCFContext->tpcMaxTimeBin) + { mCFContext->tpcMaxTimeBin = maxTimeBin; } bool extendsInNextPage = false; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { - if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) { - const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) + { + if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) + { + const o2::header::RAWDataHeader *nextrdh = (const o2::header::RAWDataHeader *)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader); } } - while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) { + while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) + { firstPossibleFragment--; } - auto handleExtends = [&](uint32_t ff) { - if (fragmentExtends[ff]) { - if (doGPU) { + auto handleExtends = [&](uint32_t ff) + { + if (fragmentExtends[ff]) + { + if (doGPU) + { // Only add extended page on GPU. 
On CPU the pages are in consecutive memory anyway. // Not adding the page prevents an issue where a page is decoded twice on CPU, when only the extend should be decoded. fragments[ff].second.zsPageLast++; @@ -276,39 +329,57 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragmentExtends[ff] = false; } }; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { - for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) + { + for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) + { handleExtends(ff); } } - for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) { - if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) { - if (!fragments[f].second.hasData) { + for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) + { + if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) + { + if (!fragments[f].second.hasData) + { fragments[f].second.hasData = 1; fragments[f].second.zsPtrFirst = k; fragments[f].second.zsPageFirst = l; - } else { - if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) { + } + else + { + if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) + { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1; - for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) { - for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) { - if (doGPU) { + for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) + { + for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) + { + if (doGPU) + { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); - } else { + } + else + { // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters - const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp; - if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) { - const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); + const uint8_t *const pageTmp = ((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader *rdhTmp = (const o2::header::RAWDataHeader *)pageTmp; + if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) + { + const TPCZSHDR *const hdrTmp = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? 
(pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples; } } } } - } else if (emptyPages) { + } + else if (emptyPages) + { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages; - if (doGPU) { - for (uint32_t m = 0; m < emptyPages; m++) { + if (doGPU) + { + for (uint32_t m = 0; m < emptyPages; m++) + { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); } } @@ -319,20 +390,28 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragments[f].second.pageCounter = pageCounter; mCFContext->fragmentData[f].nPages[iSector][j]++; mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples; - if (doGPU) { + if (doGPU) + { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples); } fragmentExtends[f] = extendsInNextPage; - } else { + } + else + { handleExtends(f); - if (timeBin < (uint32_t)fragments[f].first.last()) { - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { - for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) { + if (timeBin < (uint32_t)fragments[f].first.last()) + { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) + { + for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) + { handleExtends(ff); } } break; - } else { + } + else + { firstPossibleFragment = f + 1; } } @@ -340,7 +419,8 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint emptyPages = 0; } } - for (uint32_t f = 0; f < mCFContext->nFragments; f++) { + for (uint32_t f = 0; f < mCFContext->nFragments; f++) + { mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast; @@ -351,16 +431,20 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint mCFContext->nPagesSector[iSector] = nPages; mCFContext->nDigitsEndpointMax[iSector] = 0; - for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) { - if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) { + for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) + { + if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) + { mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i]; } } uint32_t nDigitsFragmentMax = 0; - for (uint32_t i = 0; i < mCFContext->nFragments; i++) { + for (uint32_t i = 0; i < mCFContext->nFragments; i++) + { uint32_t pagesInFragment = 0; uint32_t digitsInFragment = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) + { pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j]; digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j]; } @@ -371,29 +455,36 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint return {nDigits, nDigitsFragmentMax}; } -void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane) +void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clusterer, GPUTPCClusterFinder &clustererShadow, int32_t stage, bool doGPU, int32_t lane) { - auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; - auto& out = stage ? 
clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; - if (doGPU) { + auto &in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; + auto &out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; + if (doGPU) + { const uint32_t iSector = clusterer.mISector; - auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + auto &count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; std::vector counts; uint32_t nSteps = clusterer.getNSteps(count); - if (nSteps > clusterer.mNBufs) { + if (nSteps > clusterer.mNBufs) + { GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs); exit(1); } size_t tmpCount = count; - if (nSteps > 1) { - for (uint32_t i = 1; i < nSteps; i++) { + if (nSteps > 1) + { + for (uint32_t i = 1; i < nSteps; i++) + { counts.push_back(tmpCount); - if (i == 1) { + if (i == 1) + { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage); - } else { + } + else + { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount); } tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize; @@ -401,19 +492,24 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount); - for (uint32_t i = nSteps - 1; i > 1; i--) { + for (uint32_t i = nSteps - 1; i > 1; i--) + { tmpCount = counts[i - 1]; runKernel({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount); } } runKernel({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out); - } else { - auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; - auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + } + else + { + auto &nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; + auto &nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; size_t count = 0; - for (size_t i = 0; i < nIn; i++) { - if (clusterer.mPisPeak[i]) { + for (size_t i = 0; i < nIn; i++) + { + if (clusterer.mPisPeak[i]) + { out[count++] = in[i]; } } @@ -421,28 +517,33 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clust } } -std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane) +std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment &fragment, int32_t lane) { bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (mCFContext->abandonTimeframe) { + if (mCFContext->abandonTimeframe) + { return {0, 0}; } - const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); - if (doGPU) { - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + const auto &retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + if (doGPU) + { + GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder &clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; uint32_t nPagesSector = 0; - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) + { uint32_t nPages = 0; mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j]; mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { + for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) + { const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - if (max > min) { - char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; - char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast); + if (max > min) + { + char *src = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; + char *ptrLast = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; + size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader *)ptrLast); GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true); } nPages += max - min; @@ -460,8 +561,10 @@ std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - if (restorePointers) { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + if (restorePointers) + { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost; processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice; processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice; @@ -469,9 +572,10 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave; return 0; } - const auto& threadContext = GetThreadContext(); + const auto &threadContext = GetThreadContext(); mRec->MemoryScalers()->nTPCdigits = 0; - if (mCFContext == nullptr) { + if (mCFContext == nullptr) + { mCFContext.reset(new GPUTPCCFChainContext); } const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen; @@ -479,86 +583,114 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mCFContext->tpcMaxTimeBin = maxAllowedTimebin; const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen}; mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax); - if (GetProcessingSettings().param.tpcTriggerHandling) { + if (GetProcessingSettings().param.tpcTriggerHandling) + { mTriggerBuffer->triggers.clear(); } - if (mIOPtrs.tpcZS) { + if (mIOPtrs.tpcZS) + { uint32_t 
nDigitsFragmentMax[NSECTORS]; mCFContext->zsVersion = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - if (mIOPtrs.tpcZS->sector[iSector].count[0]) { - const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; - if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { + if (mIOPtrs.tpcZS->sector[iSector].count[0]) + { + const void *rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; + if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) + { GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion()); return 1; } } #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) { - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { - Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) + { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) + { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) + { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) + { + Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } } #endif - const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); + const auto &x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); nDigitsFragmentMax[iSector] = x.first; processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first; mRec->MemoryScalers()->nTPCdigits += x.first; } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { uint32_t nDigitsBase = nDigitsFragmentMax[iSector]; uint32_t threshold = 40000000; uint32_t nDigitsScaled = nDigitsBase > threshold ? 
nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase); processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); - if (doGPU) { + if (doGPU) + { processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) + { mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec); mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec); - } else { + } + else + { AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId); - AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); + AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); } } - } else { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + } + else + { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]; mRec->MemoryScalers()->nTPCdigits += nDigits; processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0); } } - if (mIOPtrs.tpcZS) { + if (mIOPtrs.tpcZS) + { GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits); - } else { + } + else + { GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits); } - if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) { + if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) + { GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) + { mCFContext->abandonTimeframe = true; mCFContext->tpcMaxTimeBin = maxAllowedTimebin; - } else { + } + else + { return 1; } } mCFContext->fragmentFirst = CfFragment{std::max(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen}; - for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { + for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) + { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) + { mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector); } } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) + { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs; @@ -570,55 
+702,66 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) { - if (param().rec.fwdTPCDigitsAsClusters) { + if (param().rec.fwdTPCDigitsAsClusters) + { return ForwardTPCDigits(); } #ifdef GPUCA_TPC_GEOMETRY_O2 int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin; mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST")); - const auto& threadContext = GetThreadContext(); + const auto &threadContext = GetThreadContext(); const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { + if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) + { return 1; } - if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { + if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) + { mRec->SetNActiveThreads(mRec->MemoryScalers()->nTPCdigits / 6000); } mRec->MemoryScalers()->nTPCHits = mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits); float tpcHitLowOccupancyScalingFactor = 1.f; - if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) { + if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) + { uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits; uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF; - if (mIOPtrs.settingsTF->nHBFPerTF < 64) { + if (mIOPtrs.settingsTF->nHBFPerTF < 64) + { threshold *= 2; } mRec->MemoryScalers()->nTPCHits = std::max(nHitsBase, std::min(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating exceiive clusters - if (nHitsBase < threshold) { + if (nHitsBase < threshold) + { float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 
2.25f : 1.75f); mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase); tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase); } } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes } mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) + { RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } #ifdef GPUCA_HAS_ONNX uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) + { + GPUTPCNNClusterizer &clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer &nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -630,9 +773,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerTotalClusters = maxClusters; clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (nn_settings.nnClusterizerVerbosity < 0) { + if (nn_settings.nnClusterizerVerbosity < 0) + { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { + } + else + { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; @@ -641,46 +787,55 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #endif - if (doGPU && mIOPtrs.tpcZS) { + if (doGPU && mIOPtrs.tpcZS) + { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)&processors()->ioPtrs - (char *)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); } - if (doGPU) { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + if (doGPU) + { + 
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)processors()->tpcClusterer - (char *)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } size_t nClsTotal = 0; - ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); - ClusterNative* tmpNativeClusters = nullptr; + ClusterNativeAccess *tmpNativeAccess = mClusterNativeAccess.get(); + ClusterNative *tmpNativeClusters = nullptr; std::unique_ptr tmpNativeClusterBuffer; // setup MC Labels bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; - auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; + auto *digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; - if (buildNativeGPU) { + if (buildNativeGPU) + { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); } - if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) { - if (mWaitForFinalInputs) { + if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) + { + if (mWaitForFinalInputs) + { GPUFatal("Cannot use waitForFinalInput callback without delayed output"); } - if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) { + if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) + { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); tmpNativeClusters = mInputsHost->mPclusterNativeOutput; - } else { + } + else + { tmpNativeClusterBuffer = std::make_unique(mInputsHost->mNClusterNative); tmpNativeClusters = tmpNativeClusterBuffer.get(); } } GPUTPCLinearLabels mcLinearLabels; - if (propagateMCLabels) { + if (propagateMCLabels) + { // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer anyway mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2); mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits); @@ -689,8 +844,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int8_t transferRunning[NSECTORS] = {0}; uint32_t outputQueueStart = mOutputQueue.size(); - auto notifyForeignChainFinished = [this]() { - if (mPipelineNotifyCtx) { + auto notifyForeignChainFinished = [this]() + { + if (mPipelineNotifyCtx) + { SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory { std::lock_guard lock(mPipelineNotifyCtx->mutex); @@ -701,118 +858,144 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) }; bool synchronizeCalibUpdate = false; - for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) { + for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) + { std::vector laneHasData(GetProcessingSettings().nTPCClustererLanes, false); 
static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors"); const int32_t maxLane = std::min(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase); - for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) { - if (GetProcessingSettings().debugLevel >= 3) { + for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) + { + if (GetProcessingSettings().debugLevel >= 3) + { GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1); } - mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { - if (doGPU && fragment.index != 0) { - SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished - } - - uint32_t iSector = iSectorBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; - clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; - clusterer.mPmemory->fragment = fragment; - - if (mIOPtrs.tpcPackedDigits) { - bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; - bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto* inDigits = mIOPtrs.tpcPackedDigits; - size_t numDigits = inDigits->nTPCDigits[iSector]; - if (setDigitsOnGPU) { - GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); - } - if (setDigitsOnHost) { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) - } - clusterer.mPmemory->counters.nDigits = numDigits; - } - - if (mIOPtrs.tpcZS) { - if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { - clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; - clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; - } else { - clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; - } - } - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - - using ChargeMapType = decltype(*clustererShadow.mPchargeMap); - using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
- runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) { - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); - } - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - - if (doGPU) { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); - SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); - } - SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory - } - - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) { - clusterer.mPmemory->counters.nPositions = 0; - return; - } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) { - clusterer.mPmemory->counters.nPositions = 0; - return; - } - - if (propagateMCLabels && fragment.index == 0) { - clusterer.PrepareMC(); - clusterer.mPinputLabels = digitsMC->v[iSector]; - if (clusterer.mPinputLabels == nullptr) { - GPUFatal("MC label container missing, sector %d", iSector); - } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) { - GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); - } - } - - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) { - if (not mIOPtrs.tpcZS) { - runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } else if (propagateMCLabels) { - runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - } - - if (mIOPtrs.tpcZS) { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); - uint32_t nBlocks = doGPU ? 
clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; - - (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - } - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } // clang-format off + mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) + { + if (doGPU && fragment.index != 0) + { + SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished + } + + uint32_t iSector = iSectorBase + lane; + GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder &clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; + clusterer.mPmemory->fragment = fragment; + + if (mIOPtrs.tpcPackedDigits) + { + bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; + bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; + auto *inDigits = mIOPtrs.tpcPackedDigits; + size_t numDigits = inDigits->nTPCDigits[iSector]; + if (setDigitsOnGPU) + { + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); + } + if (setDigitsOnHost) + { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) + } + clusterer.mPmemory->counters.nDigits = numDigits; + } + + if (mIOPtrs.tpcZS) + { + if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) + { + clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; + clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; + } + else + { + clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; + } + } + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + + using ChargeMapType = decltype(*clustererShadow.mPchargeMap); + using PeakMapType = decltype(*clustererShadow.mPpeakMap); + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
+ runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); + if (fragment.index == 0) + { + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); + } + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); + + if (doGPU) + { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) + { + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); + SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + } + SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory + } + + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) + { + clusterer.mPmemory->counters.nPositions = 0; + return; + } + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) + { + clusterer.mPmemory->counters.nPositions = 0; + return; + } + + if (propagateMCLabels && fragment.index == 0) + { + clusterer.PrepareMC(); + clusterer.mPinputLabels = digitsMC->v[iSector]; + if (clusterer.mPinputLabels == nullptr) + { + GPUFatal("MC label container missing, sector %d", iSector); + } + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) + { + GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + } + } + + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) + { + if (not mIOPtrs.tpcZS) + { + runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + else if (propagateMCLabels) + { + runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + } + + if (mIOPtrs.tpcZS) + { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); + uint32_t nBlocks = doGPU ? 
clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; + + (void)tpcTimeBinCut; // TODO: To be used in decoding kernels + switch (mCFContext->zsVersion) + { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + } + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } // clang-format off }); mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { uint32_t iSector = iSectorBase + lane; From 85d185ebad77a292ed87e7b9275322c15f0bf800 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 00:54:00 +0100 Subject: [PATCH 71/77] Some weird formatting --- .../Global/GPUChainTrackingClusterizer.cxx | 252 +++++++++--------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 5a7a6a9514e05..626137d821e10 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -870,132 +870,132 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1); } mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) - { - if (doGPU && fragment.index != 0) - { - SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished - } - - uint32_t iSector = iSectorBase + lane; - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder &clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; - clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; - clusterer.mPmemory->fragment = fragment; - - if (mIOPtrs.tpcPackedDigits) - { - bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; - bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto *inDigits = mIOPtrs.tpcPackedDigits; - size_t numDigits = inDigits->nTPCDigits[iSector]; - if (setDigitsOnGPU) - { - GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); - } - if (setDigitsOnHost) - { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) - } - clusterer.mPmemory->counters.nDigits = numDigits; - } - - if (mIOPtrs.tpcZS) - { - if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { - clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; - clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; - } - else - { - clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; - } - } - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - - using ChargeMapType = decltype(*clustererShadow.mPchargeMap); - using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) - { - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); - } - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - - if (doGPU) - { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); - SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); - } - SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory - } - - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) - { - clusterer.mPmemory->counters.nPositions = 0; - return; - } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) - { - clusterer.mPmemory->counters.nPositions = 0; - return; - } - - if (propagateMCLabels && fragment.index == 0) - { - clusterer.PrepareMC(); - clusterer.mPinputLabels = digitsMC->v[iSector]; - if (clusterer.mPinputLabels == nullptr) - { - GPUFatal("MC label container missing, sector %d", iSector); - } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) - { - GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); - } - } - - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == 
(int32_t)iSector) - { - if (not mIOPtrs.tpcZS) - { - runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - else if (propagateMCLabels) - { - runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - } - - if (mIOPtrs.tpcZS) - { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); - uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; - - (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) - { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - } - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } // clang-format off + { + if (doGPU && fragment.index != 0) + { + SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished + } + + uint32_t iSector = iSectorBase + lane; + GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder &clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; + clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; + clusterer.mPmemory->fragment = fragment; + + if (mIOPtrs.tpcPackedDigits) + { + bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; + bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; + auto *inDigits = mIOPtrs.tpcPackedDigits; + size_t numDigits = inDigits->nTPCDigits[iSector]; + if (setDigitsOnGPU) + { + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); + } + if (setDigitsOnHost) + { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) + } + clusterer.mPmemory->counters.nDigits = numDigits; + } + + if (mIOPtrs.tpcZS) + { + if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) + { + clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; + clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; + } + else + { + clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; + } + } + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + + using ChargeMapType = decltype(*clustererShadow.mPchargeMap); + using PeakMapType = decltype(*clustererShadow.mPpeakMap); + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); + if (fragment.index == 0) + { + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); + } + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); + + if (doGPU) + { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) + { + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); + SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + } + SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory + } + + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) + { + clusterer.mPmemory->counters.nPositions = 0; + return; + } + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) + { + clusterer.mPmemory->counters.nPositions = 0; + return; + } + + if (propagateMCLabels && fragment.index == 0) + { + clusterer.PrepareMC(); + clusterer.mPinputLabels = digitsMC->v[iSector]; + if (clusterer.mPinputLabels == nullptr) + { + GPUFatal("MC label container missing, sector %d", iSector); + } + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) + { + GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + } + } + + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == 
(int32_t)iSector) + { + if (not mIOPtrs.tpcZS) + { + runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + else if (propagateMCLabels) + { + runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + } + + if (mIOPtrs.tpcZS) + { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); + uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; + + (void)tpcTimeBinCut; // TODO: To be used in decoding kernels + switch (mCFContext->zsVersion) + { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + } + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } // clang-format off }); mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { uint32_t iSector = iSectorBase + lane; From 49352ab6b891da405199d75a17051712b25a3b75 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 13 Mar 2025 23:54:37 +0000 Subject: [PATCH 72/77] Please consider the following formatting changes --- .../Global/GPUChainTrackingClusterizer.cxx | 658 +++++++----------- 1 file changed, 238 insertions(+), 420 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 626137d821e10..e8eb436d6ef5c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -50,88 +50,72 @@ using namespace o2::tpc::constants; using namespace o2::dataformats; #ifdef GPUCA_TPC_GEOMETRY_O2 -std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment &fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCountUpdate(uint32_t iSector, const CfFragment& fragment) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder::ZSOffset *o = processors()->tpcClusterer[iSector].mPzsOffsets; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder::ZSOffset* o = processors()->tpcClusterer[iSector].mPzsOffsets; uint32_t digits = 0; uint32_t pages = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { clusterer.mMinMaxCN[j] = mCFContext->fragmentData[fragment.index].minMaxCN[iSector][j]; - if (doGPU) - { + if (doGPU) { uint16_t posInEndpoint = 0; uint16_t pagesEndpoint = 0; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) - { + for (uint32_t k = 
clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { const uint32_t pageFirst = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t pageLast = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = pageFirst; l < pageLast; l++) - { + for (uint32_t l = pageFirst; l < pageLast; l++) { uint16_t pageDigits = mCFContext->fragmentData[fragment.index].pageDigits[iSector][j][posInEndpoint++]; - if (pageDigits) - { + if (pageDigits) { *(o++) = GPUTPCClusterFinder::ZSOffset{digits, j, pagesEndpoint}; digits += pageDigits; } pagesEndpoint++; } } - if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) - { - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (pagesEndpoint != mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()) { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { GPUError("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); return {0, 0}; - } - else - { + } else { GPUFatal("TPC raw page count mismatch in TPCClusterizerDecodeZSCountUpdate: expected %d / buffered %lu", pagesEndpoint, mCFContext->fragmentData[fragment.index].pageDigits[iSector][j].size()); } } - } - else - { + } else { clusterer.mPzsOffsets[j] = GPUTPCClusterFinder::ZSOffset{digits, j, 0}; digits += mCFContext->fragmentData[fragment.index].nDigits[iSector][j]; pages += mCFContext->fragmentData[fragment.index].nPages[iSector][j]; } } - if (doGPU) - { + if (doGPU) { pages = o - processors()->tpcClusterer[iSector].mPzsOffsets; } - if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { + if (!doGPU && GetProcessingSettings().debugLevel >= 4 && mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { TPCClusterizerEnsureZSOffsets(iSector, fragment); } return {digits, pages}; } -void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment &fragment) +void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfFragment& fragment) { - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; uint32_t nAdcs = 0; - for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) - { - const auto &data = mCFContext->fragmentData[fragment.index]; + for (uint16_t endpoint = 0; endpoint < GPUTrackingInOutZS::NENDPOINTS; endpoint++) { + const auto& data = mCFContext->fragmentData[fragment.index]; uint32_t pagesEndpoint = 0; const uint32_t nAdcsExpected = data.nDigits[iSector][endpoint]; const uint32_t nPagesExpected = data.nPages[iSector][endpoint]; uint32_t nAdcDecoded = 0; - const auto &zs = mIOPtrs.tpcZS->sector[iSector]; - for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) - { + const auto& zs = mIOPtrs.tpcZS->sector[iSector]; + for (uint32_t i = data.minMaxCN[iSector][endpoint].zsPtrFirst; i < data.minMaxCN[iSector][endpoint].zsPtrLast; i++) { const uint32_t pageFirst = (i == data.minMaxCN[iSector][endpoint].zsPtrFirst) ? data.minMaxCN[iSector][endpoint].zsPageFirst : 0; const uint32_t pageLast = (i + 1 == data.minMaxCN[iSector][endpoint].zsPtrLast) ? 
data.minMaxCN[iSector][endpoint].zsPageLast : zs.nZSPtr[endpoint][i]; - for (uint32_t j = pageFirst; j < pageLast; j++) - { - const uint8_t *page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const header::RAWDataHeader *rawDataHeader = reinterpret_cast(page); - const TPCZSHDRV2 *decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); + for (uint32_t j = pageFirst; j < pageLast; j++) { + const uint8_t* page = static_cast(zs.zsPtr[endpoint][i]) + j * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const header::RAWDataHeader* rawDataHeader = reinterpret_cast(page); + const TPCZSHDRV2* decHdr = reinterpret_cast(page + raw::RDHUtils::getMemorySize(*rawDataHeader) - sizeof(TPCZSHDRV2)); const uint16_t nSamplesInPage = decHdr->nADCsamples; nAdcDecoded += nSamplesInPage; @@ -139,18 +123,15 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF } } - if (pagesEndpoint != nPagesExpected) - { + if (pagesEndpoint != nPagesExpected) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC raw page count mismatch: expected %d / buffered %lu", iSector, endpoint, fragment.index, pagesEndpoint, nPagesExpected); } - if (nAdcDecoded != nAdcsExpected) - { + if (nAdcDecoded != nAdcsExpected) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC count mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcsExpected, nAdcDecoded); } - if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) - { + if (nAdcs != clusterer.mPzsOffsets[endpoint].offset) { GPUFatal("Sector %d, Endpoint %d, Fragment %d: TPC ADC offset mismatch: expected %u, buffered %u", iSector, endpoint, fragment.index, nAdcs, clusterer.mPzsOffsets[endpoint].offset); } @@ -160,13 +141,12 @@ void GPUChainTracking::TPCClusterizerEnsureZSOffsets(uint32_t iSector, const CfF namespace { - struct TPCCFDecodeScanTmp - { - int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; - }; +struct TPCCFDecodeScanTmp { + int32_t zsPtrFirst, zsPageFirst, zsPtrLast, zsPageLast, hasData, pageCounter; +}; } // namespace -std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment &fragment) +std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint32_t iSector, const CfFragment& fragment) { mRec->getGeneralStepTimer(GeneralStep::Prepare).Start(); uint32_t nDigits = 0; @@ -174,20 +154,16 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS]; memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples)); bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? 
o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) - { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) - { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) - { - Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchMid(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 3 && j < GPUTrackingInOutZS::NENDPOINTS - 1) { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j + 1]; k++) { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j + 1][k]; l++) { + Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchMid(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j + 1][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } @@ -196,8 +172,7 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint std::vector> fragments; fragments.reserve(mCFContext->nFragments); fragments.emplace_back(std::pair{fragment, {0, 0, 0, 0, 0, -1}}); - for (uint32_t i = 1; i < mCFContext->nFragments; i++) - { + for (uint32_t i = 1; i < mCFContext->nFragments; i++) { fragments.emplace_back(std::pair{fragments.back().first.next(), {0, 0, 0, 0, 0, -1}}); } std::vector fragmentExtends(mCFContext->nFragments, false); @@ -205,82 +180,64 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t firstPossibleFragment = 0; uint32_t pageCounter = 0; uint32_t emptyPages = 0; - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) - { - if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) - { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { + if (GetProcessingSettings().tpcSingleSector != -1 && GetProcessingSettings().tpcSingleSector != (int32_t)iSector) { break; } nPages += mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) - { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) - { - Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchForOneRead(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 2 && l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]) { + Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchForOneRead(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + (l + 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } #endif - const uint8_t *const page = 
((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader *rdh = (const o2::header::RAWDataHeader *)page; - if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) - { + const uint8_t* const page = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader* rdh = (const o2::header::RAWDataHeader*)page; + if (o2::raw::RDHUtils::getMemorySize(*rdh) == sizeof(o2::header::RAWDataHeader)) { emptyPages++; continue; } pageCounter++; - const TPCZSHDR *const hdr = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); - if (mCFContext->zsVersion == -1) - { + const TPCZSHDR* const hdr = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdh)) == rdh_utils::DLBZSLinkID ? (page + o2::raw::RDHUtils::getMemorySize(*rdh) - sizeof(TPCZSHDRV2)) : (page + sizeof(o2::header::RAWDataHeader))); + if (mCFContext->zsVersion == -1) { mCFContext->zsVersion = hdr->version; - if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) - { // TODO: Move tpcTriggerHandling to recoSteps bitmask + if (GetProcessingSettings().param.tpcTriggerHandling && mCFContext->zsVersion < ZSVersion::ZSVersionDenseLinkBased) { // TODO: Move tpcTriggerHandling to recoSteps bitmask static bool errorShown = false; - if (errorShown == false) - { + if (errorShown == false) { GPUAlarm("Trigger handling only possible with TPC Dense Link Based data, received version %d, disabling", mCFContext->zsVersion); } errorShown = true; } - } - else if (mCFContext->zsVersion != (int32_t)hdr->version) - { + } else if (mCFContext->zsVersion != (int32_t)hdr->version) { GPUError("Received TPC ZS 8kb page of mixed versions, expected %d, received %d (linkid %d, feeCRU %d, feeEndpoint %d, feelinkid %d)", mCFContext->zsVersion, (int32_t)hdr->version, (int32_t)o2::raw::RDHUtils::getLinkID(*rdh), (int32_t)rdh_utils::getCRU(*rdh), (int32_t)rdh_utils::getEndPoint(*rdh), (int32_t)rdh_utils::getLink(*rdh)); constexpr size_t bufferSize = 3 * std::max(sizeof(*rdh), sizeof(*hdr)) + 1; char dumpBuffer[bufferSize]; - for (size_t i = 0; i < sizeof(*rdh); i++) - { + for (size_t i = 0; i < sizeof(*rdh); i++) { // "%02X " guaranteed to be 3 chars + ending 0. - snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)rdh)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)rdh)[i]); } GPUAlarm("RDH of page: %s", dumpBuffer); - for (size_t i = 0; i < sizeof(*hdr); i++) - { + for (size_t i = 0; i < sizeof(*hdr); i++) { // "%02X " guaranteed to be 3 chars + ending 0. 
- snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t *)hdr)[i]); + snprintf(dumpBuffer + 3 * i, 4, "%02X ", (int32_t)((uint8_t*)hdr)[i]); } GPUAlarm("Metainfo of page: %s", dumpBuffer); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { mCFContext->abandonTimeframe = true; return {0, 0}; - } - else - { + } else { GPUFatal("Cannot process with invalid TPC ZS data, exiting"); } } - if (GetProcessingSettings().param.tpcTriggerHandling) - { - const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) - { - const char *triggerWord = (const char *)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; + if (GetProcessingSettings().param.tpcTriggerHandling) { + const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::TriggerWordPresent) { + const char* triggerWord = (const char*)hdr - TPCZSHDRV2::TRIGGER_WORD_SIZE; o2::tpc::TriggerInfoDLBZS tmp; - memcpy((void *)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); + memcpy((void*)&tmp.triggerWord, triggerWord, TPCZSHDRV2::TRIGGER_WORD_SIZE); tmp.orbit = o2::raw::RDHUtils::getHeartBeatOrbit(*rdh); - if (tmp.triggerWord.isValid(0)) - { + if (tmp.triggerWord.isValid(0)) { mTriggerBuffer->triggers.emplace(tmp); } } @@ -289,37 +246,28 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint endpointAdcSamples[j] += hdr->nADCsamples; uint32_t timeBin = (hdr->timeOffset + (o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) - firstHBF) * o2::constants::lhc::LHCMaxBunches) / LHCBCPERTIMEBIN; uint32_t maxTimeBin = timeBin + hdr->nTimeBinSpan; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - const TPCZSHDRV2 *const hdr2 = (const TPCZSHDRV2 *)hdr; - if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) - { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + const TPCZSHDRV2* const hdr2 = (const TPCZSHDRV2*)hdr; + if (hdr2->flags & TPCZSHDRV2::ZSFlags::nTimeBinSpanBit8) { maxTimeBin += 256; } } - if (maxTimeBin > mCFContext->tpcMaxTimeBin) - { + if (maxTimeBin > mCFContext->tpcMaxTimeBin) { mCFContext->tpcMaxTimeBin = maxTimeBin; } bool extendsInNextPage = false; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) - { - const o2::header::RAWDataHeader *nextrdh = (const o2::header::RAWDataHeader *)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + if (l + 1 < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k] && o2::raw::RDHUtils::getMemorySize(*rdh) == TPCZSHDR::TPC_ZS_PAGE_SIZE) { + const o2::header::RAWDataHeader* nextrdh = (const o2::header::RAWDataHeader*)(page + TPCZSHDR::TPC_ZS_PAGE_SIZE); extendsInNextPage = o2::raw::RDHUtils::getHeartBeatOrbit(*nextrdh) == o2::raw::RDHUtils::getHeartBeatOrbit(*rdh) && o2::raw::RDHUtils::getMemorySize(*nextrdh) > sizeof(o2::header::RAWDataHeader); } } - while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) - { + while (firstPossibleFragment && (uint32_t)fragments[firstPossibleFragment - 1].first.last() > timeBin) { firstPossibleFragment--; } - auto handleExtends = [&](uint32_t ff) - { - if (fragmentExtends[ff]) - { - if (doGPU) - { + auto handleExtends = [&](uint32_t ff) { + if (fragmentExtends[ff]) { + if (doGPU) { // Only add extended page on GPU. 
On CPU the pages are in consecutive memory anyway. // Not adding the page prevents an issue where a page is decoded twice on CPU, when only the extend should be decoded. fragments[ff].second.zsPageLast++; @@ -329,57 +277,39 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragmentExtends[ff] = false; } }; - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) - { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + for (uint32_t ff = 0; ff < firstPossibleFragment; ff++) { handleExtends(ff); } } - for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) - { - if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) - { - if (!fragments[f].second.hasData) - { + for (uint32_t f = firstPossibleFragment; f < mCFContext->nFragments; f++) { + if (timeBin < (uint32_t)fragments[f].first.last() && (uint32_t)fragments[f].first.first() <= maxTimeBin) { + if (!fragments[f].second.hasData) { fragments[f].second.hasData = 1; fragments[f].second.zsPtrFirst = k; fragments[f].second.zsPageFirst = l; - } - else - { - if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) - { + } else { + if (pageCounter > (uint32_t)fragments[f].second.pageCounter + 1) { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages + pageCounter - fragments[f].second.pageCounter - 1; - for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) - { - for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) - { - if (doGPU) - { + for (uint32_t k2 = fragments[f].second.zsPtrLast - 1; k2 <= k; k2++) { + for (uint32_t l2 = ((int32_t)k2 == fragments[f].second.zsPtrLast - 1) ? fragments[f].second.zsPageLast : 0; l2 < (k2 < k ? mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k2] : l); l2++) { + if (doGPU) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); - } - else - { + } else { // CPU cannot skip unneeded pages, so we must keep space to store the invalid dummy clusters - const uint8_t *const pageTmp = ((const uint8_t *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; - const o2::header::RAWDataHeader *rdhTmp = (const o2::header::RAWDataHeader *)pageTmp; - if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) - { - const TPCZSHDR *const hdrTmp = (const TPCZSHDR *)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? (pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); + const uint8_t* const pageTmp = ((const uint8_t*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k2]) + l2 * TPCZSHDR::TPC_ZS_PAGE_SIZE; + const o2::header::RAWDataHeader* rdhTmp = (const o2::header::RAWDataHeader*)pageTmp; + if (o2::raw::RDHUtils::getMemorySize(*rdhTmp) != sizeof(o2::header::RAWDataHeader)) { + const TPCZSHDR* const hdrTmp = (const TPCZSHDR*)(rdh_utils::getLink(o2::raw::RDHUtils::getFEEID(*rdhTmp)) == rdh_utils::DLBZSLinkID ? 
(pageTmp + o2::raw::RDHUtils::getMemorySize(*rdhTmp) - sizeof(TPCZSHDRV2)) : (pageTmp + sizeof(o2::header::RAWDataHeader))); mCFContext->fragmentData[f].nDigits[iSector][j] += hdrTmp->nADCsamples; } } } } - } - else if (emptyPages) - { + } else if (emptyPages) { mCFContext->fragmentData[f].nPages[iSector][j] += emptyPages; - if (doGPU) - { - for (uint32_t m = 0; m < emptyPages; m++) - { + if (doGPU) { + for (uint32_t m = 0; m < emptyPages; m++) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(0); } } @@ -390,28 +320,20 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint fragments[f].second.pageCounter = pageCounter; mCFContext->fragmentData[f].nPages[iSector][j]++; mCFContext->fragmentData[f].nDigits[iSector][j] += hdr->nADCsamples; - if (doGPU) - { + if (doGPU) { mCFContext->fragmentData[f].pageDigits[iSector][j].emplace_back(hdr->nADCsamples); } fragmentExtends[f] = extendsInNextPage; - } - else - { + } else { handleExtends(f); - if (timeBin < (uint32_t)fragments[f].first.last()) - { - if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) - { - for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) - { + if (timeBin < (uint32_t)fragments[f].first.last()) { + if (mCFContext->zsVersion >= ZSVersion::ZSVersionDenseLinkBased) { + for (uint32_t ff = f + 1; ff < mCFContext->nFragments; ff++) { handleExtends(ff); } } break; - } - else - { + } else { firstPossibleFragment = f + 1; } } @@ -419,8 +341,7 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint emptyPages = 0; } } - for (uint32_t f = 0; f < mCFContext->nFragments; f++) - { + for (uint32_t f = 0; f < mCFContext->nFragments; f++) { mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrLast = fragments[f].second.zsPtrLast; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPtrFirst = fragments[f].second.zsPtrFirst; mCFContext->fragmentData[f].minMaxCN[iSector][j].zsPageLast = fragments[f].second.zsPageLast; @@ -431,20 +352,16 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint mCFContext->nPagesSector[iSector] = nPages; mCFContext->nDigitsEndpointMax[iSector] = 0; - for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) - { - if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) - { + for (uint32_t i = 0; i < GPUTrackingInOutZS::NENDPOINTS; i++) { + if (endpointAdcSamples[i] > mCFContext->nDigitsEndpointMax[iSector]) { mCFContext->nDigitsEndpointMax[iSector] = endpointAdcSamples[i]; } } uint32_t nDigitsFragmentMax = 0; - for (uint32_t i = 0; i < mCFContext->nFragments; i++) - { + for (uint32_t i = 0; i < mCFContext->nFragments; i++) { uint32_t pagesInFragment = 0; uint32_t digitsInFragment = 0; - for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { pagesInFragment += mCFContext->fragmentData[i].nPages[iSector][j]; digitsInFragment += mCFContext->fragmentData[i].nDigits[iSector][j]; } @@ -455,36 +372,29 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint return {nDigits, nDigitsFragmentMax}; } -void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clusterer, GPUTPCClusterFinder &clustererShadow, int32_t stage, bool doGPU, int32_t lane) +void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder& clusterer, GPUTPCClusterFinder& clustererShadow, int32_t stage, bool doGPU, int32_t lane) { - auto &in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; - auto &out = stage ? 
clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; - if (doGPU) - { + auto& in = stage ? clustererShadow.mPpeakPositions : clustererShadow.mPpositions; + auto& out = stage ? clustererShadow.mPfilteredPeakPositions : clustererShadow.mPpeakPositions; + if (doGPU) { const uint32_t iSector = clusterer.mISector; - auto &count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + auto& count = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; std::vector counts; uint32_t nSteps = clusterer.getNSteps(count); - if (nSteps > clusterer.mNBufs) - { + if (nSteps > clusterer.mNBufs) { GPUError("Clusterer buffers exceeded (%u > %u)", nSteps, (int32_t)clusterer.mNBufs); exit(1); } size_t tmpCount = count; - if (nSteps > 1) - { - for (uint32_t i = 1; i < nSteps; i++) - { + if (nSteps > 1) { + for (uint32_t i = 1; i < nSteps; i++) { counts.push_back(tmpCount); - if (i == 1) - { + if (i == 1) { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, stage); - } - else - { + } else { runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, tmpCount); } tmpCount = (tmpCount + clusterer.mScanWorkGroupSize - 1) / clusterer.mScanWorkGroupSize; @@ -492,24 +402,19 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clust runKernel({GetGrid(tmpCount, clusterer.mScanWorkGroupSize, lane), {iSector}}, nSteps, tmpCount); - for (uint32_t i = nSteps - 1; i > 1; i--) - { + for (uint32_t i = nSteps - 1; i > 1; i--) { tmpCount = counts[i - 1]; runKernel({GetGrid(tmpCount - clusterer.mScanWorkGroupSize, clusterer.mScanWorkGroupSize, lane), {iSector}}, i, clusterer.mScanWorkGroupSize, tmpCount); } } runKernel({GetGrid(count, clusterer.mScanWorkGroupSize, lane), {iSector}}, 1, stage, in, out); - } - else - { - auto &nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; - auto &nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; + } else { + auto& nOut = stage ? clusterer.mPmemory->counters.nClusters : clusterer.mPmemory->counters.nPeaks; + auto& nIn = stage ? clusterer.mPmemory->counters.nPeaks : clusterer.mPmemory->counters.nPositions; size_t count = 0; - for (size_t i = 0; i < nIn; i++) - { - if (clusterer.mPisPeak[i]) - { + for (size_t i = 0; i < nIn; i++) { + if (clusterer.mPisPeak[i]) { out[count++] = in[i]; } } @@ -517,33 +422,28 @@ void GPUChainTracking::RunTPCClusterizer_compactPeaks(GPUTPCClusterFinder &clust } } -std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment &fragment, int32_t lane) +std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int32_t iSector, const CfFragment& fragment, int32_t lane) { bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (mCFContext->abandonTimeframe) - { + if (mCFContext->abandonTimeframe) { return {0, 0}; } - const auto &retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); - if (doGPU) - { - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder &clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + const auto& retVal = TPCClusterizerDecodeZSCountUpdate(iSector, fragment); + if (doGPU) { + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSector] : clusterer; uint32_t nPagesSector = 0; - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { uint32_t nPages = 0; mInputsHost->mPzsMeta->sector[iSector].zsPtr[j] = &mInputsShadow->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j]; mInputsHost->mPzsPtrs[iSector * GPUTrackingInOutZS::NENDPOINTS + j] = clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) - { + for (uint32_t k = clusterer.mMinMaxCN[j].zsPtrFirst; k < clusterer.mMinMaxCN[j].zsPtrLast; k++) { const uint32_t min = (k == clusterer.mMinMaxCN[j].zsPtrFirst) ? clusterer.mMinMaxCN[j].zsPageFirst : 0; const uint32_t max = (k + 1 == clusterer.mMinMaxCN[j].zsPtrLast) ? clusterer.mMinMaxCN[j].zsPageLast : mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; - if (max > min) - { - char *src = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; - char *ptrLast = (char *)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; - size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader *)ptrLast); + if (max > min) { + char* src = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + min * TPCZSHDR::TPC_ZS_PAGE_SIZE; + char* ptrLast = (char*)mIOPtrs.tpcZS->sector[iSector].zsPtr[j][k] + (max - 1) * TPCZSHDR::TPC_ZS_PAGE_SIZE; + size_t size = (ptrLast - src) + o2::raw::RDHUtils::getMemorySize(*(const o2::header::RAWDataHeader*)ptrLast); GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPzs + (nPagesSector + nPages) * TPCZSHDR::TPC_ZS_PAGE_SIZE, src, size, lane, true); } nPages += max - min; @@ -561,10 +461,8 @@ std::pair GPUChainTracking::RunTPCClusterizer_transferZS(int int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) { bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - if (restorePointers) - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + if (restorePointers) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { processors()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetHost; processorsShadow()->tpcClusterer[iSector].mPzsOffsets = mCFContext->ptrSave[iSector].zsOffsetDevice; processorsShadow()->tpcClusterer[iSector].mPzs = mCFContext->ptrSave[iSector].zsDevice; @@ -572,10 +470,9 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) processorsShadow()->ioPtrs.clustersNative = mCFContext->ptrClusterNativeSave; return 0; } - const auto &threadContext = GetThreadContext(); + const auto& threadContext = GetThreadContext(); mRec->MemoryScalers()->nTPCdigits = 0; - if (mCFContext == nullptr) - { + if (mCFContext == nullptr) { mCFContext.reset(new GPUTPCCFChainContext); } const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen; @@ -583,114 +480,86 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool restorePointers) mCFContext->tpcMaxTimeBin = maxAllowedTimebin; const CfFragment fragmentMax{(tpccf::TPCTime)mCFContext->tpcMaxTimeBin + 1, maxFragmentLen}; mCFContext->prepare(mIOPtrs.tpcZS, fragmentMax); - if (GetProcessingSettings().param.tpcTriggerHandling) - { + if (GetProcessingSettings().param.tpcTriggerHandling) { mTriggerBuffer->triggers.clear(); } - if (mIOPtrs.tpcZS) - { + if (mIOPtrs.tpcZS) { uint32_t 
nDigitsFragmentMax[NSECTORS]; mCFContext->zsVersion = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { - if (mIOPtrs.tpcZS->sector[iSector].count[0]) - { - const void *rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; - if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + if (mIOPtrs.tpcZS->sector[iSector].count[0]) { + const void* rdh = mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]; + if (rdh && o2::raw::RDHUtils::getVersion() > o2::raw::RDHUtils::getVersion(rdh)) { GPUError("Data has invalid RDH version %d, %d required\n", o2::raw::RDHUtils::getVersion(rdh), o2::raw::RDHUtils::getVersion()); return 1; } } #ifndef GPUCA_NO_VC - if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) - { - for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) - { - for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) - { - for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) - { - Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); - Vc::Common::prefetchFar(((const uint8_t *)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); + if (GetProcessingSettings().prefetchTPCpageScan >= 1 && iSector < NSECTORS - 1) { + for (uint32_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { + for (uint32_t k = 0; k < mIOPtrs.tpcZS->sector[iSector].count[j]; k++) { + for (uint32_t l = 0; l < mIOPtrs.tpcZS->sector[iSector].nZSPtr[j][k]; l++) { + Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE); + Vc::Common::prefetchFar(((const uint8_t*)mIOPtrs.tpcZS->sector[iSector + 1].zsPtr[j][k]) + l * TPCZSHDR::TPC_ZS_PAGE_SIZE + sizeof(o2::header::RAWDataHeader)); } } } } #endif - const auto &x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); + const auto& x = TPCClusterizerDecodeZSCount(iSector, fragmentMax); nDigitsFragmentMax[iSector] = x.first; processors()->tpcClusterer[iSector].mPmemory->counters.nDigits = x.first; mRec->MemoryScalers()->nTPCdigits += x.first; } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { uint32_t nDigitsBase = nDigitsFragmentMax[iSector]; uint32_t threshold = 40000000; uint32_t nDigitsScaled = nDigitsBase > threshold ? 
nDigitsBase : std::min((threshold + nDigitsBase) / 2, 2 * nDigitsBase); processors()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); - if (doGPU) - { + if (doGPU) { processorsShadow()->tpcClusterer[iSector].SetNMaxDigits(processors()->tpcClusterer[iSector].mPmemory->counters.nDigits, mCFContext->nPagesFragmentMax, nDigitsScaled, mCFContext->nDigitsEndpointMax[iSector]); } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSOffsetId, mRec); mPipelineNotifyCtx->rec->AllocateRegisteredForeignMemory(processors()->tpcClusterer[iSector].mZSId, mRec); - } - else - { + } else { AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSOffsetId); AllocateRegisteredMemory(processors()->tpcClusterer[iSector].mZSId); } } - } - else - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + } else { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { uint32_t nDigits = mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]; mRec->MemoryScalers()->nTPCdigits += nDigits; processors()->tpcClusterer[iSector].SetNMaxDigits(nDigits, mCFContext->nPagesFragmentMax, nDigits, 0); } } - if (mIOPtrs.tpcZS) - { + if (mIOPtrs.tpcZS) { GPUInfo("Event has %u 8kb TPC ZS pages (version %d), %ld digits", mCFContext->nPagesTotal, mCFContext->zsVersion, (int64_t)mRec->MemoryScalers()->nTPCdigits); - } - else - { + } else { GPUInfo("Event has %ld TPC Digits", (int64_t)mRec->MemoryScalers()->nTPCdigits); } - if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) - { + if (mCFContext->tpcMaxTimeBin > maxAllowedTimebin) { GPUError("Input data has invalid time bin %u > %d", mCFContext->tpcMaxTimeBin, maxAllowedTimebin); - if (GetProcessingSettings().ignoreNonFatalGPUErrors) - { + if (GetProcessingSettings().ignoreNonFatalGPUErrors) { mCFContext->abandonTimeframe = true; mCFContext->tpcMaxTimeBin = maxAllowedTimebin; - } - else - { + } else { return 1; } } mCFContext->fragmentFirst = CfFragment{std::max(mCFContext->tpcMaxTimeBin + 1, maxFragmentLen), maxFragmentLen}; - for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) - { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + for (int32_t iSector = 0; iSector < GetProcessingSettings().nTPCClustererLanes && iSector < NSECTORS; iSector++) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { mCFContext->nextPos[iSector] = RunTPCClusterizer_transferZS(iSector, mCFContext->fragmentFirst, GetProcessingSettings().nTPCClustererLanes + iSector); } } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { mCFContext->ptrSave[iSector].zsOffsetHost = processors()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsOffsetDevice = processorsShadow()->tpcClusterer[iSector].mPzsOffsets; mCFContext->ptrSave[iSector].zsDevice = processorsShadow()->tpcClusterer[iSector].mPzs; @@ -702,66 +571,55 @@ int32_t GPUChainTracking::RunTPCClusterizer_prepare(bool 
restorePointers) int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) { - if (param().rec.fwdTPCDigitsAsClusters) - { + if (param().rec.fwdTPCDigitsAsClusters) { return ForwardTPCDigits(); } #ifdef GPUCA_TPC_GEOMETRY_O2 int32_t tpcTimeBinCut = mUpdateNewCalibObjects && mNewCalibValues->newTPCTimeBinCut ? mNewCalibValues->tpcTimeBinCut : param().tpcCutTimeBin; mRec->PushNonPersistentMemory(qStr2Tag("TPCCLUST")); - const auto &threadContext = GetThreadContext(); + const auto& threadContext = GetThreadContext(); const bool doGPU = GetRecoStepsGPU() & RecoStep::TPCClusterFinding; - if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) - { + if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { return 1; } - if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) - { + if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { mRec->SetNActiveThreads(mRec->MemoryScalers()->nTPCdigits / 6000); } mRec->MemoryScalers()->nTPCHits = mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits); float tpcHitLowOccupancyScalingFactor = 1.f; - if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) - { + if (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasNHBFPerTF) { uint32_t nHitsBase = mRec->MemoryScalers()->nTPCHits; uint32_t threshold = 30000000 / 256 * mIOPtrs.settingsTF->nHBFPerTF; - if (mIOPtrs.settingsTF->nHBFPerTF < 64) - { + if (mIOPtrs.settingsTF->nHBFPerTF < 64) { threshold *= 2; } mRec->MemoryScalers()->nTPCHits = std::max(nHitsBase, std::min(threshold, nHitsBase * 3.5f)); // Increase the buffer size for low occupancy data to compensate for noisy pads creating exceiive clusters - if (nHitsBase < threshold) - { + if (nHitsBase < threshold) { float maxFactor = mRec->MemoryScalers()->nTPCHits < threshold * 2 / 3 ? 3 : (mRec->MemoryScalers()->nTPCHits < threshold ? 
2.25f : 1.75f); mRec->MemoryScalers()->temporaryFactor *= std::min(maxFactor, (float)threshold / nHitsBase); tpcHitLowOccupancyScalingFactor = std::min(3.5f, (float)threshold / nHitsBase); } } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { processors()->tpcClusterer[iSector].SetMaxData(mIOPtrs); // First iteration to set data sizes } mRec->ComputeReuseMax(nullptr); // Resolve maximums for shared buffers - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { SetupGPUProcessor(&processors()->tpcClusterer[iSector], true); // Now we allocate } - if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) - { + if (mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer) { RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } #ifdef GPUCA_HAS_ONNX uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) - { - GPUTPCNNClusterizer &clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer &nn_settings = GetProcessingSettings().nn; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -773,12 +631,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerTotalClusters = maxClusters; clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (nn_settings.nnClusterizerVerbosity < 0) - { + if (nn_settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } - else - { + } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; @@ -787,55 +642,46 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #endif - if (doGPU && mIOPtrs.tpcZS) - { + if (doGPU && mIOPtrs.tpcZS) { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)&processors()->ioPtrs - (char *)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); } - if (doGPU) - { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char *)processors()->tpcClusterer - (char *)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + if (doGPU) { + 
WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } size_t nClsTotal = 0; - ClusterNativeAccess *tmpNativeAccess = mClusterNativeAccess.get(); - ClusterNative *tmpNativeClusters = nullptr; + ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); + ClusterNative* tmpNativeClusters = nullptr; std::unique_ptr tmpNativeClusterBuffer; // setup MC Labels bool propagateMCLabels = GetProcessingSettings().runMC && processors()->ioPtrs.tpcPackedDigits && processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC; - auto *digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; + auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr; bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU(); bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor; - if (buildNativeGPU) - { + if (buildNativeGPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer); } - if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) - { - if (mWaitForFinalInputs) - { + if (buildNativeHost && !(buildNativeGPU && GetProcessingSettings().delayedOutput)) { + if (mWaitForFinalInputs) { GPUFatal("Cannot use waitForFinalInput callback without delayed output"); } - if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) - { + if (!GetProcessingSettings().tpcApplyClusterFilterOnCPU) { AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeOutput, mSubOutputControls[GPUTrackingOutputs::getIndex(&GPUTrackingOutputs::clustersNative)]); tmpNativeClusters = mInputsHost->mPclusterNativeOutput; - } - else - { + } else { tmpNativeClusterBuffer = std::make_unique(mInputsHost->mNClusterNative); tmpNativeClusters = tmpNativeClusterBuffer.get(); } } GPUTPCLinearLabels mcLinearLabels; - if (propagateMCLabels) - { + if (propagateMCLabels) { // No need to overallocate here, nTPCHits is anyway an upper bound used for the GPU cluster buffer, and we can always enlarge the buffer anyway mcLinearLabels.header.reserve(mRec->MemoryScalers()->nTPCHits / 2); mcLinearLabels.data.reserve(mRec->MemoryScalers()->nTPCHits); @@ -844,10 +690,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int8_t transferRunning[NSECTORS] = {0}; uint32_t outputQueueStart = mOutputQueue.size(); - auto notifyForeignChainFinished = [this]() - { - if (mPipelineNotifyCtx) - { + auto notifyForeignChainFinished = [this]() { + if (mPipelineNotifyCtx) { SynchronizeStream(OutputStream()); // Must finish before updating ioPtrs in (global) constant memory { std::lock_guard lock(mPipelineNotifyCtx->mutex); @@ -858,56 +702,44 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) }; bool synchronizeCalibUpdate = false; - for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) - { + for (uint32_t iSectorBase = 0; iSectorBase < NSECTORS; iSectorBase += GetProcessingSettings().nTPCClustererLanes) { std::vector laneHasData(GetProcessingSettings().nTPCClustererLanes, false); 
static_assert(NSECTORS <= GPUCA_MAX_STREAMS, "Stream events must be able to hold all sectors"); const int32_t maxLane = std::min(GetProcessingSettings().nTPCClustererLanes, NSECTORS - iSectorBase); - for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) - { - if (GetProcessingSettings().debugLevel >= 3) - { + for (CfFragment fragment = mCFContext->fragmentFirst; !fragment.isEnd(); fragment = fragment.next()) { + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSectorBase, iSectorBase + GetProcessingSettings().nTPCClustererLanes - 1); } - mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) - { - if (doGPU && fragment.index != 0) - { + mRec->runParallelOuterLoop(doGPU, maxLane, [&](uint32_t lane) { + if (doGPU && fragment.index != 0) { SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished } uint32_t iSector = iSectorBase + lane; - GPUTPCClusterFinder &clusterer = processors()->tpcClusterer[iSector]; - GPUTPCClusterFinder &clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; clusterer.mPmemory->fragment = fragment; - if (mIOPtrs.tpcPackedDigits) - { + if (mIOPtrs.tpcPackedDigits) { bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto *inDigits = mIOPtrs.tpcPackedDigits; + auto* inDigits = mIOPtrs.tpcPackedDigits; size_t numDigits = inDigits->nTPCDigits[iSector]; - if (setDigitsOnGPU) - { + if (setDigitsOnGPU) { GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSector], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); } - if (setDigitsOnHost) - { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) + if (setDigitsOnHost) { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSector]); // TODO: Needs fixing (invalid const cast) } clusterer.mPmemory->counters.nDigits = numDigits; } - if (mIOPtrs.tpcZS) - { - if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + if (mIOPtrs.tpcZS) { + if (mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSector].first; clusterer.mPmemory->counters.nPagesSubsector = mCFContext->nextPos[iSector].second; - } - else - { + } else { clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubsector = 0; } } @@ -917,82 +749,68 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) using PeakMapType = decltype(*clustererShadow.mPpeakMap); runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) - { + if (fragment.index == 0) { runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); } DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (doGPU) - { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) - { + if (doGPU) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSector] && mCFContext->zsVersion != -1) { TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); } SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory } - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) - { + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSector] || mCFContext->zsVersion == -1)) { clusterer.mPmemory->counters.nPositions = 0; return; } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) - { + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSector] == 0) { clusterer.mPmemory->counters.nPositions = 0; return; } - if (propagateMCLabels && fragment.index == 0) - { + if (propagateMCLabels && fragment.index == 0) { clusterer.PrepareMC(); clusterer.mPinputLabels = digitsMC->v[iSector]; - if (clusterer.mPinputLabels == nullptr) - { + if (clusterer.mPinputLabels == nullptr) { GPUFatal("MC label container missing, sector %d", iSector); } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) - { + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSector]) { GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSector], (int32_t)clusterer.mPinputLabels->getIndexedSize()); } } - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) - { - if (not mIOPtrs.tpcZS) - { + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSector) { + if (not mIOPtrs.tpcZS) { runKernel({GetGrid(1, lane), {iSector}}, mIOPtrs.tpcZS == nullptr); TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - else if (propagateMCLabels) - { + } else if (propagateMCLabels) { runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, mIOPtrs.tpcZS == nullptr); TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); } } - if (mIOPtrs.tpcZS) - { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader *)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); + if (mIOPtrs.tpcZS) { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? 
mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0); uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubsector : GPUTrackingInOutZS::NENDPOINTS; (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) - { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); - break; + switch (mCFContext->zsVersion) { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSector}}, firstHBF); + break; } TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); } // clang-format off From 78c342d1cc6c11f7f281e7650d01c83c7b8b5352 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 01:02:46 +0100 Subject: [PATCH 73/77] Removing white-spaces --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e8eb436d6ef5c..f82f2d65566ba 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -913,20 +913,20 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); - + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - + float time_clusterizer = 0, time_fill = 0; for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - + auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data - + auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, 
clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); @@ -945,7 +945,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } auto stop1 = std::chrono::high_resolution_clock::now(); - + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } From 6a7b17c8da959e8fa34767c76262a1665bdbcb6d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 08:49:10 +0100 Subject: [PATCH 74/77] Adding necessary if-statement to avoid automatic model loading --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 52 ++++++++++--------- 2 files changed, 28 insertions(+), 26 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 684d4d734a29e..e02262b629340 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -229,7 +229,7 @@ AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify i AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network") -AddOption(nnInferenceEnableOrtOptimization, unsigned int, 1, "", 0, "Enables graph optimizations in ONNX Runtime. Can be greater than 1!") +AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index f82f2d65566ba..336f1aeda0640 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -613,32 +613,34 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); - } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - 
clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + if (GetProcessingSettings().nn.applyNNclusterizer) { + uint32_t maxClusters = -1; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + } + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + AllocateRegisteredMemory(clustererNN.mMemoryId); } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); - AllocateRegisteredMemory(clustererNN.mMemoryId); } #endif From bb163ea3a486db66e1d1d0fd12e1c95203e5bbfd Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 10:55:24 +0100 Subject: [PATCH 75/77] Removing GPUConstantMem, adding interOpNumThreads option --- Common/ML/include/ML/OrtInterface.h | 4 +- Common/ML/src/OrtInterface.cxx | 6 ++- GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 +- .../Global/GPUChainTrackingClusterizer.cxx | 14 +++---- .../GPUTPCNNClusterizerHost.cxx | 3 +- .../GPUTPCNNClusterizerKernels.cxx | 42 +++++++++---------- .../GPUTPCNNClusterizerKernels.h | 8 ++-- GPU/GPUTracking/kernels.cmake | 12 +++--- 8 files changed, 48 insertions(+), 44 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 368754aee0f92..44cafc4a80083 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,8 +84,8 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index dc8d1a23b6569..d96a2a1a2805f 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -52,6 +52,7 @@ void OrtModel::reset(std::unordered_map optionsMap) deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); @@ -90,13 +91,14 @@ void OrtModel::reset(std::unordered_map optionsMap) if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; } } diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index e02262b629340..63fcf51004eae 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -228,7 +228,8 @@ AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify i AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 -AddOption(nnInferenceThreadsPerNN, int, 0, "", 0, "Number of threads used to evaluate one neural network") +AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") +AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. 
Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 336f1aeda0640..6b0e8de3f2498 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -927,23 +927,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -953,7 +953,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) 
} auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, processors(), iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; @@ -970,7 +970,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } else { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 0); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); } if (doGPU && propagateMCLabels) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index e64336016021f..3331c1efb89ea 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,7 +27,8 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"dtype", settings.nnInferenceDtype}, - {"intra-op-num-threads", std::to_string(settings.nnInferenceThreadsPerNN)}, + {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, + {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index cb71fff2674c6..901901165f561 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,11 +34,11 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - auto& clusterer = processors->tpcClusterer[sector]; - auto& clustererNN = processors->tpcNNClusterer[sector]; + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = 
processors.tpcNNClusterer[sector]; if (clustererNN.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices return; } @@ -50,22 +50,22 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - processors->tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors->tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors->tpcNNClusterer[sector].nnClassThreshold); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors->tpcNNClusterer[sector]; + auto& clusterer = processors.tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] @@ -82,20 +82,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (glo_idx >= processors->tpcClusterer[sector].mPmemory->counters.nClusters) { + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { return; } GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); } template <> 
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& notUsed, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - if (glo_idx >= processors->tpcClusterer[sector].mPmemory->counters.nClusters) { + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { return; } GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); @@ -128,11 +128,11 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_ } // Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUConstantMem* processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) { uint glo_idx = get_global_id(0); - auto& clusterer = processors->tpcClusterer[sector]; - auto& clustererNN = processors->tpcNNClusterer[sector]; + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); Array2D isPeakMap(clusterer.mPpeakMap); @@ -192,10 +192,10 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors->tpcClusterer[sector]; - auto& clustererNN = processors->tpcNNClusterer[sector]; + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); @@ -272,10 +272,10 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, GPUConstantMem* processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors->tpcClusterer[sector]; - auto& clustererNN = processors->tpcNNClusterer[sector]; + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 8c29c7b540ee2..8ef41e35a7e21 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -60,12 +60,12 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate }; template - GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, GPUConstantMem*, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); private: - static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, GPUConstantMem*, uint8_t, int8_t, uint); - static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, GPUConstantMem*, uint8_t, int8_t, int8_t, uint); - static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, GPUConstantMem*, uint8_t, int8_t, int8_t, uint); + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); static GPUd() int padOffset(int, int, const GPUTPCGeometry&); static GPUd() int rowOffset(int, int); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index f668a75059422..ad348a84264f0 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -115,12 +115,12 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB GPUConstantMem* processors uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) 
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) endif() o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) From eabba5f377568b18422ee26b3e5e54096151c168 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 11:40:50 +0100 Subject: [PATCH 76/77] Found the bug where I lose clusters --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6b0e8de3f2498..57a114499db1f 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -632,6 +632,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerTotalClusters = maxClusters; clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } if (nn_settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { From 1e807545d2cd7d4781aaafc121e303995b97507d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 14 Mar 2025 14:30:30 +0100 Subject: [PATCH 77/77] Editor configured for whitespaces at EOF --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 2 +- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 44cafc4a80083..93549178848ca 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -94,4 +94,4 @@ class OrtModel } // namespace o2 -#endif // O2_ML_ORTINTERFACE_H \ No newline at end of file +#endif // O2_ML_ORTINTERFACE_H diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index d96a2a1a2805f..fc784dd14d2dc 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -267,4 +267,4 @@ std::vector OrtModel::inference(std::vector>& input) } // namespace ml -} // namespace o2 \ No newline at end of file +} // namespace o2 diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 57a114499db1f..63d56da37595b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1175,4 +1175,4 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #endif return 0; -} \ No newline at end of file +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 6b628132c17b5..ea6340dfd48bc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -74,4 +74,4 @@ class GPUTPCNNClusterizer : public GPUProcessor } // namespace o2::gpu -#endif \ No newline at end of file +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 3331c1efb89ea..5002c63524020 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -62,4 +62,4 @@ void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNC } else { model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); } -} \ No newline at end of file +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 1ba34aa370330..7efa0edecb893 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -65,4 +65,4 @@ class GPUTPCNNClusterizerHost } // namespace o2::gpu -#endif \ No newline at end of file +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 901901165f561..25cd2497fbf62 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -383,4 +383,4 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } return; } -} \ No newline at end of file +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 8ef41e35a7e21..c7bd18115d61f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -74,4 +74,4 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate } // namespace o2::gpu -#endif \ No newline at end of file +#endif
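
Editor's note (not part of the patch series): for readers unfamiliar with the ONNX Runtime options that the patches above expose (nnInferenceIntraOpNumThreads, nnInferenceInterOpNumThreads, nnInferenceEnableOrtOptimization), the following stand-alone C++ sketch shows how such settings typically map onto Ort::SessionOptions. It is an illustrative example only, not the O2 OrtInterface implementation; the model path and numeric values are placeholders.

// Minimal sketch, assuming ONNX Runtime's C++ API is available.
// "model.onnx" and the thread counts below are placeholders, not O2 defaults.
#include <onnxruntime_cxx_api.h>

int main()
{
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "nn-clusterizer-example");
  Ort::SessionOptions opts;

  int intraOpNumThreads = 1; // analogous to nnInferenceIntraOpNumThreads (0 = auto-detect)
  int interOpNumThreads = 1; // analogous to nnInferenceInterOpNumThreads (0 = auto-detect)
  opts.SetIntraOpNumThreads(intraOpNumThreads);
  opts.SetInterOpNumThreads(interOpNumThreads);

  // Parallel execution mode only pays off when more than one thread is available.
  if (intraOpNumThreads > 1 || interOpNumThreads > 1) {
    opts.SetExecutionMode(ExecutionMode::ORT_PARALLEL);
  } else {
    opts.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
  }

  // Graph optimization levels correspond to the [0, 1, 2, 99] values referenced by
  // nnInferenceEnableOrtOptimization: 0 = disable, 1 = basic, 2 = extended, 99 = all.
  opts.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);

  Ort::Session session(env, "model.onnx", opts); // placeholder model path
  return 0;
}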