From 84eac06b0bb1562d826b4ace3a8435c3385b91a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:36:05 +0100 Subject: [PATCH 01/10] Initial set of bug.fixes and cosmetic changes --- .../Global/GPUChainTrackingClusterizer.cxx | 15 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 4 +- .../GPUTPCNNClusterizerHost.h | 17 -- .../GPUTPCNNClusterizerKernels.cxx | 160 ++++++++---------- 6 files changed, 101 insertions(+), 136 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 63d56da37595b..546f62b6c35d6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -614,7 +614,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; + uint32_t maxClusters = 0; for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } @@ -918,6 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -930,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = 
std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, 
iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -956,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..df0f895cd5976 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * 
nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, 
outputDataClass, nnClusterizerTotalClusters); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..01d1873f3b351 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; @@ -58,7 +58,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int mISector = -1; // Memory allocation for neural network - uint class2_elements = 0; float* inputData32 = nullptr; OrtDataType::Float16_t* inputData16 = nullptr; float* outputDataClass = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..321fad3d039db 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,6 +12,8 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" @@ -37,7 +39,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl model_class.init(OrtOptions); clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..430d78d0bb2fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -44,23 +44,6 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 25cd2497fbf62..c536303147ae6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,7 +34,7 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = 
processors.tpcClusterer[sector]; @@ -44,91 +44,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = 
get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC 
boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) -{ - return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(geo.NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -144,7 +66,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.peakPositions[glo_idx] = peak; clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; + clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE @@ -192,14 +114,43 @@ GPUd() void 
GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; @@ -210,7 +161,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -223,7 +174,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -272,24 +222,25 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void 
GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -302,7 +253,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -384,3 +334,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } } + +// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int 
row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} From 219164923257101b6084bd97700314ea4f109d30 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 15 Mar 2025 20:37:59 +0000 Subject: [PATCH 02/10] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index df0f895cd5976..655e2bf5a933c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,7 +24,7 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerBatchedMode > 0) { if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { From b742c50537a7aa73812e62079306830884fce271 Mon Sep 17 00:00:00 2001 From: 
Christian Sonnabend Date: Sat, 15 Mar 2025 21:42:35 +0100 Subject: [PATCH 03/10] Adjusting eval sizes. Makes code neater and avoids some calculations --- Common/ML/src/OrtInterface.cxx | 13 ++++++------- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..ae809a2ba5c1a 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -226,19 +226,18 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - 
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 321fad3d039db..b32d042ebd1fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -57,11 +57,11 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData32, size, output); } } From 0c1cfb742e987ab50d87c0f5023a63841531335d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 20 Mar 2025 13:13:25 +0100 Subject: [PATCH 04/10] Adding separate functions. 
Now the host process only needs one instance and one initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +++--- .../GPUTPCNNClusterizerHost.cxx | 24 +++++++++++++++---- .../GPUTPCNNClusterizerHost.h | 3 +++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b928ed7c177eb..916f2634fb2f6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; + nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -640,7 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, 
clustererNN); + nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } } @@ -916,7 +918,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b32d042ebd1fa..a1f78ca787282 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -21,7 +21,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +{ + init(settings); +} + +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, @@ -37,21 +42,30 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; model_class.init(OrtOptions); - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = splitString(settings.nnRegressionPath, ":"); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; 
model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; OrtOptions["model-path"] = reg_model_paths[1]; model_reg_2.init(OrtOptions); + } + } +} + +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 430d78d0bb2fb..1f31567dc42f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -39,6 +39,9 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; From 83c004fa0f84ab8fec7a7458e210ad63ab7a489f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 22 Mar 2025 16:55:50 +0100 Subject: [PATCH 05/10] First version of CCDB implementation 
--- GPU/GPUTracking/Definitions/GPUSettingsList.h | 8 ++++++ .../Global/GPUChainTrackingClusterizer.cxx | 21 +++++++++++++++ .../GPUTPCNNClusterizerHost.cxx | 26 +++++++++++++++++++ .../GPUTPCNNClusterizerHost.h | 1 + 4 files changed, 56 insertions(+) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 40a7fc71cbb4d..7611e810768fe 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -249,6 +249,14 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 916f2634fb2f6..c7816bb9ec17c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -615,6 +615,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { + if(nn_settings.nnLoadFromCCDB) { + std::unordered_map ccdbSettings = { + {"nnCCDBPath", nn_settings.nnCCDBPath}, + {"inputDType", nn_settings.inputDType}, + {"outputDType", nn_settings.outputDType}, + {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, + {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, + {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} + }; + + std::unordered_map networkRetrieval = ccdbSettings; + + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index a1f78ca787282..20190994b97ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -26,6 +26,32 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } +void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { + o2::ccdb::CcdbApi ccdbApi; + ccdbApi.init(settings["nnCCDBURL"]); + + metadata[settings["inputDType"]] = settings["inputDType"]; + metadata[settings["outputDType"]] = settings["outputDType"]; + metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model + metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} + void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 
1f31567dc42f1..a3f3ecd72ffca 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,6 +41,7 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void loadFromCCDB(std::unordered_map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From d767ed1b636f97c0fe8447e8e3ebfc854a4aa214 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 00:16:18 +0100 Subject: [PATCH 06/10] Working CCDB API calls (tested with test-ccdb) --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 11 +++++--- .../Global/GPUChainTrackingClusterizer.cxx | 13 ++++++---- .../GPUTPCNNClusterizerHost.cxx | 25 ++++++++++--------- .../GPUTPCNNClusterizerHost.h | 8 ++++-- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 7611e810768fe..5b4d08f5ffe67 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -229,6 +229,8 @@ AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify i AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) 
Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -251,12 +253,13 @@ AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regress AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") -AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") -AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c7816bb9ec17c..fb6bffe51a160 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -616,26 +616,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { - std::unordered_map ccdbSettings = { + std::map ccdbSettings = { + {"nnCCDBURL", nn_settings.nnCCDBURL}, {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.inputDType}, - {"outputDType", nn_settings.outputDType}, + {"inputDType", nn_settings.nnInferenceInputDType}, + {"outputDType", nn_settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, 
{"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; - std::unordered_map networkRetrieval = ccdbSettings; + std::map networkRetrieval = ccdbSettings; + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; networkRetrieval["nnCCDBEvalType"] = "classification_c1"; networkRetrieval["outputFile"] = "net_classification_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; networkRetrieval["outputFile"] = "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); } + uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 20190994b97ba..b4ee558b1e201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,6 +16,7 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" +#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" @@ -26,27 +27,27 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); - metadata[settings["inputDType"]] = settings["inputDType"]; - metadata[settings["outputDType"]] = settings["outputDType"]; - metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata[settings["nnCCDBWithMomentum"]] = 
std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model - metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } if (settings["nnCCDBBeamType"] != "") { - metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; } - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; } else { LOG(error) << "Failed to retrieve network from CCDB"; } @@ -69,7 +70,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set model_class.init(OrtOptions); - reg_model_paths = 
splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a3f3ecd72ffca..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,17 +37,21 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::unordered_map); + void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; + + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From ad4b22be3c457fbcb6e0cca852bf8800d7b9929e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 10:38:51 +0100 Subject: [PATCH 07/10] Improve fetching, but have to pass settings by value, not const ref --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 31 +++++++++++++++---- .../GPUTPCNNClusterizerHost.cxx | 4 +-- .../GPUTPCNNClusterizerHost.h | 4 +-- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 
5b4d08f5ffe67..a8a4ae566f485 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -255,6 +255,7 @@ AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fb6bffe51a160..dcd5cc2197e3c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,6 +40,7 @@ #endif #ifdef GPUCA_HAS_ONNX +#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -612,7 +613,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { @@ -626,17 +627,35 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); std::map networkRetrieval = ccdbSettings; - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = 
nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } + nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; + } } uint32_t maxClusters = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b4ee558b1e201..da32d4938ebed 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) { init(settings); } @@ -53,7 +53,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) +void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) { OrtOptions = { 
{"model-path", settings.nnClassificationPath}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..b6d5e48304e0d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); + GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); - void init(const GPUSettingsProcessingNNclusterizer&); + void init(GPUSettingsProcessingNNclusterizer); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 81c646be0d2e27af8707a0b67a651cccc9de5b64 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 11:04:57 +0100 Subject: [PATCH 08/10] Using const ref and moving CCDB calls to host initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 47 +--------------- .../GPUTPCNNClusterizerHost.cxx | 53 +++++++++++++++++-- .../GPUTPCNNClusterizerHost.h | 4 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index dcd5cc2197e3c..98a0ec16495c5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,7 +40,6 @@ #endif #ifdef GPUCA_HAS_ONNX -#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -613,51 +612,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially 
this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { - if(nn_settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", nn_settings.nnCCDBURL}, - {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.nnInferenceInputDType}, - {"outputDType", nn_settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} - }; - - std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } - nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = 
nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; - } - } - uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { @@ -988,7 +945,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index da32d4938ebed..533ac0c7481ff 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } @@ -53,10 +53,55 @@ void 
GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + + if(settings.nnLoadFromCCDB) { + std::map ccdbSettings = { + {"nnCCDBURL", settings.nnCCDBURL}, + {"nnCCDBPath", settings.nnCCDBPath}, + {"inputDType", settings.nnInferenceInputDType}, + {"outputDType", settings.nnInferenceOutputDType}, + {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} + }; + + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); + std::map networkRetrieval = ccdbSettings; + + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + loadFromCCDB(networkRetrieval); + } + class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally + + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path = networkRetrieval["outputFile"]; + if (fetchMode[1] 
== "r2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path += ":", networkRetrieval["outputFile"]; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, + {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, @@ -70,7 +115,7 @@ void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) model_class.init(OrtOptions); - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index b6d5e48304e0d..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); - void init(GPUSettingsProcessingNNclusterizer); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 566ddb7b0b6133cde807ef5526a2efa66be1a785 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Mar 2025 09:51:18 +0100 Subject: [PATCH 09/10] Simplifications and renaming --- Common/ML/include/ML/OrtInterface.h | 2 +- 
Common/ML/src/OrtInterface.cxx | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - .../Global/GPUChainTrackingClusterizer.cxx | 20 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 3 +-- .../GPUTPCNNClusterizerKernels.cxx | 20 +++++++++++-------- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..cbd8501f9898f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,7 +84,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index ae809a2ba5c1a..5e9f1a8b0a5b6 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? 
std::stoi(optionsMap["intra-op-num-threads"]) : 0); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index a8a4ae566f485..83f6e320b8f5b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -228,7 +228,6 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 
0 = auto-detect, can lead to problems on SLURM systems.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 98a0ec16495c5..1638e134a4d6a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -641,7 +641,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -931,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + 
runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -957,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = 
std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 655e2bf5a933c..cc3f29434615f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -25,9 +25,9 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 01d1873f3b351..0b9e3a6572684 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -54,7 +54,7 
@@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; // Memory allocation for neural network diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 533ac0c7481ff..3463740cf7918 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -105,7 +105,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, @@ -134,7 +133,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 
763119444bf7c..73051bd8477fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -125,20 +125,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcNNClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; + current_max_prob = clustererNN.modelProbabilities[pIdx]; } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + if (clustererNN.modelProbabilities[pIdx] > current_max_prob) { current_max_prob = clustererNN.modelProbabilities[pIdx]; class_label = pIdx - elem_iterator; } } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } } template <> @@ -157,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { ClusterAccumulator pc; From a9c33b5b7775123283b0de118b99ae2945b0c669 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Mar 2025 08:52:20 +0000 Subject: [PATCH 10/10] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 12 ++++++------ .../TPCClusterFinder/GPUTPCNNClusterizerHost.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 3463740cf7918..35db3f2107e7d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,15 +27,16 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) +{ o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); metadata["inputDType"] = settings["inputDType"]; metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, 
regression_2C metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } @@ -58,7 +59,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { std::map ccdbSettings = { {"nnCCDBURL", settings.nnCCDBURL}, {"nnCCDBPath", settings.nnCCDBPath}, @@ -66,8 +67,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"outputDType", settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, {"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} - }; + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; std::string nnFetchFolder = ""; std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..210d5f94dd503 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,9 +49,9 @@ class GPUTPCNNClusterizerHost o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - private: - std::map metadata; - std::map headers; + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu