From 84eac06b0bb1562d826b4ace3a8435c3385b91a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:36:05 +0100 Subject: [PATCH 01/40] Initial set of bug.fixes and cosmetic changes --- .../Global/GPUChainTrackingClusterizer.cxx | 15 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 4 +- .../GPUTPCNNClusterizerHost.h | 17 -- .../GPUTPCNNClusterizerKernels.cxx | 160 ++++++++---------- 6 files changed, 101 insertions(+), 136 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 63d56da37595b..546f62b6c35d6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -614,7 +614,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; + uint32_t maxClusters = 0; for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } @@ -918,6 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -930,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + 
runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -956,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..df0f895cd5976 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * 
nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..01d1873f3b351 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; @@ -58,7 +58,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int mISector = -1; // Memory allocation for neural network - uint class2_elements = 0; float* inputData32 = nullptr; OrtDataType::Float16_t* inputData16 = nullptr; float* outputDataClass = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..321fad3d039db 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,6 +12,8 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" @@ -37,7 +39,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl model_class.init(OrtOptions); clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..430d78d0bb2fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -44,23 +44,6 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t 
pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 25cd2497fbf62..c536303147ae6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,7 +34,7 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -44,91 +44,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in 
[-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) -{ - return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(geo.NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -144,7 +66,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.peakPositions[glo_idx] = peak; clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; + clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE @@ -192,14 +114,43 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; @@ -210,7 +161,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -223,7 +174,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -272,24 +222,25 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -302,7 +253,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -384,3 +334,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } } + +// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} From 219164923257101b6084bd97700314ea4f109d30 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 15 Mar 2025 20:37:59 +0000 Subject: [PATCH 02/40] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index df0f895cd5976..655e2bf5a933c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,7 +24,7 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerBatchedMode > 0) { if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { From b742c50537a7aa73812e62079306830884fce271 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:42:35 +0100 Subject: [PATCH 03/40] Adjusting eval sizes. 
Makes code neater and avoids some calculations --- Common/ML/src/OrtInterface.cxx | 13 ++++++------- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..ae809a2ba5c1a 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -226,19 +226,18 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 321fad3d039db..b32d042ebd1fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -57,11 +57,11 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData32, size, output); } } From 0c1cfb742e987ab50d87c0f5023a63841531335d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 20 Mar 2025 13:13:25 +0100 Subject: [PATCH 04/40] Adding separate 
functions. Now the host process only needs one instance and one initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +++--- .../GPUTPCNNClusterizerHost.cxx | 24 +++++++++++++++---- .../GPUTPCNNClusterizerHost.h | 3 +++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b928ed7c177eb..916f2634fb2f6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; + nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -640,7 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } } @@ -916,7 +918,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b32d042ebd1fa..a1f78ca787282 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -21,7 +21,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +{ + init(settings); +} + +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, @@ -37,21 +42,30 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; 
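// A minimal standalone sketch of how an options map like OrtOptions above drives a single
// o2::ml::OrtModel: init() consumes the key/value pairs and inference() runs one batched
// evaluation. The header path below is an assumption and only option keys visible in this file
// are used; nRows counts input rows (the batch size), matching the adjusted inference() of this
// series. Illustration only, not code taken from this patch.
#include "ML/OrtInterface.h" // assumed header exposing o2::ml::OrtModel
#include <string>
#include <unordered_map>
#include <vector>

std::vector<float> runClassifierOnce(const std::string& modelPath, std::vector<float>& input, size_t nRows)
{
  std::unordered_map<std::string, std::string> opts{
    {"model-path", modelPath},
    {"device", "CPU"},
    {"device-id", "0"},
    {"allocate-device-memory", "0"},
    {"logging-level", "1"}};
  o2::ml::OrtModel net;
  net.init(opts);
  // One output row per input row; the per-row output width is taken from the model itself.
  std::vector<float> output(nRows * net.getNumOutputNodes()[0][1]);
  net.inference(input.data(), nRows, output.data());
  return output;
}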
model_class.init(OrtOptions); - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = splitString(settings.nnRegressionPath, ":"); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; OrtOptions["model-path"] = reg_model_paths[1]; model_reg_2.init(OrtOptions); + } + } +} + +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 430d78d0bb2fb..1f31567dc42f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -39,6 +39,9 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; From 83c004fa0f84ab8fec7a7458e210ad63ab7a489f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 22 Mar 2025 16:55:50 +0100 Subject: [PATCH 05/40] First version of CCDB implementation --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 8 ++++++ .../Global/GPUChainTrackingClusterizer.cxx | 21 +++++++++++++++ .../GPUTPCNNClusterizerHost.cxx | 26 +++++++++++++++++++ .../GPUTPCNNClusterizerHost.h | 1 + 4 files changed, 56 insertions(+) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 40a7fc71cbb4d..7611e810768fe 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -249,6 +249,14 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. 
This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 916f2634fb2f6..c7816bb9ec17c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -615,6 +615,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { + if(nn_settings.nnLoadFromCCDB) { + std::unordered_map ccdbSettings = { + {"nnCCDBPath", nn_settings.nnCCDBPath}, + {"inputDType", nn_settings.inputDType}, + {"outputDType", nn_settings.outputDType}, + {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, + {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, + {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} + }; + + std::unordered_map networkRetrieval = ccdbSettings; + + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index a1f78ca787282..20190994b97ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -26,6 +26,32 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } +void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { + o2::ccdb::CcdbApi ccdbApi; + ccdbApi.init(settings["nnCCDBURL"]); + + metadata[settings["inputDType"]] = settings["inputDType"]; + metadata[settings["outputDType"]] = settings["outputDType"]; + metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, 
regression_2C + metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model + metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} + void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 1f31567dc42f1..a3f3ecd72ffca 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,6 +41,7 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void loadFromCCDB(std::unordered_map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From d767ed1b636f97c0fe8447e8e3ebfc854a4aa214 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 00:16:18 +0100 Subject: [PATCH 06/40] Working CCDB API calls (tested with test-ccdb) --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 11 +++++--- .../Global/GPUChainTrackingClusterizer.cxx | 13 ++++++---- .../GPUTPCNNClusterizerHost.cxx | 25 ++++++++++--------- .../GPUTPCNNClusterizerHost.h | 8 ++++-- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 7611e810768fe..5b4d08f5ffe67 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -229,6 +229,8 @@ AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify i AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 
0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -251,12 +253,13 @@ AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regress AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") -AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") -AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c7816bb9ec17c..fb6bffe51a160 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -616,26 +616,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { - std::unordered_map ccdbSettings = { + std::map ccdbSettings = { + {"nnCCDBURL", nn_settings.nnCCDBURL}, {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.inputDType}, - {"outputDType", nn_settings.outputDType}, + {"inputDType", nn_settings.nnInferenceInputDType}, + {"outputDType", nn_settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; - std::unordered_map networkRetrieval = ccdbSettings; + std::map networkRetrieval = ccdbSettings; + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; networkRetrieval["nnCCDBEvalType"] = "classification_c1"; networkRetrieval["outputFile"] = "net_classification_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; networkRetrieval["outputFile"] = "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); } + uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 20190994b97ba..b4ee558b1e201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,6 +16,7 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" +#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" @@ -26,27 +27,27 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); - metadata[settings["inputDType"]] = settings["inputDType"]; - metadata[settings["outputDType"]] = settings["outputDType"]; - metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model - metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + 
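// Each key added to this metadata map acts as a selection filter: CcdbApi::retrieveBlob() below is
// expected to match these entries against the metadata stored with the objects under nnCCDBPath and
// to download the ONNX blob of the matching network into settings["outputFile"].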
metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } if (settings["nnCCDBBeamType"] != "") { - metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; } - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; } else { LOG(error) << "Failed to retrieve network from CCDB"; } @@ -69,7 +70,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set model_class.init(OrtOptions); - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a3f3ecd72ffca..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,17 +37,21 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::unordered_map); + void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; + + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From ad4b22be3c457fbcb6e0cca852bf8800d7b9929e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 10:38:51 +0100 Subject: [PATCH 07/40] Improve fetching, but have to pass settings by value, not const ref --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 31 +++++++++++++++---- .../GPUTPCNNClusterizerHost.cxx | 4 +-- .../GPUTPCNNClusterizerHost.h | 4 +-- 4 files changed, 30 
insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5b4d08f5ffe67..a8a4ae566f485 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -255,6 +255,7 @@ AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fb6bffe51a160..dcd5cc2197e3c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,6 +40,7 @@ #endif #ifdef GPUCA_HAS_ONNX +#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -612,7 +613,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { @@ -626,17 +627,35 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); std::map networkRetrieval = ccdbSettings; - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } + nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // 
Setting the proper path from the where the models will be initialized locally networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; + } } uint32_t maxClusters = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b4ee558b1e201..da32d4938ebed 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) { init(settings); } @@ -53,7 +53,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) +void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..b6d5e48304e0d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); + GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); - void init(const GPUSettingsProcessingNNclusterizer&); + void init(GPUSettingsProcessingNNclusterizer); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 81c646be0d2e27af8707a0b67a651cccc9de5b64 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 11:04:57 +0100 Subject: [PATCH 08/40] Using const ref and moving CCDB calls to host initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 47 +--------------- .../GPUTPCNNClusterizerHost.cxx | 53 +++++++++++++++++-- .../GPUTPCNNClusterizerHost.h | 4 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index dcd5cc2197e3c..98a0ec16495c5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,7 +40,6 @@ #endif #ifdef GPUCA_HAS_ONNX -#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -613,51 +612,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + const 
GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { - if(nn_settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", nn_settings.nnCCDBURL}, - {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.nnInferenceInputDType}, - {"outputDType", nn_settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} - }; - - std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } - nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; - } - } - uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { @@ -988,7 +945,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index da32d4938ebed..533ac0c7481ff 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } @@ -53,10 +53,55 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + + if(settings.nnLoadFromCCDB) { + std::map ccdbSettings = { + {"nnCCDBURL", settings.nnCCDBURL}, + {"nnCCDBPath", settings.nnCCDBPath}, + {"inputDType", settings.nnInferenceInputDType}, + {"outputDType", settings.nnInferenceOutputDType}, + {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} + }; + + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); + std::map networkRetrieval = ccdbSettings; + + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + loadFromCCDB(networkRetrieval); + } + class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from where the models will be initialized locally + + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path += ":" + networkRetrieval["outputFile"]; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, + {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, @@ -70,7 +115,7 @@ void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) model_class.init(OrtOptions); - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1
|| reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index b6d5e48304e0d..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); - void init(GPUSettingsProcessingNNclusterizer); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 566ddb7b0b6133cde807ef5526a2efa66be1a785 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Mar 2025 09:51:18 +0100 Subject: [PATCH 09/40] Simplifications and renaming --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - .../Global/GPUChainTrackingClusterizer.cxx | 20 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 3 +-- .../GPUTPCNNClusterizerKernels.cxx | 20 +++++++++++-------- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..cbd8501f9898f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,7 +84,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index ae809a2ba5c1a..5e9f1a8b0a5b6 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? 
std::stoi(optionsMap["intra-op-num-threads"]) : 0); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index a8a4ae566f485..83f6e320b8f5b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -228,7 +228,6 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 98a0ec16495c5..1638e134a4d6a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -641,7 +641,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -931,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } 
else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -957,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 655e2bf5a933c..cc3f29434615f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -25,9 +25,9 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 01d1873f3b351..0b9e3a6572684 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -54,7 +54,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; // Memory allocation for neural network diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 533ac0c7481ff..3463740cf7918 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -105,7 +105,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, @@ -134,7 +133,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 763119444bf7c..73051bd8477fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -125,20 +125,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcNNClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; + 
current_max_prob = clustererNN.modelProbabilities[pIdx]; } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } } template <> @@ -157,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { ClusterAccumulator pc; From a9c33b5b7775123283b0de118b99ae2945b0c669 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Mar 2025 08:52:20 +0000 Subject: [PATCH 10/40] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 12 ++++++------ .../TPCClusterFinder/GPUTPCNNClusterizerHost.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 3463740cf7918..35db3f2107e7d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,15 +27,16 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) +{ o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); metadata["inputDType"] = settings["inputDType"]; metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } @@ -58,7 +59,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { std::map ccdbSettings = { {"nnCCDBURL", settings.nnCCDBURL}, {"nnCCDBPath", settings.nnCCDBPath}, @@ -66,8 +67,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"outputDType", settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, 
{"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} - }; + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; std::string nnFetchFolder = ""; std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..210d5f94dd503 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,9 +49,9 @@ class GPUTPCNNClusterizerHost o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - private: - std::map metadata; - std::map headers; + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From 9037ea6d7b46a44e73ba5da3f741852a8189b797 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 15:16:21 +0100 Subject: [PATCH 11/40] First version of GPU stream implementation. Still needs testing. --- Common/ML/include/ML/OrtInterface.h | 4 +- Common/ML/src/OrtInterface.cxx | 13 +++--- .../Base/GPUReconstructionProcessing.h | 3 ++ GPU/GPUTracking/Base/cuda/CMakeLists.txt | 1 + .../Base/cuda/GPUReconstructionCUDA.cu | 42 +++++++++++++++++++ .../Base/cuda/GPUReconstructionCUDA.h | 1 + GPU/GPUTracking/Base/hip/CMakeLists.txt | 1 + 7 files changed, 57 insertions(+), 8 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..3d2de192a1fd6 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,8 +84,8 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 1, interOpNumThreads = 1, streamId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..7f550e8e9b32c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,8 +48,7 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + streamId = (optionsMap.contains("stream-id") ? std::stoi(optionsMap["stream-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? 
std::stoi(optionsMap["inter-op-num-threads"]) : 0); @@ -61,7 +60,8 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_ROCM_BUILD) #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); + o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); LOG(info) << "(ORT) ROCM execution provider set"; } #endif @@ -69,7 +69,7 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_MIGRAPHX_BUILD) #if ORT_MIGRAPHX_BUILD == 1 if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif @@ -77,7 +77,8 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_CUDA_BUILD) #if ORT_CUDA_BUILD == 1 if (device == "CUDA") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); + o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } @@ -85,7 +86,7 @@ void OrtModel::reset(std::unordered_map optionsMap) #endif if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index b0466efceac24..662258ba13d97 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -22,6 +22,8 @@ #include #include +struct OrtSessionOptions; + namespace o2::gpu { @@ -88,6 +90,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; + virtual int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { return 0; } struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index de54f09fdc2e1..613a73bc49b27 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -115,6 +115,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingCUDA + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f87d5c8189cdc..f1f3c2ecba12f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -16,6 +16,7 @@ #include "GPUReconstructionCUDAIncludesHost.h" #include +#include "ML/OrtInterface.h" #include "GPUReconstructionCUDA.h" #include 
"GPUReconstructionCUDAInternals.h" @@ -35,6 +36,10 @@ #undef GPUCA_KRNL #endif +#ifdef GPUCA_HAS_ONNX +#include +#endif + static constexpr size_t REQUIRE_MIN_MEMORY = 1024L * 1024 * 1024; static constexpr size_t REQUIRE_MEMORY_RESERVED = 512L * 1024 * 1024; static constexpr size_t REQUIRE_FREE_MEMORY_RESERVED_PER_SM = 40L * 1024 * 1024; @@ -656,6 +661,28 @@ void GPUReconstructionCUDA::endGPUProfiling() { GPUChkErr(cudaProfilerStop()); } + +#ifdef GPUCA_HAS_ONNX +int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +{ + OrtCUDAProviderOptionsV2* cuda_options = nullptr; + CreateCUDAProviderOptions(&cuda_options); + + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; + // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; + // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); + + // this implicitly sets "has_user_compute_stream" + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); + Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); + + // Finally, don't forget to release the provider options + ReleaseCUDAProviderOptions(cuda_options); + + return 0; +} +#endif // GPUCA_HAS_ONNX + #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) { @@ -663,6 +690,21 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) GPUChkErr(hipHostGetDevicePointer(&retVal, ptr, 0)); return retVal; } + +#ifdef GPUCA_HAS_ONNX +int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +{ + // Create ROCm provider options + const auto& api = Ort::GetApi(); + OrtROCMProviderOptions rocm_options{}; + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.user_compute_stream = &mInternals->Streams[stream]; + + // Append the ROCm execution provider with the custom HIP stream + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(session_options, &rocm_options)); + return 0; +} +#endif // GPUCA_HAS_ONNX #endif // __HIPCC__ namespace o2::gpu diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index a98b14a873ca0..34674c549a9c7 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -79,6 +79,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 43259decef956..d4ebd29306ccc 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -153,6 +153,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingHIP + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src From 64c19d5a5700b726639236d40b619a49d21fd0c4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 15:41:11 +0100 Subject: [PATCH 12/40] Fixes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 3 +-- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 
GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 4 ++-- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f1f3c2ecba12f..d8c4dc7914718 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDAIncludesHost.h" #include -#include "ML/OrtInterface.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" @@ -692,7 +691,7 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { // Create ROCm provider options const auto& api = Ort::GetApi(); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 4047dcae0a6b3..f6a3d64c3e120 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -640,7 +640,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, iSector); AllocateRegisteredMemory(clustererNN.mMemoryId); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..0f53e12d2e063 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -19,12 +19,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer, int32_t streamId) { OrtOptions = { {"model-path", settings.nnClassificationPath}, {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"stream-id", std::to_string(streamId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..51f1f76679c7b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,7 +37,7 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&, int32_t = 0); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From 
8a5bb69c12ea5629d930e5c953345b7372d024d3 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 27 Mar 2025 14:44:20 +0000 Subject: [PATCH 13/40] Please consider the following formatting changes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index d8c4dc7914718..915f3bb4707de 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -696,7 +696,7 @@ int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_option // Create ROCm provider options const auto& api = Ort::GetApi(); OrtROCMProviderOptions rocm_options{}; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.user_compute_stream = &mInternals->Streams[stream]; // Append the ROCm execution provider with the custom HIP stream From 46fb1e126da5c6bb13b4f725114c2b2c0e048649 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 21:09:27 +0100 Subject: [PATCH 14/40] Adding the lane variable. This PR will in any case conflict with #14069 --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index f6a3d64c3e120..bf83f97b28775 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -916,7 +916,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 70320c3afce42dca26ccd10f165b246e82b6341f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 29 Mar 2025 13:39:27 +0100 Subject: [PATCH 15/40] Compiles on EPNs. Need to add shadow processors next. But for this, I will merge https://github.com/AliceO2Group/AliceO2/pull/14069 to have the changes in GPUChainTrackingClusterizer. 
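The per-lane wiring this patch aims at, pieced together from the hunks below and the clusterizer changes later in the series, looks roughly like the following sketch (illustrative only, not part of the patch; GPUTPCNNClusterizerHost, OrtModel::updateSessionOptions() and GPUChain::SetONNXGPUStream are the names introduced in this series):

    // Per-lane setup in GPUChainTrackingClusterizer (sketch, assuming the accessors added below):
    GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); // OrtOptions carries {"stream-id", lane}
    // Hand the lane's CUDA/HIP stream to each ONNX Runtime session, so that inference work
    // can be enqueued on the same stream as the clusterizer kernels:
    SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane);
    SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane);
    SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane);

SetONNXGPUStream is an empty virtual on the CPU backend and, on the CUDA/ROCm backends, attaches the stream through the execution-provider options (see the GPUReconstructionCUDA.cu changes below).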
--- Common/ML/include/ML/OrtInterface.h | 7 ++ Common/ML/src/OrtInterface.cxx | 70 +++++++++++-------- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 7 ++ .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 20 +++--- .../Base/cuda/GPUReconstructionCUDA.h | 6 +- GPU/GPUTracking/CMakeLists.txt | 14 ++++ GPU/GPUTracking/Global/GPUChain.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 3 + 9 files changed, 90 insertions(+), 40 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 3d2de192a1fd6..33e6821108112 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,6 +26,11 @@ // O2 includes #include "Framework/Logger.h" +namespace Ort { + struct SessionOptions; + struct MemoryInfo; +} + namespace o2 { @@ -42,6 +47,8 @@ class OrtModel void init(std::unordered_map optionsMap) { reset(optionsMap); } void reset(std::unordered_map); bool isInitialized() { return mInitialized; } + Ort::SessionOptions* updateSessionOptions(); + Ort::MemoryInfo* updateMemoryInfo(); virtual ~OrtModel() = default; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 7f550e8e9b32c..1f750abf8226e 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,6 +35,16 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; +Ort::SessionOptions* OrtModel::updateSessionOptions() +{ + return &(pImplOrt->sessionOptions); +} + +Ort::MemoryInfo* OrtModel::updateMemoryInfo() +{ + return &(pImplOrt->memoryInfo); +} + void OrtModel::reset(std::unordered_map optionsMap) { @@ -56,39 +66,41 @@ void OrtModel::reset(std::unordered_map optionsMap) enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); - std::string dev_mem_str = "Hip"; -#if defined(ORT_ROCM_BUILD) -#if ORT_ROCM_BUILD == 1 - if (device == "ROCM") { - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); - o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); - LOG(info) << "(ORT) ROCM execution provider set"; - } -#endif -#endif -#if defined(ORT_MIGRAPHX_BUILD) -#if ORT_MIGRAPHX_BUILD == 1 - if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); - LOG(info) << "(ORT) MIGraphX execution provider set"; - } -#endif -#endif -#if defined(ORT_CUDA_BUILD) -#if ORT_CUDA_BUILD == 1 - if (device == "CUDA") { - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); - o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); - LOG(info) << "(ORT) CUDA execution provider set"; - dev_mem_str = "Cuda"; - } -#endif -#endif - +// #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +// if (device == "ROCM") { +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); +// SetONNXGPUStream(pImplOrt->sessionOptions, streamId); +// LOG(info) << "(ORT) ROCM execution provider set"; +// } +// #endif +// #if defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1 +// if (device == "MIGRAPHX") { +// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); +// LOG(info) << "(ORT) MIGraphX execution provider set"; +// } +// #endif +// #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +// if (device == "CUDA") { +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); +// SetONNXGPUStream(pImplOrt->sessionOptions, streamId); +// LOG(info) << "(ORT) CUDA execution provider set"; +// dev_mem_str = "Cuda"; +// } +// #endif + +#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { + std::string dev_mem_str = ""; + if (device == "ROCM") { + dev_mem_str = "Hip"; + } + if (device == "CUDA") { + dev_mem_str = "Cuda"; + } pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } +#endif if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index fd999ec2304e1..3bb6fff25be17 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,6 +24,10 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" +namespace Ort { + struct SessionOptions; +} + namespace o2::gpu { @@ -111,6 +115,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels GetThreadContext() override; - virtual int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { return 0; } + virtual void SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 915f3bb4707de..3e0f739418125 100644 --- 
a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -661,11 +661,12 @@ void GPUReconstructionCUDA::endGPUProfiling() GPUChkErr(cudaProfilerStop()); } -#ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) { OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); + OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; @@ -673,12 +674,10 @@ int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_optio // this implicitly sets "has_user_compute_stream" UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); - Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); + Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(raw_options, cuda_options)); // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); - - return 0; } #endif // GPUCA_HAS_ONNX @@ -690,8 +689,8 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) return retVal; } -#ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) { // Create ROCm provider options const auto& api = Ort::GetApi(); @@ -699,10 +698,13 @@ int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_option rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.user_compute_stream = &mInternals->Streams[stream]; + // Get the raw OrtSessionOptions pointer from the Ort::SessionOptions wrapper + OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); + // Append the ROCm execution provider with the custom HIP stream - Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(session_options, &rocm_options)); - return 0; + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(raw_options, &rocm_options)); } + #endif // GPUCA_HAS_ONNX #endif // __HIPCC__ diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 34674c549a9c7..b72b8264c4098 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,6 +25,10 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif +namespace Ort { + struct SessionOptions; +} + namespace o2::gpu { struct GPUReconstructionCUDAInternals; @@ -79,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt 
index ad7dd9c210cd1..0859502e59ef2 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -14,6 +14,20 @@ set(MODULE GPUTracking) # set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0") # to uncomment if needed, tired of typing this... # set(GPUCA_BUILD_DEBUG 1) +# Pass ORT variables as a preprocessor definition +if(DEFINED ENV{ORT_ROCM_BUILD}) + add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) +endif() +if(DEFINED ENV{ORT_CUDA_BUILD}) + add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) +endif() +if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) + add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) +endif() +if(DEFINED ENV{ORT_TENSORRT_BUILD}) + add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) +endif() + if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 290ae32cafca8..66b50b781172a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,6 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } + inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream) { mRec->SetONNXGPUStream(opt, stream); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bf83f97b28775..e5e36189fac50 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -917,6 +917,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); + SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane); + SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane); + SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 9d9267f6d9afc191c15022eb58f0afd9ce6b997f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 29 Mar 2025 19:20:21 +0100 Subject: [PATCH 16/40] Adding shadow instance. 
Not sure if this correctly allocates GPU memory using AllocateRegisteredMemory --- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 6 +- .../Base/cuda/GPUReconstructionCUDA.h | 2 +- GPU/GPUTracking/Global/GPUChain.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 97 +++++++++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 29 ++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 6 +- .../GPUTPCNNClusterizerHost.h | 2 +- 10 files changed, 100 insertions(+), 49 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 3bb6fff25be17..6f2610c3c93c7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -116,7 +116,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels GetThreadContext() override; - virtual void SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) {} + virtual void SetONNXGPUStream(OrtSessionOptions*, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 3e0f739418125..26ef569fe1b7c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -662,8 +662,9 @@ void GPUReconstructionCUDA::endGPUProfiling() } #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) { + cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); @@ -690,9 +691,10 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) { // Create ROCm provider options + cudaGetDevice(deviceId); const auto& api = Ort::GetApi(); OrtROCMProviderOptions rocm_options{}; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index b72b8264c4098..8194385444ade 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -83,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 66b50b781172a..4130990a7d1e2 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,7 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } - inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream) { mRec->SetONNXGPUStream(opt, 
stream); } + inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index a27bac308cdc6..3d5cb79711957 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -622,28 +622,45 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; + clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { + clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); } if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initClusterizer(nn_settings, clustererNN); - AllocateRegisteredMemory(clustererNN.mMemoryId); + clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initClusterizer(nn_settings, clustererNNShadow); + // if (doGPU) { + // std::vector pointerSizes = clustererNNShadow.pointerSizes(); + // // FIXME: These are for sure not needed. The arrays are empty at this point, only the space needs to be reserved. Is this already handeled by computePointerWithAlignment? + // // Once a GPU is available, everything should be done on the GPU for now. + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData32, clustererNN.inputData32, pointerSizes[0], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData16, clustererNN.inputData16, pointerSizes[1], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataClass, clustererNN.outputDataClass, pointerSizes[2], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.modelProbabilities, clustererNN.modelProbabilities, pointerSizes[3], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg1, clustererNN.outputDataReg1, pointerSizes[4], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg2, clustererNN.outputDataReg2, pointerSizes[5], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.peakPositions, clustererNN.peakPositions, pointerSizes[6], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.clusterFlags, clustererNN.clusterFlags, pointerSizes[7], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.centralCharges, clustererNN.centralCharges, pointerSizes[8], lane, true); + // } else { + // AllocateRegisteredMemory(clustererNNShadow.mMemoryId); + // } + AllocateRegisteredMemory(clustererNNShadow.mMemoryId); } } #endif @@ -917,41 +934,43 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[iSector] : clustererNN; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); - SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane); - SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane); - SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane); + GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); // FIXME: This needs to be the deviceID. If that is the lane, then this line is correct + int32_t deviceId = -1; + SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane, &deviceId); + SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane, &deviceId); + SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane, &deviceId); int withMC = (doGPU && propagateMCLabels); - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } float time_clusterizer = 0, time_fill = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } - if 
(!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -960,15 +979,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + if (clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clustererNN.nnClusterizerVerbosity < 3) { + if (clustererNNShadow.nnClusterizerVerbosity < 3) { int acceptedClusters = 0; for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNN.outputDataClass[i]; + acceptedClusters += clustererNNShadow.outputDataClass[i]; } LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index cc3f29434615f..f4e442a6d7fb4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -51,6 +51,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } +std::vector GPUTPCNNClusterizer::pointerSizes() { + std::vector sizes(9, -1); + if (nnClusterizerBatchedMode > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { + sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { + sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 + } + sizes[2] = nnClusterizerBatchedMode; // peakPositions + sizes[3] = 2 * nnClusterizerBatchedMode; // clusterFlags + sizes[4] = nnClusterizerBatchedMode; // centralCharges + if (nnClusterizerModelClassNumOutputNodes > 0) { + sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + sizes[6] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + sizes[7] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 + } + } + } + if (nnClusterizerTotalClusters > 0) { + sizes[8] = nnClusterizerTotalClusters; // outputDataClass + } + return sizes; +} + void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9e3a6572684..0457534b3f903 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -34,6 +34,7 @@ class GPUTPCNNClusterizer : public GPUProcessor void RegisterMemoryAllocation(); void InitializeProcessor(); void SetMaxData(const GPUTrackingInOutPointers&); + std::vector pointerSizes(); // Neural network clusterization diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5b9413ffbea32..00a1bc09e536a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -142,11 +142,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype, int32_t deviceId) { if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output); + model.inference(clustererNN.inputData16, size, output, deviceId); } else { - model.inference(clustererNN.inputData32, size, output); + model.inference(clustererNN.inputData32, size, output, deviceId); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 788186e13966d..ee0a5ea19d1dd 100644 --- 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -43,7 +43,7 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters From 007a4a16a984c54f365f0ec416bfdc4607971be4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 14:40:25 +0200 Subject: [PATCH 17/40] This runs, but will eventually fill up the VRAM. Need to include a mem clean --- Common/ML/CMakeLists.txt | 16 +-- Common/ML/include/ML/OrtInterface.h | 19 ++- Common/ML/src/OrtInterface.cxx | 124 +++++++----------- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Base/GPUReconstructionProcessing.h | 6 +- .../Base/cuda/GPUReconstructionCUDA.cu | 28 ++-- .../Base/cuda/GPUReconstructionCUDA.h | 2 +- GPU/GPUTracking/CMakeLists.txt | 21 ++- GPU/GPUTracking/Global/GPUChain.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 117 +++++++++++------ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 16 +-- .../GPUTPCNNClusterizerHost.cxx | 35 +++-- .../GPUTPCNNClusterizerHost.h | 6 +- .../GPUTPCNNClusterizerKernels.cxx | 71 +++++----- 14 files changed, 242 insertions(+), 223 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 74be306c8b6a5..5bfa05b716123 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -10,18 +10,10 @@ # or submit itself to any jurisdiction. 
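Note on the VRAM growth mentioned in the subject of this patch: one possible mitigation, sketched below and not part of the patch itself, is to bound the CUDA execution provider arena through the same provider-options mechanism already used to pass the user compute stream. The helper name capOrtGpuMemory and the byte limit are illustrative only; the option keys mirror the "gpu_mem_limit" and "arena_extend_strategy" strings quoted in the commented-out block of GPUReconstructionCUDA.cu in this series, and releasing the provider options right after the append follows the pattern used there.

#include <onnxruntime_cxx_api.h>
#include <string>

// Sketch (assumption, not part of this patch): cap the ORT CUDA execution
// provider arena so repeated inference batches cannot grow VRAM without limit.
void capOrtGpuMemory(Ort::SessionOptions& sessionOptions, size_t limitBytes)
{
  OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cudaOptions));
  const std::string limit = std::to_string(limitBytes);
  const char* keys[] = {"gpu_mem_limit", "arena_extend_strategy"};
  const char* values[] = {limit.c_str(), "kSameAsRequested"};
  Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(cudaOptions, keys, values, 2));
  sessionOptions.AppendExecutionProvider_CUDA_V2(*cudaOptions); // session options copy the configuration
  Ort::GetApi().ReleaseCUDAProviderOptions(cudaOptions);        // safe to release after the append
}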
# Pass ORT variables as a preprocessor definition -if(DEFINED ENV{ORT_ROCM_BUILD}) - add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) -endif() -if(DEFINED ENV{ORT_CUDA_BUILD}) - add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) -endif() -if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) - add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) -endif() -if(DEFINED ENV{ORT_TENSORRT_BUILD}) - add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) -endif() +add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) +add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) +add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) +add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) o2_add_library(ML SOURCES src/OrtInterface.cxx diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 5034899debb60..44c89b748f52c 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -43,12 +43,19 @@ class OrtModel public: // Constructor OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } - void init(std::unordered_map optionsMap) { reset(optionsMap); } - void reset(std::unordered_map); + OrtModel(std::unordered_map optionsMap) { + initOptions(optionsMap); + initEnvironment(); + } + void init(std::unordered_map optionsMap) { + initOptions(optionsMap); + initEnvironment(); + } + void initOptions(std::unordered_map optionsMap); + void initEnvironment(); bool isInitialized() { return mInitialized; } - Ort::SessionOptions* updateSessionOptions(); - Ort::MemoryInfo* updateMemoryInfo(); + Ort::SessionOptions& updateSessionOptions(); + void setIO(); virtual ~OrtModel() = default; @@ -91,7 +98,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, envName = "", device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 5d85e3194d07d..3100eb6dd2243 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,19 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -Ort::SessionOptions* OrtModel::updateSessionOptions() +Ort::SessionOptions& OrtModel::updateSessionOptions() { - return &(pImplOrt->sessionOptions); + return pImplOrt->sessionOptions; } -Ort::MemoryInfo* OrtModel::updateMemoryInfo() +void OrtModel::initOptions(std::unordered_map optionsMap) { - return &(pImplOrt->memoryInfo); -} - -void OrtModel::reset(std::unordered_map optionsMap) -{ - pImplOrt = new OrtVariables(); // Load from options map @@ -58,71 +52,57 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? 
std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - -// #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -// if (device == "ROCM") { -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); -// SetONNXGPUStream(pImplOrt->sessionOptions, deviceId); -// LOG(info) << "(ORT) ROCM execution provider set"; -// } -// #endif -// #if defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1 -// if (device == "MIGRAPHX") { -// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); -// LOG(info) << "(ORT) MIGraphX execution provider set"; -// } -// #endif -// #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -// if (device == "CUDA") { -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); -// SetONNXGPUStream(pImplOrt->sessionOptions, deviceId); -// LOG(info) << "(ORT) CUDA execution provider set"; -// dev_mem_str = "Cuda"; -// } -// #endif - - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); - if (intraOpNumThreads > 1 || interOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); + + if (device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } - } - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + // OrtROCMProviderOptions rocm_options{}; + // (pImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } + + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); } else { - (pImplOrt->sessionOptions).DisableProfiling(); + LOG(fatal) << "(ORT) Model path cannot be empty!"; } +} +void OrtModel::initEnvironment() +{ mInitialized = true; - - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + (envName.empty() ? 
"ORT" : envName.c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { if (severity == ORT_LOGGING_LEVEL_VERBOSE) { @@ -143,6 +123,10 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + setIO(); +} + +void OrtModel::setIO() { for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } @@ -162,7 +146,6 @@ void OrtModel::reset(std::unordered_map optionsMap) outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); - } if (loggingLevel < 2) { LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; } @@ -203,9 +186,6 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -213,8 +193,8 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; } #endif std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; @@ -241,11 +221,12 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceIndex) { + // std::vector providers = Ort::GetAvailableProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Available Execution Provider: " << provider; + // } #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -253,8 +234,8 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; } #endif std::vector 
inputShape{input_size, (int64_t)mInputShapes[0][1]}; @@ -268,7 +249,7 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*, int32_t); @@ -280,9 +261,6 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -290,8 +268,8 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID " << deviceIndex; } #endif std::vector inputTensor; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 6f2610c3c93c7..f41893e32b175 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -116,7 +116,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels #include -struct OrtSessionOptions; +namespace Ort { +struct SessionOptions; +} namespace o2::gpu { @@ -90,7 +92,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - virtual void SetONNXGPUStream(OrtSessionOptions*, int32_t, int32_t*) {} + virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 26ef569fe1b7c..959072222125e 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -662,12 +662,11 @@ void GPUReconstructionCUDA::endGPUProfiling() } #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); - OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", 
"cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; @@ -675,7 +674,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_option // this implicitly sets "has_user_compute_stream" UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); - Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(raw_options, cuda_options)); + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); @@ -691,20 +690,23 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { // Create ROCm provider options cudaGetDevice(deviceId); const auto& api = Ort::GetApi(); - OrtROCMProviderOptions rocm_options{}; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.user_compute_stream = &mInternals->Streams[stream]; - - // Get the raw OrtSessionOptions pointer from the Ort::SessionOptions wrapper - OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); - - // Append the ROCm execution provider with the custom HIP stream - Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(raw_options, &rocm_options)); + // api.GetCurrentGpuDeviceId(deviceId); + OrtROCMProviderOptions rocm_options; + LOG(info) << "Creating ROCm provider options"; + // rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + // LOG(info) << "Setting user compute stream"; + // rocm_options.user_compute_stream = &(mInternals->Streams[stream]); + // LOG(info) << "Stream is set with streamId " << stream << " and reference " << &(mInternals->Streams[stream]); + session_options.AppendExecutionProvider_ROCM(rocm_options); + LOG(info) << "Appending ROCm provider options"; + // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); + // api.ReleaseROCMProviderOptions(rocm_options); + LOG(info) << "Releasing ROCm provider options"; } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 8194385444ade..cb4540015ff76 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -83,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 0859502e59ef2..186d8ce4b0551 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -15,18 +15,15 @@ set(MODULE GPUTracking) # set(GPUCA_BUILD_DEBUG 1) # Pass ORT variables as a preprocessor definition -if(DEFINED ENV{ORT_ROCM_BUILD}) - add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) -endif() -if(DEFINED ENV{ORT_CUDA_BUILD}) - add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) -endif() -if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) - add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) 
-endif() -if(DEFINED ENV{ORT_TENSORRT_BUILD}) - add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) -endif() +add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) +add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) +add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) +add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) + +message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") +message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") +message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") +message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 4130990a7d1e2..59712c30a62dd 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,7 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } - inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } + inline void SetONNXGPUStream(Ort::SessionOptions& opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 3d5cb79711957..e3088d6143f9b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -623,44 +623,56 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; - clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); - clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNNShadow.nnClusterizerTotalClusters = maxClusters; - clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { - clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + + if (doGPU){ + clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { + clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initModels(); + nnApplication.initClusterizer(nn_settings, clustererNNShadow); } else { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + // not sure if this part is needed at all + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initModels(); + nnApplication.initClusterizer(nn_settings, clustererNN); } - clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initClusterizer(nn_settings, clustererNNShadow); - // if (doGPU) { - // std::vector pointerSizes = clustererNNShadow.pointerSizes(); - // // FIXME: These are for sure not needed. The arrays are empty at this point, only the space needs to be reserved. Is this already handeled by computePointerWithAlignment? - // // Once a GPU is available, everything should be done on the GPU for now. - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData32, clustererNN.inputData32, pointerSizes[0], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData16, clustererNN.inputData16, pointerSizes[1], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataClass, clustererNN.outputDataClass, pointerSizes[2], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.modelProbabilities, clustererNN.modelProbabilities, pointerSizes[3], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg1, clustererNN.outputDataReg1, pointerSizes[4], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg2, clustererNN.outputDataReg2, pointerSizes[5], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.peakPositions, clustererNN.peakPositions, pointerSizes[6], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.clusterFlags, clustererNN.clusterFlags, pointerSizes[7], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.centralCharges, clustererNN.centralCharges, pointerSizes[8], lane, true); - // } else { - // AllocateRegisteredMemory(clustererNNShadow.mMemoryId); - // } - AllocateRegisteredMemory(clustererNNShadow.mMemoryId); + AllocateRegisteredMemory(clustererNN.mMemoryId); } } #endif @@ -936,12 +948,30 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); // FIXME: This needs to be the deviceID. 
If that is the lane, then this line is correct + int32_t deviceId = -1; - SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane, &deviceId); - SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane, &deviceId); - SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane, &deviceId); + GPUTPCNNClusterizerHost nnApplication(nn_settings); + LOG(info) << "Allocating ONNX stream for lane " << lane << " and sector " << iSector; + if (nnApplication.modelsUsed[0]) { + SetONNXGPUStream((nnApplication.model_class).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_class).initEnvironment(); + } + if (nnApplication.modelsUsed[1]) { + SetONNXGPUStream((nnApplication.model_reg_1).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_reg_1).initEnvironment(); + } + if (nnApplication.modelsUsed[2]) { + SetONNXGPUStream((nnApplication.model_reg_2).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_reg_2).initEnvironment(); + } + int withMC = (doGPU && propagateMCLabels); + if (doGPU){ + // SetupGPUProcessor(&clustererNN, true); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer[lane] - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); + LOG(info) << "Successfully allocated for stream " << lane << " and sector " << iSector << " with memory size " << sizeof(clustererNN) << " and shadow size " << sizeof(clustererNNShadow); + } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -958,7 +988,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); + LOG(info) << "ONNX stream set. 
Device ID is " << deviceId << " for stream " << lane; + nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { @@ -966,10 +997,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index f4e442a6d7fb4..208e8c6428cb5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -30,9 +30,7 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); if (nnClusterizerModelClassNumOutputNodes > 0) { computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } @@ -52,30 +50,28 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } std::vector GPUTPCNNClusterizer::pointerSizes() { - std::vector sizes(9, -1); + std::vector sizes(7, -1); if (nnClusterizerBatchedMode > 0) { if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 } - sizes[2] = nnClusterizerBatchedMode; // peakPositions - sizes[3] = 2 * nnClusterizerBatchedMode; // clusterFlags - sizes[4] = nnClusterizerBatchedMode; // centralCharges + sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags if (nnClusterizerModelClassNumOutputNodes > 0) { - sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // 
modelProbabilities + sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities } if (!nnClusterizerUseCfRegression) { if (nnClusterizerModelReg1NumOutputNodes > 0) { - sizes[6] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 + sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 } if (nnClusterizerModelReg2NumOutputNodes > 0) { - sizes[7] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 + sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 } } } if (nnClusterizerTotalClusters > 0) { - sizes[8] = nnClusterizerTotalClusters; // outputDataClass + sizes[6] = nnClusterizerTotalClusters; // outputDataClass } return sizes; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 00a1bc09e536a..bd17d27edb3c4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,11 +22,6 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, int32_t deviceId) -{ - init(settings, deviceId); -} - void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; @@ -54,7 +49,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, int32_t deviceId) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; @@ -103,7 +98,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set OrtOptions = { {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(deviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, @@ -112,23 +106,40 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; - model_class.init(OrtOptions); + LOG(info) << "Model path: " << class_model_path; + model_class.initOptions(OrtOptions); + modelsUsed[0] = true; reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; - model_reg_2.init(OrtOptions); + model_reg_2.initOptions(OrtOptions); + modelsUsed[2] = true; } } } +void GPUTPCNNClusterizerHost::initModels() { + if (!model_class.isInitialized() && modelsUsed[0]) { + model_class.initEnvironment(); + } + if (!model_reg_1.isInitialized() && modelsUsed[1]) { 
+ model_reg_1.initEnvironment(); + } + if (!model_reg_2.isInitialized() && modelsUsed[2]) { + model_reg_2.initEnvironment(); + } +} + void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index ee0a5ea19d1dd..a383cbfd2bc7f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,16 +37,18 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, int32_t = 0); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } - void init(const GPUSettingsProcessingNNclusterizer&, int32_t = 0); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void initModels(); void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 std::vector reg_model_paths; private: diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 73051bd8477fd..ef75e1c1af19e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -56,20 +56,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - - clustererNN.peakPositions[glo_idx] = peak; - clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + #ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); #endif @@ -153,6 +148,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; @@ -168,34 +166,34 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + pc.setFull(central_charge * clustererNN.outputDataReg1[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1[model_output_index], clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1[model_output_index + 1], clustererNN.outputDataReg1[model_output_index + 3], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -204,11 +202,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -235,6 +233,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; @@ -247,18 +248,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -266,16 +267,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index], clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 2], clustererNN.outputDataReg2[model_output_index + 6], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -284,11 +285,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index + 1], clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 3], clustererNN.outputDataReg2[model_output_index + 7], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -317,11 +318,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? 
How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; From 4ef35fc1c25611d79c88ab40cd9b1ffacd4829f5 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 15:10:33 +0200 Subject: [PATCH 18/40] Found the stream allocation issue. Now starting optimizations --- Common/ML/include/ML/OrtInterface.h | 1 + Common/ML/src/OrtInterface.cxx | 5 +++++ GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 9 ++------- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 1 - 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 44c89b748f52c..47e98683c3800 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -55,6 +55,7 @@ class OrtModel void initEnvironment(); bool isInitialized() { return mInitialized; } Ort::SessionOptions& updateSessionOptions(); + Ort::MemoryInfo& updateMemoryInfo(); void setIO(); virtual ~OrtModel() = default; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 3100eb6dd2243..149f86d98eb0e 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -40,6 +40,11 @@ Ort::SessionOptions& OrtModel::updateSessionOptions() return pImplOrt->sessionOptions; } +Ort::MemoryInfo& OrtModel::updateMemoryInfo() +{ + return pImplOrt->memoryInfo; +} + void OrtModel::initOptions(std::unordered_map optionsMap) { pImplOrt = new OrtVariables(); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 959072222125e..844e754ee2f6c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -697,16 +697,11 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; - LOG(info) << "Creating ROCm provider options"; - // rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - // LOG(info) << "Setting user compute stream"; - // rocm_options.user_compute_stream = &(mInternals->Streams[stream]); - // LOG(info) << "Stream is set with streamId " << stream << " and reference " << &(mInternals->Streams[stream]); + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); - LOG(info) << "Appending ROCm provider options"; // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); // api.ReleaseROCMProviderOptions(rocm_options); - LOG(info) << "Releasing ROCm provider options"; } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e3088d6143f9b..2905601bd8f28 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -988,7 +988,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - LOG(info) 
<< "ONNX stream set. Device ID is " << deviceId << " for stream " << lane; nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels From 4faaa4a69e0f6cf53b65c09f973490f0089e0fa5 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 20:20:41 +0200 Subject: [PATCH 19/40] Improve readability and adapt for some comments --- Common/ML/include/ML/OrtInterface.h | 57 +++--- Common/ML/src/OrtInterface.cxx | 168 ++++++++---------- .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 4 +- GPU/GPUTracking/CMakeLists.txt | 25 +-- .../Global/GPUChainTrackingClusterizer.cxx | 122 +++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 47 ++--- .../GPUTPCNNClusterizerHost.h | 4 +- 9 files changed, 197 insertions(+), 233 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 47e98683c3800..56be450fb2ff1 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -41,24 +41,34 @@ class OrtModel { public: - // Constructor + // Constructors & destructors OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { - initOptions(optionsMap); - initEnvironment(); - } + OrtModel(std::unordered_map optionsMap) { init(optionsMap); } void init(std::unordered_map optionsMap) { initOptions(optionsMap); initEnvironment(); } + virtual ~OrtModel() = default; + + // General purpose void initOptions(std::unordered_map optionsMap); void initEnvironment(); + void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } - Ort::SessionOptions& updateSessionOptions(); - Ort::MemoryInfo& updateMemoryInfo(); - void setIO(); + void resetSession(); - virtual ~OrtModel() = default; + // Getters + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + Ort::SessionOptions& getSessionOptions(); + Ort::MemoryInfo& getMemoryInfo(); + + // Setters + void setDeviceId(int32_t id) { deviceId = id; } + void setIO(); + void setActiveThreads(int threads) { intraOpNumThreads = threads; } // Conversion template @@ -66,29 +76,16 @@ class OrtModel // Inferencing template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&, int32_t = -1); - - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&, int32_t = -1); - - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - void inference(I*, size_t, O*, int32_t = -1); - - // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); - - // Reset session - void resetSession(); + std::vector inference(std::vector&); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + template + std::vector inference(std::vector>&); - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + template + void inference(I*, size_t, O*); private: - // ORT variables -> need to be hidden as Pimpl + // ORT variables -> need to be hidden as pImpl struct OrtVariables; OrtVariables* pImplOrt; @@ -99,8 +96,8 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, envName = "", device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 149f86d98eb0e..49ca969c811df 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,16 +35,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -Ort::SessionOptions& OrtModel::updateSessionOptions() -{ - return pImplOrt->sessionOptions; -} - -Ort::MemoryInfo& OrtModel::updateMemoryInfo() -{ - return pImplOrt->memoryInfo; -} - +// General purpose void OrtModel::initOptions(std::unordered_map optionsMap) { pImplOrt = new OrtVariables(); @@ -56,7 +47,8 @@ void OrtModel::initOptions(std::unordered_map optionsM if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + deviceType = (optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); @@ -65,7 +57,7 @@ void OrtModel::initOptions(std::unordered_map optionsM enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); - if (device == "CPU") { + if (deviceType == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); if (intraOpNumThreads > 1 || interOpNumThreads > 1) { @@ -97,6 +89,8 @@ void OrtModel::initOptions(std::unordered_map optionsM (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + mInitialized = true; } else { LOG(fatal) << "(ORT) Model path cannot be empty!"; } @@ -104,7 +98,9 @@ void OrtModel::initOptions(std::unordered_map optionsM void OrtModel::initEnvironment() { - mInitialized = true; + if(allocateDeviceMemory) { + memoryOnDevice(deviceId); + } pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), (envName.empty() ? "ORT" : envName.c_str()), @@ -128,32 +124,30 @@ void OrtModel::initEnvironment() (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + } + setIO(); } -void OrtModel::setIO() { - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! 
(input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; +void OrtModel::memoryOnDevice(int32_t deviceIndex) +{ +#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) + if (deviceIndex >= 0) { + std::string dev_mem_str = ""; + if (deviceType == "ROCM") { + dev_mem_str = "Hip"; + } + if (deviceType == "CUDA") { + dev_mem_str = "Cuda"; + } + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex; + } } +#endif } void OrtModel::resetSession() @@ -161,6 +155,17 @@ void OrtModel::resetSession() pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } +// Getters +Ort::SessionOptions& OrtModel::getSessionOptions() +{ + return pImplOrt->sessionOptions; +} + +Ort::MemoryInfo& OrtModel::getMemoryInfo() +{ + return pImplOrt->memoryInfo; +} + template std::vector OrtModel::v2v(std::vector& input, bool clearInput) { @@ -176,32 +181,32 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -std::string OrtModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) { - ss << v[i] << "x"; +void OrtModel::setIO() { + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } - ss << v[v.size() - 1]; - return ss.str(); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); } +// Inference template -std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) +std::vector OrtModel::inference(std::vector& input) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; - } -#endif std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; 
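// [editorial sketch - not part of the patch] The if-constexpr branch that follows picks the
// ONNX Runtime tensor element type from the template parameter I: OrtDataType::Float16_t
// buffers are handed to ORT as fp16 tensors, plain float buffers as fp32. A hypothetical
// caller-side use of this templated API (names illustrative only) would be:
//   std::vector<OrtDataType::Float16_t> in(nCandidates * nInputFeatures);
//   std::vector<float> probs = model.inference<OrtDataType::Float16_t, float>(in);
// Note that input.size() must be a multiple of mInputShapes[0][1], since the batch size in
// inputShape above is derived as input.size() / mInputShapes[0][1].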
std::vector inputTensor; if constexpr (std::is_same_v) { @@ -217,32 +222,19 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) return outputValuesVec; } -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceIndex) +void OrtModel::inference(I* input, size_t input_size, O* output) { // std::vector providers = Ort::GetAvailableProviders(); // for (const auto& provider : providers) { // LOG(info) << "Available Execution Provider: " << provider; // } -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; - } -#endif std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { @@ -257,26 +249,13 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*, int32_t); +template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); -template void OrtModel::inference(float*, size_t, float*, int32_t); +template void OrtModel::inference(float*, size_t, float*); template -std::vector OrtModel::inference(std::vector>& input, int32_t deviceIndex) +std::vector OrtModel::inference(std::vector>& input) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID " << deviceIndex; - } -#endif std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; @@ -294,6 +273,17 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d return outputValuesVec; } +// private +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) { + ss << v[i] << "x"; + } + ss << v[v.size() - 1]; + return ss.str(); +} + } // namespace ml } // namespace o2 diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 
353c4bd76abb9..0e826b8794983 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -92,7 +92,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} + // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 844e754ee2f6c..4c3dc12d04568 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -673,7 +673,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); // this implicitly sets "has_user_compute_stream" - UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); session_options.AppendExecutionProvider_CUDA_V2(cuda_options); // Finally, don't forget to release the provider options @@ -694,7 +694,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options { // Create ROCm provider options cudaGetDevice(deviceId); - const auto& api = Ort::GetApi(); + // const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 186d8ce4b0551..e0f5e3bc37c8f 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -14,17 +14,6 @@ set(MODULE GPUTracking) # set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0") # to uncomment if needed, tired of typing this... 
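# [editorial sketch - not part of the patch] Further down in this hunk, the globally scoped
# add_compile_definitions(ORT_*_BUILD=...) calls are removed and the ORT backend flags are
# instead attached only to the O2 GPUTracking target, conceptually along the lines of
# (the generator expression shown here is illustrative, not the patch's exact text):
#   target_compile_definitions(${targetName} PRIVATE ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>)
# which keeps the ONNX-runtime build flags out of unrelated targets in the same directory scope.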
# set(GPUCA_BUILD_DEBUG 1) -# Pass ORT variables as a preprocessor definition -add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) -add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) -add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) -add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) - -message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") -message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") -message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") -message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) @@ -345,7 +334,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + message("Compile definitions for ONNX runtime:") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + + target_compile_definitions(${targetName} PRIVATE + GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 2905601bd8f28..67ed38ee04aa8 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -613,64 +613,49 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only + GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; + if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; - nnApplication.init(nn_settings); - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; + for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + nnApplications[lane].init(nn_settings); + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + + int32_t deviceId = -1; + if (clustererNNShadow.nnClusterizerVerbosity < 3) { + LOG(info) << "Allocating ONNX stream for lane " << lane << " and lane " << lane; + } + if (nnApplications[lane].modelsUsed[0]) { + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_class).setDeviceId(deviceId); + (nnApplications[lane].model_class).initEnvironment(); + } + if (nnApplications[lane].modelsUsed[1]) { + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + (nnApplications[lane].model_reg_1).initEnvironment(); + } + if (nnApplications[lane].modelsUsed[2]) { + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + (nnApplications[lane].model_reg_2).initEnvironment(); + } if (doGPU){ - clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.deviceId = deviceId; + clustererNNShadow.mISector = lane; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; - clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { - clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; - } - clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initModels(); - nnApplication.initClusterizer(nn_settings, clustererNNShadow); + nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } else { - // not sure if this part is needed at all - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + 
(nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + // TODO: not sure if this part is needed at all + clustererNN.deviceId = deviceId; + clustererNN.mISector = lane; clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; - } - clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initModels(); - nnApplication.initClusterizer(nn_settings, clustererNN); + nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -945,32 +930,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - - int32_t deviceId = -1; - GPUTPCNNClusterizerHost nnApplication(nn_settings); - LOG(info) << "Allocating ONNX stream for lane " << lane << " and sector " << iSector; - if (nnApplication.modelsUsed[0]) { - SetONNXGPUStream((nnApplication.model_class).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_class).initEnvironment(); - } - if (nnApplication.modelsUsed[1]) { - SetONNXGPUStream((nnApplication.model_reg_1).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_reg_1).initEnvironment(); - } - if (nnApplication.modelsUsed[2]) { - SetONNXGPUStream((nnApplication.model_reg_2).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_reg_2).initEnvironment(); - } + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; int withMC = (doGPU && propagateMCLabels); if (doGPU){ // SetupGPUProcessor(&clustererNN, true); - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer[lane] - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - LOG(info) << "Successfully allocated for stream " << lane << " and sector " << iSector << " with memory size " << sizeof(clustererNN) << " and shadow size " << sizeof(clustererNNShadow); } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { @@ -988,7 +956,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { @@ -996,10 +964,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } @@ -1008,12 +976,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNNShadow.nnClusterizerUseCfRegression) { + auto start1 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: 
batchStart = 0 + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if (clustererNNShadow.nnClusterizerVerbosity < 3) { int acceptedClusters = 0; for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0457534b3f903..9bf89e6337c6e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -57,6 +57,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelReg2NumOutputNodes = -1; int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; + int deviceId = -1; // Memory allocation for neural network float* inputData32 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index bd17d27edb3c4..0a2c35a6f6623 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -97,7 +97,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set OrtOptions = { {"model-path", class_model_path}, - {"device", settings.nnInferenceDevice}, + {"device-type", settings.nnInferenceDevice}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, @@ -128,36 +128,43 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set } } -void GPUTPCNNClusterizerHost::initModels() { - if (!model_class.isInitialized() && modelsUsed[0]) { - model_class.initEnvironment(); - } - if (!model_reg_1.isInitialized() && modelsUsed[1]) { - model_reg_1.initEnvironment(); +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) +{ + clustererNN.nnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClassThreshold = settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); } - if (!model_reg_2.isInitialized() && modelsUsed[2]) { - model_reg_2.initEnvironment(); + if (settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; } -} - -void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) -{ - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; - clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype, int32_t deviceId) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output, deviceId); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clustererNN.inputData32, size, output, deviceId); + model.inference(clustererNN.inputData32, size, output); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a383cbfd2bc7f..2c0e704595933 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,14 +41,14 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void initModels(); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); + void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + int32_t deviceId = -1; std::vector reg_model_paths; private: From 2801c2e4a73e2cf85ff88cf51d77816210d73c10 Mon Sep 17 00:00:00 2001 
From: Christian Sonnabend Date: Wed, 2 Apr 2025 11:31:55 +0200 Subject: [PATCH 20/40] Fixing memory assignment issue. Reconstruction runs through with FP32 networks --- .../Global/GPUChainTrackingClusterizer.cxx | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 67ed38ee04aa8..532ea169cd006 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -617,18 +617,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; - for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + int32_t deviceId = -1; + int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + for (uint32_t lane = 0; lane < NSECTORS; lane++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } - for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; - int32_t deviceId = -1; - if (clustererNNShadow.nnClusterizerVerbosity < 3) { - LOG(info) << "Allocating ONNX stream for lane " << lane << " and lane " << lane; - } if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); @@ -644,21 +642,32 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_2).setDeviceId(deviceId); (nnApplications[lane].model_reg_2).initEnvironment(); } - + if (clustererNNShadow.nnClusterizerVerbosity < 3) { + LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; + } + }); + mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; + int32_t lane = sector % numLanes; if (doGPU){ clustererNNShadow.deviceId = deviceId; - clustererNNShadow.mISector = lane; + clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } else { // TODO: not sure if this part is needed at all clustererNN.deviceId = deviceId; - clustererNN.mISector = lane; + clustererNN.mISector = sector; clustererNN.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); - } + if (doGPU){ + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); + } + }); } #endif @@ -934,12 +943,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; + LOG(info) << "clustererNNShadow.inputData32: " << clustererNNShadow.inputData32; + LOG(info) << "clustererShadow.mPclusterInRow: " << clustererShadow.mPclusterInRow; + int withMC = (doGPU && propagateMCLabels); - if (doGPU){ - // SetupGPUProcessor(&clustererNN, true); - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); - TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 1dcb1daf2e670620b43ddb27ddc19efef2c0f5f1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 2 Apr 2025 15:39:58 +0200 Subject: [PATCH 21/40] Major reworkings to add FP16 support --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 2 +- Common/ML/src/OrtInterface.cxx | 11 +- .../Global/GPUChainTrackingClusterizer.cxx | 58 +++++++-- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 35 ++++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 25 ++-- .../GPUTPCNNClusterizerHost.cxx | 10 +- .../GPUTPCNNClusterizerHost.h | 2 - .../GPUTPCNNClusterizerKernels.cxx | 112 ++++++++++++------ 8 files changed, 177 insertions(+), 78 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 76fd6734cf9db..9516ba5dad573 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType } // namespace o2 -#endif \ No newline at end of file +#endif diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 49ca969c811df..1a729a97c6952 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -244,13 +244,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } std::vector outputShape{input_size, mOutputShapes[0][1]}; - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } else { + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } +template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); + template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); +template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); + template void OrtModel::inference(float*, size_t, float*); template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 532ea169cd006..d9bc4ac30190b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -943,9 +943,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; - LOG(info) << "clustererNNShadow.inputData32: " << clustererNNShadow.inputData32; - LOG(info) << "clustererShadow.mPclusterInRow: " << clustererShadow.mPclusterInRow; - int withMC = (doGPU && propagateMCLabels); if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { @@ -963,19 +960,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); + + // nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); + } + } if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } - if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 + // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + 
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32); + } + } + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 + // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); + } + } + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 208e8c6428cb5..4ae5e0d9b49a7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -26,20 +26,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); + computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * 
nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + + if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_16, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } } } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 9bf89e6337c6e..70c9e9c20d18b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -56,20 +56,27 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceOutputDType = 0; // 0: float16, 1: float32 int mISector = -1; int deviceId = -1; // Memory allocation for neural network - float* inputData32 = nullptr; - OrtDataType::Float16_t* inputData16 = nullptr; - float* outputDataClass = nullptr; - float* modelProbabilities = nullptr; - float* outputDataReg1 = nullptr; - float* outputDataReg2 = nullptr; - ChargePos* peakPositions = nullptr; - bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int* outputDataClass = nullptr; + + // FP32 + float* inputData_32 = nullptr; + float* modelProbabilities_32 = nullptr; + float* outputDataReg1_32 = nullptr; + float* outputDataReg2_32 = nullptr; + + // FP16 + OrtDataType::Float16_t* inputData_16 = nullptr; + OrtDataType::Float16_t* modelProbabilities_16 = nullptr; + OrtDataType::Float16_t* outputDataReg1_16 = nullptr; + OrtDataType::Float16_t* outputDataReg2_16 = nullptr; + int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 0a2c35a6f6623..4372fea7ed9e5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -149,6 +149,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; } clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { @@ -159,12 +160,3 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } } - -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) -{ - if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output); - } else { - model.inference(clustererNN.inputData32, size, output); - } -} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 2c0e704595933..87532deff9917 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -43,8 +43,6 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t); - std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ef75e1c1af19e..70d605ac72fc7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -81,16 +81,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[tmp_pos].unpack()) / central_charge); - } else { - clustererNN.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - 
clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } write_idx++; @@ -99,13 +99,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; - clustererNN.inputData32[write_idx + 1] = row / 152.f; - clustererNN.inputData32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.inputData_32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData_32[write_idx + 1] = row / 152.f; + clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } } @@ -114,7 +114,11 @@ template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + if (dtype == 0) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].modelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].nnClassThreshold); + } else if (dtype == 1) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + } } template <> @@ -127,9 +131,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.modelProbabilities_16[pIdx]); + } else if (dtype == 1) { + current_max_prob = clustererNN.modelProbabilities_32[pIdx]; + } } else { - class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + if (dtype == 0) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_16[pIdx].ToFloat()); + } else if (dtype == 1) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_32[pIdx]); + } } } // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" @@ -184,13 +196,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg1[model_output_index], - clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1[model_output_index + 1], - clustererNN.outputDataReg1[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg1_16[model_output_index + 4].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); @@ -267,13 +289,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2[model_output_index], - clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 2], - clustererNN.outputDataReg2[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 8].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); @@ -302,13 +334,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 9], 
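// Note for readers (not part of the patch): the per-peak layout of the class-2 regression
// output implied by the indices used in these setFull(...) calls appears to be ten values,
// interleaved for the two split clusters:
//   [0]/[1] pad offset cluster 1/2, [2]/[3] time offset cluster 1/2,
//   [4]/[5] sigma_pad cluster 1/2,  [6]/[7] sigma_time cluster 1/2,
//   [8]/[9] charge scale cluster 1/2 (multiplied onto central_charge).
// This reading is inferred from the code above and below; the network output contract itself
// is not documented in this patch series.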
- static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index + 1], - clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 3], - clustererNN.outputDataReg2[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 9].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { From 381955a57f5ce34f3318eb974263f9d0a7e6f7c3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 15:17:13 +0200 Subject: [PATCH 22/40] Bug-fixes --- Common/ML/src/OrtInterface.cxx | 4 +- .../Global/GPUChainTrackingClusterizer.cxx | 40 ++++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 52 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 -- .../GPUTPCNNClusterizerHost.cxx | 7 +-- 5 files changed, 49 insertions(+), 57 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 1a729a97c6952..4e41fec0c8ca9 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -246,9 +246,9 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } else { - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index d9bc4ac30190b..e313f30c07656 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -611,6 +611,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } + if (doGPU && mIOPtrs.tpcZS) { + processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + } + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } + #ifdef GPUCA_HAS_ONNX const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; @@ -624,9 +632,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; - if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); @@ -642,7 +647,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_2).setDeviceId(deviceId); (nnApplications[lane].model_reg_2).initEnvironment(); } - if (clustererNNShadow.nnClusterizerVerbosity < 3) { + if (nn_settings.nnClusterizerVerbosity < 3) { LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; } }); @@ -650,35 +655,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[sector] : clustererNN; int32_t lane = sector % numLanes; + clustererNN.deviceId = deviceId; + clustererNN.mISector = sector; + clustererNN.nnClusterizerTotalClusters = maxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNN); if (doGPU){ clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); - } else { - // TODO: not sure if this part is needed at all - clustererNN.deviceId = deviceId; - clustererNN.mISector = sector; - clustererNN.nnClusterizerTotalClusters = maxClusters; - nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); - if (doGPU){ - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); - TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - } }); + if (doGPU){ + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer)*NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } } #endif - if (doGPU && mIOPtrs.tpcZS) { - processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); - } - if (doGPU) { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); - } - size_t nClsTotal = 0; ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); ClusterNative* tmpNativeClusters = nullptr; @@ -961,7 +955,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - // nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); @@ -975,6 +968,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); } } + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 4ae5e0d9b49a7..fe3202fe7b439 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -64,32 +64,32 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } -std::vector GPUTPCNNClusterizer::pointerSizes() { - std::vector sizes(7, -1); - if (nnClusterizerBatchedMode > 0) { - if (nnInferenceInputDType == 0 && 
nnClusterizerElementSize > 0) { - sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 - } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { - sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 - } - sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags - if (nnClusterizerModelClassNumOutputNodes > 0) { - sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 - } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 - } - } - } - if (nnClusterizerTotalClusters > 0) { - sizes[6] = nnClusterizerTotalClusters; // outputDataClass - } - return sizes; -} +// std::vector GPUTPCNNClusterizer::pointerSizes() { +// std::vector sizes(7, -1); +// if (nnClusterizerBatchedMode > 0) { +// if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { +// sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 +// } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { +// sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 +// } +// sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags +// if (nnClusterizerModelClassNumOutputNodes > 0) { +// sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities +// } +// if (!nnClusterizerUseCfRegression) { +// if (nnClusterizerModelReg1NumOutputNodes > 0) { +// sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 +// } +// if (nnClusterizerModelReg2NumOutputNodes > 0) { +// sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 +// } +// } +// } +// if (nnClusterizerTotalClusters > 0) { +// sizes[6] = nnClusterizerTotalClusters; // outputDataClass +// } +// return sizes; +// } void GPUTPCNNClusterizer::RegisterMemoryAllocation() { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 70c9e9c20d18b..e9b2061bea36a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -34,7 +34,6 @@ class GPUTPCNNClusterizer : public GPUProcessor void RegisterMemoryAllocation(); void InitializeProcessor(); void SetMaxData(const GPUTrackingInOutPointers&); - std::vector pointerSizes(); // Neural network clusterization @@ -50,8 +49,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerTotalClusters = 1; int nnClusterizerVerbosity = 0; int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 4372fea7ed9e5..29fdaada06855 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -138,10 +138,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust 
clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClassThreshold = settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + clustererNN.nnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); + } else { + clustererNN.nnClassThreshold = settings.nnClassThreshold; } if (settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; @@ -152,7 +153,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; From 19b5bd596ce191f177884cbe880029c8f9fe65a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 21:12:02 +0200 Subject: [PATCH 23/40] Improved data filling speeds by factor 3 --- .../Definitions/GPUDefParametersDefault.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 4 +- .../GPUTPCNNClusterizerKernels.cxx | 74 ++++++++++++++++++- .../GPUTPCNNClusterizerKernels.h | 9 ++- GPU/GPUTracking/kernels.cmake | 1 + 5 files changed, 80 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h index 4435e69c60ff6..8249bbb70832d 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h @@ -500,6 +500,7 @@ #ifdef GPUCA_HAS_ONNX #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e313f30c07656..0077cc3cda9aa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -950,9 +950,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = 
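// Note for readers (not part of the patch): on the nnSigmoidTrafoClassThreshold handling in
// initClusterizer above, the configured probability cut p is mapped to its logit
// log(p / (1 - p)), so the kernels can compare the network's raw (pre-sigmoid) output against
// the transformed threshold instead of evaluating a sigmoid per cluster candidate.
// Small worked example: p = 0.5 maps to 0.0 and p = 0.9 maps to log(9), roughly 2.197; a raw
// output of 2.5 would then pass the p = 0.9 cut, while a raw output of 1.0 would not.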
CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - + runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNNShadow.nnInferenceInputDType == 0) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 70d605ac72fc7..202860733b839 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -77,7 +77,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData_32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData_32[write_idx] = sector / 36.f; clustererNN.inputData_32[write_idx + 1] = row / 152.f; clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } } +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint base_idx = CAMath::Floor(glo_idx / clustererNN.nnClusterizerElementSize); + uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize; + + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()); + + if (clustererNN.nnClusterizerAddIndexData && transient_index == 0) { + uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + ChargePos tmp_pos = peak.delta(d); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + } else { + clustererNN.inputData_32[top_idx - 3] = sector / 36.f; + clustererNN.inputData_32[top_idx - 2] = row / 152.f; + clustererNN.inputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } else { + int time = static_cast(peak.time()); + int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + if (is_row_boundary) { + if (dtype == 0) { + 
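// Note for readers (not part of the patch): index bookkeeping of fillInputNNSingleElement.
// One GPU thread now fills a single element of one input vector instead of a whole vector,
// i.e. glo_idx runs over iSize * nnClusterizerElementSize threads, with
//   base_idx        = glo_idx / nnClusterizerElementSize  -> which peak,
//   transient_index = glo_idx % nnClusterizerElementSize  -> which entry of that peak's input.
// The charge window is linearised in (row, pad, time) order. For half-window sizes
// (R, P, T) = (1, 1, 1), the window holds 27 charges (+3 index entries if
// nnClusterizerAddIndexData), and e.g. transient_index = 16 decodes to
// r = 16/9 - 1 = 0, p = (16%9)/3 - 1 = +1, t = (16%9)%3 - 1 = 0.
// The (1, 1, 1) values are only an illustration, not the configured defaults.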
clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } else { + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1)); + int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset; + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); + + if (!is_boundary) { + float central_charge = static_cast(chargeMap[peak].unpack()); + int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; + ChargePos tmp_pos(row + r, pad + p, time + t); + if (dtype == 0) { + clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + if (dtype == 0) { + clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } + } + } +} + template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index e6c1dc508d6e4..0e15913dcee0c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -53,10 +53,11 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate enum K : int32_t { runCfClusterizer = 0, fillInputNN = 1, - determineClass1Labels = 2, - determineClass2Labels = 3, - publishClass1Regression = 4, - publishClass2Regression = 5, + fillInputNNSingleElement = 2, + determineClass1Labels = 3, + determineClass2Labels = 4, + publishClass1Regression = 5, + publishClass2Regression = 6, }; template diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 994f10a516b10..3e435fe7e74bc 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -116,6 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUS if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) 
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) From 83d02579b0f1804dc1084a3c51c07e2fc85a3fcc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 21:49:53 +0200 Subject: [PATCH 24/40] Limiting threads for ONNX evaluation --- Common/ML/CMakeLists.txt | 13 +++++++------ Common/ML/include/ML/OrtInterface.h | 4 ++++ .../Global/GPUChainTrackingClusterizer.cxx | 10 ++++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 5bfa05b716123..7e2107651cf10 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -9,13 +9,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. -# Pass ORT variables as a preprocessor definition -add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) -add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) -add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) -add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) - o2_add_library(ML SOURCES src/OrtInterface.cxx TARGETVARNAME targetName PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) + +# Pass ORT variables as a preprocessor definition +target_compile_definitions(${targetName} PRIVATE + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 56be450fb2ff1..bd81ecca109c9 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -64,11 +64,15 @@ class OrtModel std::vector getOutputNames() const { return mOutputNames; } Ort::SessionOptions& getSessionOptions(); Ort::MemoryInfo& getMemoryInfo(); + int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } + int32_t getInterOpNumThreads() const { return interOpNumThreads; } // Setters void setDeviceId(int32_t id) { deviceId = id; } void setIO(); void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setIntraOpNumThreads(int threads) { if(deviceType == "CPU") { intraOpNumThreads = threads; } } + void setInterOpNumThreads(int threads) { if(deviceType == "CPU") { interOpNumThreads = threads; } } // Conversion template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0077cc3cda9aa..621952b1ac654 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -627,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) uint32_t maxClusters = 0; int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; for (uint32_t lane = 0; lane < NSECTORS; lane++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } @@ -635,16 +636,25 
@@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); + if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_reg_2).initEnvironment(); } if (nn_settings.nnClusterizerVerbosity < 3) { From fff6dc39c42f72afeed643ffe11ec1e1a55052f4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 23:42:20 +0200 Subject: [PATCH 25/40] Bug-fix for correct thread assignment and input data filling --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 202860733b839..d1be1d00027e2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -124,7 +124,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.row()), pad = static_cast(peak.pad()); - if (clustererNN.nnClusterizerAddIndexData && transient_index == 0) { + if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; @@ -141,7 +141,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); } - } else { + } else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) { int time = static_cast(peak.time()); int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); From b437e38aa18b46c54432a9560002abc98eb22869 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 4 Apr 2025 08:59:57 +0200 Subject: [PATCH 26/40] Minor changes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 621952b1ac654..b03a27867f8fa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -624,13 +624,9 @@ int32_t 
GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = 0; int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; - for (uint32_t lane = 0; lane < NSECTORS; lane++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); - } mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { @@ -667,12 +663,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int32_t lane = sector % numLanes; clustererNN.deviceId = deviceId; clustererNN.mISector = sector; - clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); if (doGPU){ clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; - clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); @@ -1034,7 +1030,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { acceptedClusters += clustererNNShadow.outputDataClass[i]; } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); From 710993a7df62a37e0d73c2ec484caa872a39aa76 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 5 Apr 2025 13:55:40 +0200 Subject: [PATCH 27/40] Adding I** inference, potentally needed for CNN + FC inference --- Common/ML/include/ML/OrtInterface.h | 8 +- Common/ML/src/OrtInterface.cxx | 173 ++++++++++++++++++++++++---- 2 files changed, 158 insertions(+), 23 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index bd81ecca109c9..d496de866da7f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -88,6 +88,9 @@ class OrtModel template void inference(I*, size_t, O*); + template + void inference(I**, size_t, O*); + private: // ORT variables -> need to be hidden as pImpl struct OrtVariables; @@ -96,7 +99,9 @@ class OrtModel // Input & Output specifications of the loaded network std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; + std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; @@ -104,6 +109,7 @@ class OrtModel int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); + std::string printShape(const std::vector>&, std::vector&); }; } // namespace ml diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 4e41fec0c8ca9..e525fc1d2709f 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -124,11 +124,11 @@ void OrtModel::initEnvironment() (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + setIO(); + if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + LOG(info) << "(ORT) Model loaded successfully! 
(inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; } - - setIO(); } void OrtModel::memoryOnDevice(int32_t deviceIndex) @@ -201,13 +201,45 @@ void OrtModel::setIO() { outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); + + inputShapesCopy = mInputShapes; + outputShapesCopy = mOutputShapes; + inputSizePerNode.resize(mInputShapes.size(), 1); + outputSizePerNode.resize(mOutputShapes.size(), 1); + mInputsTotal = 1; + for (size_t i = 0; i < mInputShapes.size(); ++i) { + if(mInputShapes[i].size() > 0) { + for (size_t j = 1; j < mInputShapes[i].size(); ++j) { + if (mInputShapes[i][j] > 0) { + mInputsTotal *= mInputShapes[i][j]; + inputSizePerNode[i] *= mInputShapes[i][j]; + } + } + } + } + mOutputsTotal = 1; + for (size_t i = 0; i < mOutputShapes.size(); ++i) { + if(mOutputShapes[i].size() > 0) { + for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { + if (mOutputShapes[i][j] > 0) { + mOutputsTotal *= mOutputShapes[i][j]; + outputSizePerNode[i] *= mOutputShapes[i][j]; + } + } + } + } } // Inference template std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape = mInputShapes[0]; + inputShape[0] = input.size(); + for (size_t i = 1; i < mInputShapes[0].size(); ++i) + { + inputShape[0] /= mInputShapes[0][i]; + } std::vector inputTensor; if constexpr (std::is_same_v) { inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -223,9 +255,7 @@ std::vector OrtModel::inference(std::vector& input) } template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); template @@ -255,33 +285,119 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); - template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); - template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); - template void OrtModel::inference(float*, size_t, float*); template -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; +void OrtModel::inference(I** input, size_t input_size, O* output) { + std::vector inputTensors(inputShapesCopy.size()); + + for (size_t i = 0; i < inputShapesCopy.size(); ++i) { + + inputShapesCopy[i][0] = input_size; // batch-size + outputShapesCopy[i][0] = input_size; // batch-size + if constexpr (std::is_same_v) { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i]*input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } else { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + input[i], + 
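// Note for readers (not part of the patch): the new I** overload binds one host buffer per
// input node and a single buffer for the first output node; dimension 0 of every copied shape
// is overwritten with the batch size passed as input_size, and the output tensor size assumes
// exactly one output node. A hypothetical call for a two-input, one-output model (the buffer
// names below are illustrative only, not existing variables):
//   std::vector<float*> nodeBuffers{denseFeatures, cnnPatch}; // one pointer per input node
//   model.inference<float, float>(nodeBuffers.data(), batchSize, outputBuffer);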
inputSizePerNode[i]*input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(output), + outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } else { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + output, + outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } + + // === Run inference === + pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + inputTensors.data(), + inputNamesChar.size(), + outputNamesChar.data(), + &outputTensor, + outputNamesChar.size() + ); +} + +template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, size_t, float*); +template void OrtModel::inference(float**, size_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, size_t, float*); + +template +std::vector OrtModel::inference(std::vector>& inputs) +{ + std::vector input_tensors; + + for (size_t i = 0; i < inputs.size(); ++i) { + + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size + + if constexpr (std::is_same_v) { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i]*inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } else { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i]*inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } + } + + int32_t totalOutputSize = mOutputsTotal*inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); + + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; } +template std::vector OrtModel::inference(std::vector>&); +template std::vector OrtModel::inference(std::vector>&); + // private std::string OrtModel::printShape(const std::vector& v) { @@ -293,6 +409,19 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } +std::string OrtModel::printShape(const std::vector>& v, std::vector& n) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size(); i++) { + ss << n[i] << " -> ("; + for (size_t j = 0; j < v[i].size() - 1; j++) { + ss << v[i][j] << "x"; + } + ss << v[i][v[i].size() - 1] << "); "; + } + return ss.str(); +} + } // namespace ml } // namespace o2 From 
77c1691202239c3f0a60e7dd930aa8cacd0dc760 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 7 Apr 2025 21:04:23 +0200 Subject: [PATCH 28/40] CCDB fetching of NNs ported to GPUWorkflowSpec --- Detectors/TPC/calibration/CMakeLists.txt | 2 + .../TPCCalibration/NeuralNetworkClusterizer.h | 39 ++++++++++ .../src/NeuralNetworkClusterizer.cxx | 47 ++++++++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 75 +++---------------- .../GPUTPCNNClusterizerHost.h | 4 - .../include/GPUWorkflow/GPUWorkflowSpec.h | 3 + GPU/Workflow/src/GPUWorkflowSpec.cxx | 45 +++++++++++ 8 files changed, 148 insertions(+), 70 deletions(-) create mode 100644 Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h create mode 100644 Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 0ec62e5f323b3..7722fc4e2884f 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,6 +25,7 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx + src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -82,6 +83,7 @@ o2_target_root_dictionary(TPCCalibration include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h + include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h new file mode 100644 index 0000000000000..e4fcfa56df438 --- /dev/null +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -0,0 +1,39 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.h +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h +#define AliceO2_TPC_NeuralNetworkClusterizer_h + +#include "CCDB/CcdbApi.h" + +namespace o2::tpc +{ + +class NeuralNetworkClusterizer +{ + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; + +}; + +} // namespace o2::tpc +#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx new file mode 100644 index 0000000000000..8a2e739b772fb --- /dev/null +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -0,0 +1,47 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.cxx +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#include +#include "TPCCalibration/NeuralNetworkClusterizer.h" + +using namespace o2::tpc; + +void NeuralNetworkClusterizer::initCcdbApi(std::string url) { + ccdbApi.init(url); +} + +void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) +{ + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 78c77c37dd511..a978e657e76b7 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -268,11 +268,12 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. 
c1:r1 (classification class 1, regression class 1)") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") -AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 29fdaada06855..5125d7a3fd364 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,82 +16,27 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" -#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" using namespace o2::gpu; -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) -{ - o2::ccdb::CcdbApi ccdbApi; - ccdbApi.init(settings["nnCCDBURL"]); - - metadata["inputDType"] = settings["inputDType"]; - metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN - if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; - } - if (settings["nnCCDBBeamType"] != "") { - metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; - } - - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables - - if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; - } else { - LOG(error) << "Failed to retrieve network from CCDB"; - } -} - void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - - if (settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", settings.nnCCDBURL}, - {"nnCCDBPath", settings.nnCCDBPath}, - {"inputDType", settings.nnInferenceInputDType}, - {"outputDType", settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; - - 
std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - loadFromCCDB(networkRetrieval); + std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); + + if(settings.nnLoadFromCCDB) { + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + if (evalMode[0] == "c1") { + class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; + } else if (evalMode[0] == "c2") { + class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx"; } - class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - loadFromCCDB(networkRetrieval); - reg_model_path = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - loadFromCCDB(networkRetrieval); - reg_model_path += ":", networkRetrieval["outputFile"]; + if (evalMode[1] == "r2") { + reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx"; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 87532deff9917..8001ecc96fcfd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -48,10 +48,6 @@ class GPUTPCNNClusterizerHost std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; - - private: - std::map metadata; - std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index eda3b28c6cff6..6d471da2879cb 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -83,6 +83,7 @@ class GPUO2Interface; struct TPCPadGainCalib; struct TPCZSLinkMapping; struct GPUSettingsO2; +struct GPUSettingsProcessingNNclusterizer; class GPUO2InterfaceQA; struct GPUTrackingInOutPointers; struct GPUTrackingInOutZS; @@ -224,6 +225,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; + std::unique_ptr mNNClusterizerSettings; + Config mSpecConfig; std::shared_ptr mGGR; bool mGRPGeomUpdated = false; diff --git 
a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index aa4f3cfca1289..f2cc2806115fb 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,6 +78,7 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" +#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -118,6 +119,7 @@ GPURecoWorkflowSpec::GPURecoWorkflowSpec(GPURecoWorkflowSpec::CompletionPolicyDa mConfig.reset(new GPUO2InterfaceConfiguration); mConfParam.reset(new GPUSettingsO2); mTFSettings.reset(new GPUSettingsTF); + mNNClusterizerSettings.reset(new GPUSettingsProcessingNNclusterizer); mTimer.reset(new TStopwatch); mPipeline.reset(new GPURecoWorkflowSpec_PipelineInternals); @@ -133,6 +135,49 @@ void GPURecoWorkflowSpec::init(InitContext& ic) GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + if (mNNClusterizerSettings->nnLoadFromCCDB){ + LOG(info) << "Loading neural networks from CCDB"; + o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); + std::map ccdbSettings = { + {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); + + if (evalMode[0] == "c1") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c1"; + ccdbSettings["outputFile"] = "net_classification_c1.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } else if (evalMode[0] == "c2") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c2"; + ccdbSettings["outputFile"] = "net_classification_c2.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } + + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBEvalType"] = "regression_c1"; + ccdbSettings["outputFile"] = "net_regression_c1.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + if (evalMode[1] == "r2") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBEvalType"] = "regression_c2"; + ccdbSettings["outputFile"] = "net_regression_c2.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } + LOG(info) << "Neural network loading done!"; + } + // Create configuration object and fill settings mConfig->configGRP.solenoidBzNominalGPU = 0; mTFSettings->hasSimStartOrbit = 1; From a9857986166b76fd898d845e9b42a20c98eee368 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 10 Apr 2025 10:22:30 +0200 Subject: [PATCH 29/40] Adjusting CPU threads and ORT copmile definitions --- 
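Note on this patch: the ORT_*_BUILD flags are now set directly on the CUDA and HIP backend targets (in addition to GPUCA_HAS_ONNX), so the backend sources see the same ONNX Runtime execution-provider selection as Common/ML; the intra-op thread cap for the ORT sessions is also taken from mRec->getNKernelHostThreads(true) instead of the earlier nTPCdigits-based heuristic. Below is a minimal sketch of how such compile definitions are typically consumed on the C++ side; ortDeviceAllocatorName() is a hypothetical helper for illustration only (the "Hip" name is taken from OrtModel::memoryOnDevice, "Cuda" is assumed for the CUDA provider) and is not part of this patch.

// Sketch only: preprocessor pattern for the ORT_*_BUILD compile definitions.
// The real code in OrtInterface.cxx guards the whole device-memory path with
// one combined #if over ORT_ROCM_BUILD / ORT_MIGRAPHX_BUILD / ORT_CUDA_BUILD.
static const char* ortDeviceAllocatorName()
{
#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1
  return "Cuda"; // assumed allocator name for the CUDA execution provider
#elif (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1)
  return "Hip"; // matches the device memory name used in OrtModel::memoryOnDevice
#else
  return "Cpu"; // CPU-only build: keep the default host Ort::MemoryInfo
#endif
}
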
GPU/GPUTracking/Base/cuda/CMakeLists.txt | 13 +++++++++++++ GPU/GPUTracking/Base/hip/CMakeLists.txt | 13 +++++++++++++ GPU/GPUTracking/CMakeLists.txt | 13 +------------ .../Global/GPUChainTrackingClusterizer.cxx | 2 +- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 6c88c69cbe35f..2611af88ad113 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -122,6 +122,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) + message("Compile definitions for ONNX runtime (CUDA):") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) + install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 2a3c0ea7d9eb1..570b9c4bd2683 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -160,6 +160,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) + message("Compile definitions for ONNX runtime (HIP / ROCM):") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) + install(FILES ${HDRS} DESTINATION include/GPU) # o2_add_test(GPUsortHIP NAME test_GPUsortHIP diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c4b084e260ea8..c6cccfb71a27a 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -352,19 +352,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - message("Compile definitions for ONNX runtime:") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - - target_compile_definitions(${targetName} PRIVATE - GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b03a27867f8fa..5565958d8d9ab 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -626,7 +626,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; - int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; + int32_t maxThreads = mRec->getNKernelHostThreads(true); 
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { From fb08f18d45df31b34aa969387f38ad898d2f6faa Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 10 Apr 2025 23:22:15 +0200 Subject: [PATCH 30/40] About 10x speed-up due to explicit io binding --- Common/ML/include/ML/OrtInterface.h | 2 + Common/ML/src/OrtInterface.cxx | 23 +++++++--- .../Base/cuda/GPUReconstructionCUDA.cu | 4 +- .../Global/GPUChainTrackingClusterizer.cxx | 44 ++++++++++++------- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index d496de866da7f..625f506684fd8 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -91,6 +91,8 @@ class OrtModel template void inference(I**, size_t, O*); + void release(); + private: // ORT variables -> need to be hidden as pImpl struct OrtVariables; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index e525fc1d2709f..52ab22b5d1f87 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -33,6 +33,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::SessionOptions sessionOptions; Ort::AllocatorWithDefaultOptions allocator; Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + std::unique_ptr ioBinding = nullptr; }; // General purpose @@ -122,7 +123,8 @@ void OrtModel::initEnvironment() }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); setIO(); @@ -135,6 +137,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (deviceIndex >= 0) { + (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); std::string dev_mem_str = ""; if (deviceType == "ROCM") { dev_mem_str = "Hip"; @@ -268,20 +271,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size()); } + (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, 
reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size()); } else { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size()); } + (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); + (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); @@ -398,6 +403,12 @@ std::vector OrtModel::inference(std::vector>& inputs) template std::vector OrtModel::inference(std::vector>&); template std::vector OrtModel::inference(std::vector>&); +// Release session +void OrtModel::release() +{ + LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; +} + // private std::string OrtModel::printShape(const std::vector& v) { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index d71c5f3211774..4e36b3fd3380a 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -673,6 +673,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); // this implicitly sets "has_user_compute_stream" + cuda_options.has_user_compute_stream = 1; UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); session_options.AppendExecutionProvider_CUDA_V2(cuda_options); @@ -698,10 +699,9 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.arena_extend_strategy = 0; rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); - // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); - // api.ReleaseROCMProviderOptions(rocm_options); } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 5565958d8d9ab..6c0b9140297b1 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -630,7 +630,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > 
maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); @@ -638,7 +638,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); @@ -646,7 +646,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); @@ -950,7 +950,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - float time_clusterizer = 0, time_fill = 0; + float time_clusterizer = 0, time_fill = 0, time_networks = 0; for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); @@ -961,6 +961,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); + // NN evaluations if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); @@ -974,14 +975,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); } } - - if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels - } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels - } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); @@ -995,9 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, 
clustererNNShadow.outputDataReg1_32); } } - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); @@ -1011,11 +1003,26 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); } } - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2 + } + } + + auto stopNNs = std::chrono::high_resolution_clock::now(); + + // Publishing kernels + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + } + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } auto stop1 = std::chrono::high_resolution_clock::now(); + time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } @@ -1030,8 +1037,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { acceptedClusters += clustererNNShadow.outputDataClass[i]; } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } + TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane); #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif @@ -1132,6 +1140,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { + if (GetProcessingSettings().nn.applyNNclusterizer) { + GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + nnApplication.model_class.release(); + nnApplication.model_reg_1.release(); + nnApplication.model_reg_2.release(); + } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } From b1c88f09a758c3e0cb67cbbac063fc4c82071d82 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 11 Apr 2025 14:08:13 +0200 Subject: [PATCH 31/40] Changes for synchronization and consistency. No performance loss. --- Common/ML/src/OrtInterface.cxx | 8 ++++---- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 52ab22b5d1f87..bfbd8343efedf 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -271,18 +271,18 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } else { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6c0b9140297b1..7026a5ea01a1a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -630,7 +630,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool 
synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); @@ -638,7 +638,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); @@ -646,7 +646,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); @@ -1039,7 +1039,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } - TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane); #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif From 32cab70fa540e3976e9b1d9cc5e0664cb21001b7 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 11 Apr 2025 12:08:51 +0000 Subject: [PATCH 32/40] Please consider the following formatting changes --- Common/ML/include/ML/OrtInterface.h | 30 +++-- Common/ML/src/OrtInterface.cxx | 116 +++++++++--------- .../TPCCalibration/NeuralNetworkClusterizer.h | 19 ++- .../src/NeuralNetworkClusterizer.cxx | 3 +- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 5 +- .../Base/GPUReconstructionProcessing.h | 3 +- .../Base/cuda/GPUReconstructionCUDA.h | 5 +- .../Global/GPUChainTrackingClusterizer.cxx | 6 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 2 +- .../GPUTPCNNClusterizerHost.h | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 84 ++++++------- GPU/Workflow/src/GPUWorkflowSpec.cxx | 20 +-- 13 files changed, 156 insertions(+), 141 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 625f506684fd8..e44b56e62a04e 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,10 +26,11 @@ // O2 includes #include "Framework/Logger.h" -namespace Ort { - struct SessionOptions; - struct MemoryInfo; -} +namespace Ort +{ +struct SessionOptions; +struct MemoryInfo; +} // namespace Ort namespace o2 { @@ -44,7 +45,8 @@ class OrtModel // Constructors & destructors OrtModel() = default; OrtModel(std::unordered_map optionsMap) { init(optionsMap); } - void init(std::unordered_map optionsMap) { + void init(std::unordered_map optionsMap) + { initOptions(optionsMap); initEnvironment(); } @@ -71,8 +73,18 @@ class OrtModel void setDeviceId(int32_t id) { deviceId = id; } void setIO(); void setActiveThreads(int threads) { intraOpNumThreads = threads; } - void setIntraOpNumThreads(int threads) { if(deviceType == "CPU") { intraOpNumThreads = threads; } } - void setInterOpNumThreads(int threads) { if(deviceType == "CPU") { interOpNumThreads = threads; } } + void setIntraOpNumThreads(int threads) + { + if (deviceType == "CPU") { + intraOpNumThreads = threads; + } + } + void setInterOpNumThreads(int threads) + { + if (deviceType == "CPU") { + interOpNumThreads = threads; + } + } // Conversion template @@ -102,8 +114,8 @@ class OrtModel std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes - std::vector inputSizePerNode, outputSizePerNode; // Output shapes - int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index bfbd8343efedf..8771a312a7e45 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -99,7 +99,7 @@ void OrtModel::initOptions(std::unordered_map optionsM void OrtModel::initEnvironment() { - if(allocateDeviceMemory) { + if (allocateDeviceMemory) { memoryOnDevice(deviceId); } pImplOrt->env = std::make_shared( @@ -184,7 +184,8 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -void OrtModel::setIO() { +void OrtModel::setIO() +{ for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } @@ -211,7 
+212,7 @@ void OrtModel::setIO() { outputSizePerNode.resize(mOutputShapes.size(), 1); mInputsTotal = 1; for (size_t i = 0; i < mInputShapes.size(); ++i) { - if(mInputShapes[i].size() > 0) { + if (mInputShapes[i].size() > 0) { for (size_t j = 1; j < mInputShapes[i].size(); ++j) { if (mInputShapes[i][j] > 0) { mInputsTotal *= mInputShapes[i][j]; @@ -222,7 +223,7 @@ void OrtModel::setIO() { } mOutputsTotal = 1; for (size_t i = 0; i < mOutputShapes.size(); ++i) { - if(mOutputShapes[i].size() > 0) { + if (mOutputShapes[i].size() > 0) { for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { if (mOutputShapes[i][j] > 0) { mOutputsTotal *= mOutputShapes[i][j]; @@ -239,8 +240,7 @@ std::vector OrtModel::inference(std::vector& input) { std::vector inputShape = mInputShapes[0]; inputShape[0] = input.size(); - for (size_t i = 1; i < mInputShapes[0].size(); ++i) - { + for (size_t i = 1; i < mInputShapes[0].size(); ++i) { inputShape[0] /= mInputShapes[0][i]; } std::vector inputTensor; @@ -295,28 +295,29 @@ template void OrtModel::inference(float*, size_t, template void OrtModel::inference(float*, size_t, float*); template -void OrtModel::inference(I** input, size_t input_size, O* output) { +void OrtModel::inference(I** input, size_t input_size, O* output) +{ std::vector inputTensors(inputShapesCopy.size()); for (size_t i = 0; i < inputShapesCopy.size(); ++i) { - inputShapesCopy[i][0] = input_size; // batch-size + inputShapesCopy[i][0] = input_size; // batch-size outputShapesCopy[i][0] = input_size; // batch-size if constexpr (std::is_same_v) { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - reinterpret_cast(input[i]), - inputSizePerNode[i]*input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } else { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - input[i], - inputSizePerNode[i]*input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + pImplOrt->memoryInfo, + input[i], + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } } @@ -325,14 +326,14 @@ void OrtModel::inference(I** input, size_t input_size, O* output) { outputTensor = Ort::Value::CreateTensor( pImplOrt->memoryInfo, reinterpret_cast(output), - outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputSizePerNode[0] * input_size, // assumes that there is only one output node outputShapesCopy[0].data(), outputShapesCopy[0].size()); } else { outputTensor = Ort::Value::CreateTensor( pImplOrt->memoryInfo, output, - outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputSizePerNode[0] * input_size, // assumes that there is only one output node outputShapesCopy[0].data(), outputShapesCopy[0].size()); } @@ -345,8 +346,7 @@ void OrtModel::inference(I** input, size_t input_size, O* output) { inputNamesChar.size(), outputNamesChar.data(), &outputTensor, - outputNamesChar.size() - ); + outputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); @@ -357,47 +357,47 @@ template void OrtModel::inference(float**, size_t, float*); template std::vector OrtModel::inference(std::vector>& inputs) { - std::vector input_tensors; + std::vector input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { - inputShapesCopy[i][0] = 
inputs[i].size() / inputSizePerNode[i]; // batch-size + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size - if constexpr (std::is_same_v) { - input_tensors.emplace_back( - Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - reinterpret_cast(inputs[i].data()), - inputSizePerNode[i]*inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); - } else { - input_tensors.emplace_back( - Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - inputs[i].data(), - inputSizePerNode[i]*inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); - } + if constexpr (std::is_same_v) { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } else { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } + } + + int32_t totalOutputSize = mOutputsTotal * inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); - int32_t totalOutputSize = mOutputsTotal*inputShapesCopy[0][0]; - - // === Run inference === - auto output_tensors = pImplOrt->session->Run( - pImplOrt->runOptions, - inputNamesChar.data(), - input_tensors.data(), - input_tensors.size(), - outputNamesChar.data(), - outputNamesChar.size()); - - // === Extract output values === - O* output_data = output_tensors[0].template GetTensorMutableData(); - std::vector output_vec(output_data, output_data + totalOutputSize); - output_tensors.clear(); - return output_vec; + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; } template std::vector OrtModel::inference(std::vector>&); diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h index e4fcfa56df438..196bba644714c 100644 --- a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -23,16 +23,15 @@ namespace o2::tpc class NeuralNetworkClusterizer { - public: - NeuralNetworkClusterizer() = default; - void initCcdbApi(std::string url); - void loadIndividualFromCCDB(std::map settings); - - private: - o2::ccdb::CcdbApi ccdbApi; - std::map metadata; - std::map headers; - + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; }; } // namespace o2::tpc diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx index 8a2e739b772fb..bfbb7afc946f8 100644 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -18,7 +18,8 @@ using namespace o2::tpc; -void NeuralNetworkClusterizer::initCcdbApi(std::string url) { +void 
NeuralNetworkClusterizer::initCcdbApi(std::string url) +{ ccdbApi.init(url); } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index ec02015fc91cb..1174fcd8a38d7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,8 +24,9 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" -namespace Ort { - struct SessionOptions; +namespace Ort +{ +struct SessionOptions; } namespace o2::gpu diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index b83de29ef4af2..94e13d15d9c89 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -22,7 +22,8 @@ #include #include -namespace Ort { +namespace Ort +{ struct SessionOptions; } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 08e3078f767e6..d4712f2c0ed25 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,8 +25,9 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif -namespace Ort { - struct SessionOptions; +namespace Ort +{ +struct SessionOptions; } namespace o2::gpu diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7026a5ea01a1a..1d71c92bcdfe9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -665,7 +665,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.mISector = sector; clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); - if (doGPU){ + if (doGPU) { clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; @@ -673,8 +673,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } AllocateRegisteredMemory(clustererNN.mMemoryId); }); - if (doGPU){ - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer)*NSECTORS, mRec->NStreams() - 1, &mEvents->init); + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } } #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index e9b2061bea36a..da490b0f94d58 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -52,7 +52,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int nnInferenceOutputDType = 0; // 0: float16, 
1: float32 int mISector = -1; int deviceId = -1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5125d7a3fd364..8c6d4fff67528 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,7 +27,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::vector reg_model_paths; std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place if (evalMode[0] == "c1") { class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 8001ecc96fcfd..bae9e5fa677b2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -45,7 +45,7 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters - std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; }; // class GPUTPCNNClusterizerHost diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index d1be1d00027e2..2cf9ab2037007 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -147,9 +147,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); if (is_row_boundary) { if (dtype == 0) { - clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } else { int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); @@ -163,15 +163,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } 
} else { if (dtype == 0) { - clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } } @@ -266,20 +266,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], - static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], - clustererNN.outputDataReg1_32[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], - clustererNN.outputDataReg1_32[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -359,20 +359,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 
1) { pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], - clustererNN.outputDataReg2_32[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], - clustererNN.outputDataReg2_32[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -404,20 +404,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], - clustererNN.outputDataReg2_32[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], - clustererNN.outputDataReg2_32[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index f2cc2806115fb..7aae3c176db74 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -135,20 +135,20 @@ void GPURecoWorkflowSpec::init(InitContext& ic) GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); - if (mNNClusterizerSettings->nnLoadFromCCDB){ + if (mNNClusterizerSettings->nnLoadFromCCDB) { LOG(info) << "Loading neural networks from CCDB"; o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); 
std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, - {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; + {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); From 70907aa03b7dbadf7f43cdd56f6ee58625db2c49 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 11 Apr 2025 20:46:06 +0200 Subject: [PATCH 33/40] Fixing warnings (errors due to size_t) --- Common/ML/include/ML/OrtInterface.h | 4 ++-- Common/ML/src/OrtInterface.cxx | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index e44b56e62a04e..a1d8123073ef5 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -98,10 +98,10 @@ class OrtModel std::vector inference(std::vector>&); template - void inference(I*, size_t, O*); + void inference(I*, int64_t, O*); template - void inference(I**, size_t, O*); + void inference(I**, int64_t, O*); void release(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8771a312a7e45..000ffdfa39d94 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -262,7 +262,7 @@ template std::vector OrtModel::inference(s template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output) +void OrtModel::inference(I* input, int64_t input_size, O* output) { // std::vector providers = Ort::GetAvailableProviders(); // for (const auto& provider : providers) { @@ -289,13 +289,13 @@ void OrtModel::inference(I* input, size_t input_size, O* output) (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); -template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(float*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); +template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float*, int64_t, float*); template -void 
OrtModel::inference(I** input, size_t input_size, O* output) +void OrtModel::inference(I** input, int64_t input_size, O* output) { std::vector inputTensors(inputShapesCopy.size()); @@ -349,10 +349,10 @@ void OrtModel::inference(I** input, size_t input_size, O* output) outputNamesChar.size()); } -template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t**, size_t, float*); -template void OrtModel::inference(float**, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(float**, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); +template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, int64_t, float*); template std::vector OrtModel::inference(std::vector>& inputs) From e46cdfa184efcb58b534f6d32482ce3dd9a22e47 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 13 Apr 2025 11:33:58 +0200 Subject: [PATCH 34/40] Fixing linker issues --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 4e36b3fd3380a..e06c43db814fe 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -661,9 +661,9 @@ void GPUReconstructionCUDA::endGPUProfiling() GPUChkErr(cudaProfilerStop()); } -#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { +#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); @@ -679,8 +679,8 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); +#endif // ORT_CUDA_BUILD } -#endif // GPUCA_HAS_ONNX #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) @@ -690,9 +690,9 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) return retVal; } -#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { +#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 // Create ROCm provider options cudaGetDevice(deviceId); // const auto& api = Ort::GetApi(); @@ -702,9 +702,8 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options rocm_options.arena_extend_strategy = 0; rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); +#endif // ORT_ROCM_BUILD } - -#endif // GPUCA_HAS_ONNX #endif // __HIPCC__ namespace o2::gpu From 4b0825ac8d86909442981849ff7619aadceb5b5d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Apr 2025 13:54:20 +0200 Subject: [PATCH 35/40] Adding volatile memory allocation and MockedOrtAllocator. 
Removing print statements and time measurements
---
 Common/ML/include/ML/OrtInterface.h           | 10 +-
 Common/ML/src/OrtInterface.cxx                | 40 ++++++--
 .../Base/GPUReconstructionProcessing.h        |  1 -
 .../Base/cuda/GPUReconstructionCUDA.cu        |  2 +-
 GPU/GPUTracking/CMakeLists.txt                |  1 +
 .../Global/GPUChainTrackingClusterizer.cxx    | 85 +++++++++++------
 .../GPUTPCNNClusterizerHost.cxx               | 93 ++++++++++++++++++-
 .../GPUTPCNNClusterizerHost.h                 | 16 +++-
 8 files changed, 203 insertions(+), 45 deletions(-)

diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index a1d8123073ef5..e37b6a69b6036 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -30,6 +30,7 @@ namespace Ort
 {
 struct SessionOptions;
 struct MemoryInfo;
+struct Env;
 } // namespace Ort

 namespace o2
 {
@@ -55,6 +56,7 @@ class OrtModel
   // General purpose
   void initOptions(std::unordered_map optionsMap);
   void initEnvironment();
+  void initSession();
   void memoryOnDevice(int32_t = 0);
   bool isInitialized() { return mInitialized; }
   void resetSession();
@@ -64,8 +66,9 @@ class OrtModel
   std::vector> getNumOutputNodes() const { return mOutputShapes; }
   std::vector getInputNames() const { return mInputNames; }
   std::vector getOutputNames() const { return mOutputNames; }
-  Ort::SessionOptions& getSessionOptions();
-  Ort::MemoryInfo& getMemoryInfo();
+  Ort::SessionOptions* getSessionOptions();
+  Ort::MemoryInfo* getMemoryInfo();
+  Ort::Env* getEnv();
   int32_t getIntraOpNumThreads() const { return intraOpNumThreads; }
   int32_t getInterOpNumThreads() const { return interOpNumThreads; }
@@ -85,6 +88,7 @@ class OrtModel
       interOpNumThreads = threads;
     }
   }
+  void setEnv(Ort::Env*);

   // Conversion
   template
@@ -103,7 +107,7 @@ class OrtModel
   template
   void inference(I**, int64_t, O*);

-  void release();
+  void release(bool = false);

  private:
   // ORT variables -> need to be hidden as pImpl
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 000ffdfa39d94..6dd3887c82417 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -99,9 +99,6 @@ void OrtModel::initOptions(std::unordered_map optionsM

 void OrtModel::initEnvironment()
 {
-  if (allocateDeviceMemory) {
-    memoryOnDevice(deviceId);
-  }
   pImplOrt->env = std::make_shared(
     OrtLoggingLevel(loggingLevel),
     (envName.empty() ?
"ORT" : envName.c_str()), @@ -123,6 +120,13 @@ void OrtModel::initEnvironment() }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events +} + +void OrtModel::initSession() +{ + if (allocateDeviceMemory) { + memoryOnDevice(deviceId); + } pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); @@ -138,6 +142,13 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); + (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + + // Arena memory shrinkage comes at performance cost + /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + std::string dev_mem_str = ""; if (deviceType == "ROCM") { dev_mem_str = "Hip"; @@ -159,14 +170,19 @@ void OrtModel::resetSession() } // Getters -Ort::SessionOptions& OrtModel::getSessionOptions() +Ort::SessionOptions* OrtModel::getSessionOptions() +{ + return &pImplOrt->sessionOptions; +} + +Ort::MemoryInfo* OrtModel::getMemoryInfo() { - return pImplOrt->sessionOptions; + return &pImplOrt->memoryInfo; } -Ort::MemoryInfo& OrtModel::getMemoryInfo() +Ort::Env* OrtModel::getEnv() { - return pImplOrt->memoryInfo; + return (pImplOrt->env).get(); } template @@ -234,6 +250,11 @@ void OrtModel::setIO() } } +void OrtModel::setEnv(Ort::Env* env) +{ + pImplOrt->env = std::shared_ptr(env); +} + // Inference template std::vector OrtModel::inference(std::vector& input) @@ -404,8 +425,11 @@ template std::vector OrtModel::inference(std::vector OrtModel::inference(std::vector>&); // Release session -void OrtModel::release() +void OrtModel::release(bool profilingEnabled) { + // if (profilingEnabled) { + // pImplOrt->session->EndProfiling(); + // } LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 94e13d15d9c89..2339ee9fb6b83 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -95,7 +95,6 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} 
struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index e06c43db814fe..247438fa8a13f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index eb7481819ea89..673c93cddb8ca 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -336,6 +336,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::DetectorsRaw O2::Steer O2::ML + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS} SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H}) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 1d71c92bcdfe9..0b9897977cc98 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -42,6 +42,7 @@ #ifdef GPUCA_HAS_ONNX #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" +// #include "ML/3rdparty/GPUORTFloat16.h" #endif using namespace o2::gpu; @@ -630,31 +631,39 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); } (nnApplications[lane].model_class).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0); + (nnApplications[lane].model_class).initSession(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv()); (nnApplications[lane].model_reg_1).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1); + (nnApplications[lane].model_reg_1).initSession(); } if 
(nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv()); (nnApplications[lane].model_reg_2).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2); + (nnApplications[lane].model_reg_2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { - LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; + LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; } }); mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { @@ -957,9 +966,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - auto stop0 = std::chrono::high_resolution_clock::now(); + // auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); + // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations if (clustererNNShadow.nnInferenceInputDType == 0) { @@ -1006,7 +1015,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } - auto stopNNs = std::chrono::high_resolution_clock::now(); + // auto stopNNs = std::chrono::high_resolution_clock::now(); // Publishing kernels if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { @@ -1020,25 +1029,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; - } - if (clustererNNShadow.nnClusterizerUseCfRegression) { - auto start1 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - } - if (clustererNNShadow.nnClusterizerVerbosity < 3) { - int acceptedClusters = 0; - for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNNShadow.outputDataClass[i]; - } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << 
clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // for(int i = 0; i < iSize; ++i) { + // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) { + // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; + // // std::string input = "["; + // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){ + // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", "; + // // } + // // input += "]"; + // // LOG(info) << "Input is: " << input; + // } + // } + + // auto stop1 = std::chrono::high_resolution_clock::now(); + + // time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } + // if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // } + // if (clustererNNShadow.nnClusterizerVerbosity < 3) { + // int acceptedClusters = 0; + // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) { + // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters; + // } + // acceptedClusters += clustererNNShadow.outputDataClass[i]; + // } + // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // } #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif @@ -1139,12 +1164,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { - if (GetProcessingSettings().nn.applyNNclusterizer) { - GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; - nnApplication.model_class.release(); - nnApplication.model_reg_1.release(); - nnApplication.model_reg_2.release(); - } + // if (GetProcessingSettings().nn.applyNNclusterizer) { + // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 8c6d4fff67528..bda4c70d79c9d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -18,6 +18,11 @@ #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUReconstruction.h" + +#ifdef GPUCA_HAS_ONNX +#include +#endif using namespace o2::gpu; @@ -51,7 +56,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; - LOG(info) << "Model path: " << class_model_path; model_class.initOptions(OrtOptions); modelsUsed[0] = true; @@ -106,3 +110,90 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } } + +// MockedOrtAllocator implementation to be able to use volatile assignment +struct MockedOrtAllocator : OrtAllocator { + MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr); + ~MockedOrtAllocator(); + + void* Alloc(size_t size); + void Free(void* p); + const OrtMemoryInfo* Info() const; + void* Reserve(size_t size); + size_t NumAllocations() const; + size_t NumReserveAllocations() const; + + void LeakCheck(); + +private: + MockedOrtAllocator(const MockedOrtAllocator&) = delete; + MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; + + std::atomic memory_inuse{0}; + std::atomic num_allocations{0}; + std::atomic num_reserve_allocations{0}; + OrtMemoryInfo* memory_info; + GPUReconstruction* rec; +}; + +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) { + OrtAllocator::version = ORT_API_VERSION; + OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; + OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; + OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; + OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Reserve(size); }; + rec = r; + memory_info = info; +} + +MockedOrtAllocator::~MockedOrtAllocator() { + // Ort::GetApi().ReleaseMemoryInfo(memory_info); +} + +void* MockedOrtAllocator::Alloc(size_t size) { + return rec->AllocateVolatileDeviceMemory(size); +} + +void* MockedOrtAllocator::Reserve(size_t size) { + return rec->AllocateVolatileDeviceMemory(size); +} + +void MockedOrtAllocator::Free(void* p) { + 
rec->ReturnVolatileDeviceMemory(); +} + +const OrtMemoryInfo* MockedOrtAllocator::Info() const { + return memory_info; +} + +size_t MockedOrtAllocator::NumAllocations() const { + return num_allocations.load(); +} + +size_t MockedOrtAllocator::NumReserveAllocations() const { + return num_reserve_allocations.load(); +} + +void MockedOrtAllocator::LeakCheck() { + if (memory_inuse.load()) + LOG(warning) << "memory leak!!!"; +} + +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) +{ + if(chooseMockedAlloc == 0) { + mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); + LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; + } else if (chooseMockedAlloc == 1) { + mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get()); + LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; + } else if (chooseMockedAlloc == 2) { + mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get()); + LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; + } else { + LOG(fatal) << "Invalid choice for mocked allocator"; + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index bae9e5fa677b2..1e0df7ea578f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,6 +22,15 @@ using namespace o2::ml; +struct OrtAllocator; +struct OrtMemoryInfo; +struct MockedOrtAllocator; +namespace Ort +{ +struct Env; +struct MemoryInfo; +} // namespace Ort + namespace o2::OrtDataType { struct Float16_t; @@ -30,6 +39,7 @@ struct Float16_t; namespace o2::gpu { +class GPUReconstruction; class GPUTPCNNClusterizer; struct GPUSettingsProcessingNNclusterizer; @@ -41,13 +51,17 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::map); + + // ONNX + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, int32_t = 0); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; + + std::shared_ptr mockedAlloc_class = nullptr, mockedAlloc_reg_1 = nullptr, mockedAlloc_reg_2 = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From 497a9d421671f78f93c5b266235e5c0742aa4df1 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 16 Apr 2025 11:54:58 +0000 Subject: [PATCH 36/40] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 2 +- .../GPUTPCNNClusterizerHost.cxx | 31 ++++++++++++------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 6dd3887c82417..520d2273e2185 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -143,7 +143,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) if 
(deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h - (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 247438fa8a13f..382e93f06aea8 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index bda4c70d79c9d..ceda3acd7db46 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -125,7 +125,7 @@ struct MockedOrtAllocator : OrtAllocator { void LeakCheck(); -private: + private: MockedOrtAllocator(const MockedOrtAllocator&) = delete; MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; @@ -136,7 +136,8 @@ struct MockedOrtAllocator : OrtAllocator { GPUReconstruction* rec; }; -MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) { +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) +{ OrtAllocator::version = ORT_API_VERSION; OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; @@ -146,42 +147,50 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info memory_info = info; } -MockedOrtAllocator::~MockedOrtAllocator() { +MockedOrtAllocator::~MockedOrtAllocator() +{ // Ort::GetApi().ReleaseMemoryInfo(memory_info); } -void* MockedOrtAllocator::Alloc(size_t 
size) { +void* MockedOrtAllocator::Alloc(size_t size) +{ return rec->AllocateVolatileDeviceMemory(size); } -void* MockedOrtAllocator::Reserve(size_t size) { +void* MockedOrtAllocator::Reserve(size_t size) +{ return rec->AllocateVolatileDeviceMemory(size); } -void MockedOrtAllocator::Free(void* p) { +void MockedOrtAllocator::Free(void* p) +{ rec->ReturnVolatileDeviceMemory(); } -const OrtMemoryInfo* MockedOrtAllocator::Info() const { +const OrtMemoryInfo* MockedOrtAllocator::Info() const +{ return memory_info; } -size_t MockedOrtAllocator::NumAllocations() const { +size_t MockedOrtAllocator::NumAllocations() const +{ return num_allocations.load(); } -size_t MockedOrtAllocator::NumReserveAllocations() const { +size_t MockedOrtAllocator::NumReserveAllocations() const +{ return num_reserve_allocations.load(); } -void MockedOrtAllocator::LeakCheck() { +void MockedOrtAllocator::LeakCheck() +{ if (memory_inuse.load()) LOG(warning) << "memory leak!!!"; } void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) { - if(chooseMockedAlloc == 0) { + if (chooseMockedAlloc == 0) { mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; From a67b634643064c9fa5ab6504a540a0687a513a4e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Apr 2025 15:05:06 +0200 Subject: [PATCH 37/40] Circumvent "unused result" warning and build failure --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ceda3acd7db46..9ca899158c199 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -192,15 +192,15 @@ void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInf { if (chooseMockedAlloc == 0) { mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get())); LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; } else if (chooseMockedAlloc == 1) { mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get())); LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; } else if (chooseMockedAlloc == 2) { mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get())); LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; } else { LOG(fatal) << "Invalid choice for mocked allocator"; From 938a1edbe6695280e475560d3315b034ba2db754 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 20 Apr 2025 00:38:19 +0200 Subject: [PATCH 38/40] Adjust for comments --- Common/ML/CMakeLists.txt | 8 ++-- Common/ML/src/OrtInterface.cxx | 4 +- 
GPU/GPUTracking/Base/cuda/CMakeLists.txt | 8 ---- .../Base/cuda/GPUReconstructionCUDA.cu | 5 ++- GPU/GPUTracking/Base/hip/CMakeLists.txt | 10 +---- GPU/GPUTracking/CMakeLists.txt | 3 +- .../Global/GPUChainTrackingClusterizer.cxx | 30 +++++++++------ .../GPUTPCNNClusterizerHost.cxx | 38 +++++++++++-------- .../GPUTPCNNClusterizerHost.h | 10 +++-- GPU/Workflow/src/GPUWorkflowSpec.cxx | 38 +++++++++---------- 10 files changed, 76 insertions(+), 78 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 7e2107651cf10..2db91fc4f4320 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -16,7 +16,7 @@ o2_add_library(ML # Pass ORT variables as a preprocessor definition target_compile_definitions(${targetName} PRIVATE - ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + $<$:ORT_ROCM_BUILD> + $<$:ORT_CUDA_BUILD> + $<$:ORT_MIGRAPHX_BUILD> + $<$:ORT_TENSORRT_BUILD>) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 520d2273e2185..a0665841bec31 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -144,7 +144,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time - + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 @@ -158,7 +158,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) } pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); if (loggingLevel < 2) { - LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex; + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; } } #endif diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index e4877d8ccef25..554f700bd57df 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -122,17 +122,9 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") 
${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) - message("Compile definitions for ONNX runtime (CUDA):") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ ORT_TENSORRT_BUILD=$) install(FILES ${HDRS} DESTINATION include/GPU) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 32e3ae76abe7d..741f160158b43 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -655,7 +655,7 @@ void GPUReconstructionCUDA::endGPUProfiling() void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { -#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +#ifdef ORT_CUDA_BUILD cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); @@ -684,7 +684,7 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { -#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +#ifdef ORT_ROCM_BUILD // Create ROCm provider options cudaGetDevice(deviceId); // const auto& api = Ort::GetApi(); @@ -692,6 +692,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index c3cee1e4ebf18..bd3ebe6bc667f 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -170,18 +170,10 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) - message("Compile definitions for ONNX runtime (HIP / ROCM):") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + ORT_MIGRAPHX_BUILD=$) install(FILES ${HDRS} DESTINATION include/GPU) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 414f72eef7329..e82799b9e59c3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -349,8 +349,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE - GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS 
${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL}
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 0b9897977cc98..7db0ba66305e9 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -42,7 +42,6 @@
 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"
 #include "GPUTPCNNClusterizerHost.h"
-// #include "ML/3rdparty/GPUORTFloat16.h"
 #endif

 using namespace o2::gpu;
@@ -628,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       int32_t deviceId = -1;
       int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
       int32_t maxThreads = mRec->getNKernelHostThreads(true);
+      // bool recreateMemoryAllocator = false;
       mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
         nnApplications[lane].init(nn_settings);
         if (nnApplications[lane].modelsUsed[0]) {
@@ -637,7 +637,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
           }
           (nnApplications[lane].model_class).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
+          // Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
+          // But the environment must be valid, so we init the model environment first and use it here afterwards.
+          // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
+          // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // recreateMemoryAllocator = true;
           (nnApplications[lane].model_class).initSession();
         }
         if (nnApplications[lane].modelsUsed[1]) {
@@ -648,7 +653,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
           (nnApplications[lane].model_reg_1).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].model_reg_1).initSession();
         }
         if (nnApplications[lane].modelsUsed[2]) {
@@ -657,9 +662,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
             nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
           }
-          // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
           (nnApplications[lane].model_reg_2).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
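// Sketch of the wiring described in the comments above, assembled from the commented-out calls in
// this patch; it is not active code here, since volatile allocation does not yet work for the input
// tensor binding. Names (volatileOrtAllocator, getEnv, getMemoryInfo, initEnvironment, initSession,
// recreateMemoryAllocator) come from this series; the "host" alias is introduced only for brevity.
bool recreateMemoryAllocator = false;
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
  auto& host = nnApplications[lane];
  host.model_class.initEnvironment();                        // the Ort::Env must exist before registering
  host.volatileOrtAllocator(host.model_class.getEnv(),       // register the mocked allocator once per device
                            host.model_class.getMemoryInfo(),
                            mRec, recreateMemoryAllocator);  // recreate == true unregisters and re-registers
  recreateMemoryAllocator = true;
  host.model_class.initSession();                            // sessions pick it up via session.use_env_allocators
});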
(nnApplications[lane].model_reg_2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { @@ -685,6 +689,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } + LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; + LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif @@ -966,8 +972,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - // auto stop0 = std::chrono::high_resolution_clock::now(); + // auto stop0 = std::chrono::high_resolution_clock::now(); // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations @@ -1048,12 +1054,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - // if (clustererNNShadow.nnClusterizerUseCfRegression) { - // auto start1 = std::chrono::high_resolution_clock::now(); - // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - // auto stop1 = std::chrono::high_resolution_clock::now(); - // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - // } + if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + } // if (clustererNNShadow.nnClusterizerVerbosity < 3) { // int acceptedClusters = 0; // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 9ca899158c199..2e98ca1982ad5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -54,7 +54,8 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, + {"onnx-environment-name", "c1"}}; model_class.initOptions(OrtOptions); modelsUsed[0] = true; @@ -64,13 +65,16 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set 
if (!settings.nnClusterizerUseCfRegression) { if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; + OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; + OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; + OrtOptions["onnx-environment-name"] = "r2"; model_reg_2.initOptions(OrtOptions); modelsUsed[2] = true; } @@ -154,16 +158,19 @@ MockedOrtAllocator::~MockedOrtAllocator() void* MockedOrtAllocator::Alloc(size_t size) { + // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes"; return rec->AllocateVolatileDeviceMemory(size); } void* MockedOrtAllocator::Reserve(size_t size) { + // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes"; return rec->AllocateVolatileDeviceMemory(size); } void MockedOrtAllocator::Free(void* p) { + // LOG(info) << "(ORT) Freeing volatile memory " << p; rec->ReturnVolatileDeviceMemory(); } @@ -188,21 +195,20 @@ void MockedOrtAllocator::LeakCheck() LOG(warning) << "memory leak!!!"; } -void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) { - if (chooseMockedAlloc == 0) { - mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get())); - LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; - } else if (chooseMockedAlloc == 1) { - mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get())); - LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; - } else if (chooseMockedAlloc == 2) { - mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get())); - LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; - } else { - LOG(fatal) << "Invalid choice for mocked allocator"; + mockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); + if (recreate) { + Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo))); } + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get())); + memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); +} + +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() { + return mockedAlloc->Info(); +} + +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() { + return mockedAlloc.get(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 1e0df7ea578f1..0379b83d0ae02 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,8 +22,8 @@ using namespace o2::ml; -struct OrtAllocator; -struct OrtMemoryInfo; +class OrtMemoryInfo; +class OrtAllocator; struct MockedOrtAllocator; namespace Ort { @@ -53,7 +53,9 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, 
GPUTPCNNClusterizer&); // ONNX - void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, int32_t = 0); + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); + MockedOrtAllocator* getMockedAllocator(); + const OrtMemoryInfo* getMockedMemoryInfo(); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters @@ -61,7 +63,7 @@ class GPUTPCNNClusterizerHost int32_t deviceId = -1; std::vector reg_model_paths; - std::shared_ptr mockedAlloc_class = nullptr, mockedAlloc_reg_1 = nullptr, mockedAlloc_reg_2 = nullptr; + std::shared_ptr mockedAlloc = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index dde5810b89c82..8a755a703705f 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -119,7 +119,6 @@ GPURecoWorkflowSpec::GPURecoWorkflowSpec(GPURecoWorkflowSpec::CompletionPolicyDa mConfig.reset(new GPUO2InterfaceConfiguration); mConfParam.reset(new GPUSettingsO2); mTFSettings.reset(new GPUSettingsTF); - mNNClusterizerSettings.reset(new GPUSettingsProcessingNNclusterizer); mTimer.reset(new TStopwatch); mPipeline.reset(new GPURecoWorkflowSpec_PipelineInternals); @@ -134,43 +133,44 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; - if (mNNClusterizerSettings->nnLoadFromCCDB) { + if (mNNClusterizerSettings.nnLoadFromCCDB) { LOG(info) << "Loading neural networks from CCDB"; o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; - nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, - {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; - - std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; - std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); + {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings.nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); if (evalMode[0] == "c1") { - ccdbSettings["nnCCDBLayerType"] = 
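The CCDB fetch below is steered by the nnEvalMode string, e.g. "c1:r1" or "c2:r2": the first token selects the classification network, the second decides whether a second regression network is fetched in addition to regression_c1. A compact sketch of that selection, using a plain standard-library tokenizer instead of o2::utils::Str::tokenize (the helpers splitEvalMode and modelFilesFor are illustrative, not O2 code):

#include <sstream>
#include <string>
#include <vector>

std::vector<std::string> splitEvalMode(const std::string& mode, char delim = ':')
{
  std::vector<std::string> tokens;
  std::stringstream ss(mode);
  for (std::string tok; std::getline(ss, tok, delim);) {
    tokens.push_back(tok);
  }
  return tokens;
}

std::vector<std::string> modelFilesFor(const std::string& nnEvalMode)
{
  const auto mode = splitEvalMode(nnEvalMode);                 // e.g. {"c1", "r1"}
  std::vector<std::string> files;
  files.push_back("net_classification_" + mode[0] + ".onnx");  // "c1" or "c2"
  files.push_back("net_regression_c1.onnx");                   // first regression net is always fetched
  if (mode.size() > 1 && mode[1] == "r2") {
    files.push_back("net_regression_c2.onnx");                 // optional split-cluster regression net
  }
  return files;
}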
mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; ccdbSettings["nnCCDBEvalType"] = "classification_c1"; ccdbSettings["outputFile"] = "net_classification_c1.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); } else if (evalMode[0] == "c2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; ccdbSettings["nnCCDBEvalType"] = "classification_c2"; ccdbSettings["outputFile"] = "net_classification_c2.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); } - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; ccdbSettings["nnCCDBEvalType"] = "regression_c1"; ccdbSettings["outputFile"] = "net_regression_c1.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); if (evalMode[1] == "r2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; ccdbSettings["nnCCDBEvalType"] = "regression_c2"; ccdbSettings["outputFile"] = "net_regression_c2.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); From 7b07496138b0f17bef84628ec21ddd0e93b0cb17 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 19 Apr 2025 22:39:06 +0000 Subject: [PATCH 39/40] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index a0665841bec31..8ce6b673660fb 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -144,7 +144,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time - (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", 
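The run and session options referenced in the comments above are standard ONNX Runtime configuration keys. A minimal sketch of applying them through the public C++ API, assuming a hypothetical helper configureDeviceMemoryOptions and a caller-supplied deviceIndex (this is not the OrtModel::memoryOnDevice implementation):

#include <onnxruntime_cxx_api.h>
#include <string>

void configureDeviceMemoryOptions(Ort::SessionOptions& so, Ort::RunOptions& ro, int deviceIndex)
{
  // Allocate initializers through the device allocator instead of staging them on the CPU.
  so.AddConfigEntry("session.use_device_allocator_for_initializers", "1");
  // Let a RegisterAllocator()-registered environment allocator (the mocked one) be used.
  so.AddConfigEntry("session.use_env_allocators", "1");
  // Same intent as the enable_cpu_mem_arena entry: do not build a CPU arena at all.
  so.DisableCpuMemArena();
  // Per-run options: skip execution-provider synchronization, optionally shrink the arena.
  ro.AddConfigEntry("disable_synchronize_execution_providers", "1");
  const std::string shrink = "gpu:" + std::to_string(deviceIndex);
  ro.AddConfigEntry("memory.enable_memory_arena_shrinkage", shrink.c_str());
}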
("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 2e98ca1982ad5..db2f05711f537 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -205,10 +205,12 @@ void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInf memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); } -const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() { +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() +{ return mockedAlloc->Info(); } -MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() { +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() +{ return mockedAlloc.get(); } From af89c9a63b025828b6d83a0598846aca00a1498d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 20 Apr 2025 09:01:47 +0200 Subject: [PATCH 40/40] Fixing build flags --- Common/ML/src/OrtInterface.cxx | 2 +- GPU/GPUTracking/Base/cuda/CMakeLists.txt | 4 ++-- GPU/GPUTracking/Base/hip/CMakeLists.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8ce6b673660fb..24a2fbffb252c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -139,7 +139,7 @@ void OrtModel::initSession() void OrtModel::memoryOnDevice(int32_t deviceIndex) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) +#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD)) if (deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 554f700bd57df..f595fb051db54 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -124,8 +124,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_CUDA_BUILD=$ - ORT_TENSORRT_BUILD=$) + $<$:ORT_CUDA_BUILD> + $<$:ORT_TENSORRT_BUILD>) install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index bd3ebe6bc667f..d7adb222d547b 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -172,8 +172,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ - ORT_MIGRAPHX_BUILD=$) + $<$:ORT_ROCM_BUILD> + $<$:ORT_MIGRAPHX_BUILD>) install(FILES ${HDRS} DESTINATION include/GPU)