From 9c8984c53d01d851f4294d5a68d6a8bbefd39295 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 00:55:54 +0200 Subject: [PATCH 01/16] Improve GPU filling kernel speed --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 181 +++++++++--------- .../GPUTPCNNClusterizerKernels.h | 4 +- 5 files changed, 104 insertions(+), 90 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0d466f13e5ef..c4b7b2e47cbd9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,6 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; + int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1035,7 +1036,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + countLoops++; + } } else { // Fills the whole input matrix at once -> better 
performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1138,6 +1142,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } + LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9553437765c..b7bc1575d349a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -71,6 +71,7 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerPadTimeSize = 0; uint32_t mNnClusterizerRowTimeSize = 0; uint32_t mNnClusterizerRowTimeSizeFull = 0; + uint32_t mNnClusterizerRowTimeSizeThreads = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ae833ace2f648..582a0c6d7435a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -98,6 +98,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.mNnClusterizerRowTimeSizeThreads = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 1 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 55fefa7dcf149..1c76fc3f1fa57 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,15 +72,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { int32_t target_row = row + r; bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(target_row + row_offset); for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { int32_t target_time = time + t; @@ -143,125 +147,119 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) { + if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) { return; } - uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull; - uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull); + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads; + uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads); // Early exit for out-of-bounds threads - if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { - return; - } + // if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + // return; + // } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); // Use dedicated neural network shared memory arrays for warp-level caching // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - float central_charge = static_cast(chargeMap[peak].unpack()); + float 
central_charge = chargeMap[peak].unpack(); int32_t row = static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); + const int32_t npads_row = GPUTPCGeometry::NPads(row); + // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; - - float index_values[3] = { - static_cast(sector) / o2::tpc::constants::MAXSECTOR, - static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - static_cast(pad) / GPUTPCGeometry::NPads(row)}; - + // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; +// + // float index_values[3] = { + // static_cast(sector) / o2::tpc::constants::MAXSECTOR, + // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, + // static_cast(pad) / GPUTPCGeometry::NPads(row)}; +// + // if (dtype == 0) { + // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + // } else { + // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; + // } +// + // // Handle deconvolution flags only once per cluster (last thread in element) + // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { + // uint8_t cluster_flags = 0; + // for (uint16_t i = 0; i < 8; i++) { + // Delta2 d = cfconsts::InnerNeighbors[i]; + // CfChargePos tmp_pos = peak.delta(d); + // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); + // } + // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + // } + // return; + uint32_t write_idx = base_idx * 
clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { - clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - } - - // Handle deconvolution flags only once per cluster (last thread in element) - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - uint8_t cluster_flags = 0; - for (uint16_t i = 0; i < 8; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; + clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } - return; } // Main data processing - optimize index calculations if (transient_index < clustererNN.mNnClusterizerRowTimeSize) { // Optimize 3D index calculation - int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; - int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow; - int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; - int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime; + const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; + const int32_t time_idx = transient_index - row_idx * 
clustererNN.mNnClusterizerFullTimeSize; int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx; // Early boundary check for row - int32_t target_row = row + r_local; - int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow; + const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide // Calculate offsets - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t boundary_row = target_row + row_offset; + const int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); + const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; + + float output_value = boundary_val; + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + output_value = boundary_val; + } else { + const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); + if (!is_boundary) { + CfChargePos pos(target_row, target_pad, target_time); + // one load + one multiply + output_value = chargeMap[pos].unpack() * inverse_central_charge; } else { - clustererNN.mInputData_32[write_idx] = boundary_val; + output_value = boundary_val; } - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position - continue; - } - - // Calculate target pad and time - int32_t target_pad = pad + p_local; - int32_t target_time = time + t_local; - - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge } - - // Write output with reduced 
branching if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { clustererNN.mInputData_32[write_idx] = output_value; } - // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) { - // printf("Error: Write index out of bounds (central array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) { - // if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) { - // printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) { - // printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // } - - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + write_idx += clustererNN.mNnClusterizerFullTimeSize; } } } @@ -275,6 +273,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 
clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } + if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { + printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", + glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); + } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -364,6 +366,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { + printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", + full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); + } + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; @@ -737,16 +744,16 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? 
offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference) { - if (pad < 0 || row < 0) { // Faster short-circuit + if (pad < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network + return (pad >= npads_row); + } else if (row < iroc_row) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + } else if (row < maxrow) { + return (pad >= npads_reference); } else { return true; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index cd3d7783771fe..9353722568b1f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -57,7 +57,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7, + publishDeconvolutionFlags = 7 }; template @@ -66,7 +66,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate private: static GPUd() int32_t padOffset(int32_t, int32_t); static GPUd() int32_t rowOffset(int32_t, int32_t); - static GPUd() bool isBoundary(int32_t, int32_t, int32_t); + static GPUd() bool isBoundary(int32_t, int32_t, int32_t, int32_t, int32_t, int32_t); static GPUd() bool 
isBoundaryPublish(int32_t, int32_t, float&, float&); }; From a075c43cd3da71e56da1cd96769100c68c6c6c11 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 18:33:48 +0200 Subject: [PATCH 02/16] Adjusting parameter bounds and additional GPU kernel optimizations --- .../Definitions/GPUDefParametersDefaults.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../GPUTPCNNClusterizerKernels.cxx | 104 +++++++----------- 3 files changed, 40 insertions(+), 73 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 29aa3808506dc..648482304aca2 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -482,7 +482,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels - #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU 1024 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c4b7b2e47cbd9..96ee665bda4eb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,7 +642,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t 
deviceId = -1; @@ -1036,10 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); - countLoops++; - } + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1142,7 +1138,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } - LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1c76fc3f1fa57..3a27b8740e46d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { @@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()); int32_t time = static_cast(peak.time()); - const int32_t npads_row = GPUTPCGeometry::NPads(row); - // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; -// - // float index_values[3] = { - // static_cast(sector) / o2::tpc::constants::MAXSECTOR, - // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - // static_cast(pad) / GPUTPCGeometry::NPads(row)}; -// - // if (dtype == 0) { - // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; - // } else { - // clustererNN.mInputData_32[write_idx] = 
index_values[data_idx]; - // } -// - // // Handle deconvolution flags only once per cluster (last thread in element) - // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - // uint8_t cluster_flags = 0; - // for (uint16_t i = 0; i < 8; i++) { - // Delta2 d = cfconsts::InnerNeighbors[i]; - // CfChargePos tmp_pos = peak.delta(d); - // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - // } - // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; - // } - // return; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; + const int32_t npads = GPUTPCGeometry::NPads(row); if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; } } @@ -220,46 +193,45 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; - const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide - - // Calculate offsets - // int32_t 
row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; - const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t boundary_row = target_row + row_offset; - const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); - const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; - - float output_value = boundary_val; - - const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; - const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; - - for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { - if (is_row_boundary) { - output_value = boundary_val; - } else { - const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); - if (!is_boundary) { + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; + + if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { + for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; + } + return; + } else { + // Calculate offsets + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary 
= (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t npads = GPUTPCGeometry::NPads(target_row); + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { + if (target_pad >= npads || target_pad < 0 || is_time_boundary) { + output_value = clustererNN.mNnClusterizerBoundaryFillValue; + } else { CfChargePos pos(target_row, target_pad, target_time); // one load + one multiply output_value = chargeMap[pos].unpack() * inverse_central_charge; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - output_value = boundary_val; + clustererNN.mInputData_32[write_idx] = output_value; } + write_idx += clustererNN.mNnClusterizerFullTimeSize; } - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[write_idx] = output_value; - } - write_idx += clustererNN.mNnClusterizerFullTimeSize; + return; } } } From 587c3e6fadd812b0489891c28759dc3d6865e036 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 21:12:47 +0200 Subject: [PATCH 03/16] Adding back if statement for early exit --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 3a27b8740e46d..0f000451794ec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,9 +155,9 
@@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { - // return; - // } + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); From 6e432576d34b871974fdb236bc9c6bc75619fcb8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 22:19:01 +0200 Subject: [PATCH 04/16] const'ing + fixing CPU kernel --- .../Global/GPUChainTrackingClusterizer.cxx | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 63 +++++++++---------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 96ee665bda4eb..1740e525937f3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1035,7 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 0f000451794ec..7927dcf572acc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } @@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); - float central_charge = static_cast(chargeMap[peak].unpack()); - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); + const float central_charge = static_cast(chargeMap[peak].unpack()); + const float inverse_charge = 1.f / central_charge; + + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { - int32_t target_row = row + r; - bool is_row_boundary = (target_row < 0) || (target_row >= 
o2::tpc::constants::MAXGLOBALPADROW); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset); + for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) { + uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row)); + const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads(target_row - row_offset); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { - int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); - - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { - int32_t target_time = time + t; + for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) { + is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); + for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) { if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) { // Fill boundary value - float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + output_value = clustererNN.mNnClusterizerBoundaryFillValue; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; 
+ clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = boundary_value; + clustererNN.mInputData_32[write_idx] = output_value; } } else { CfChargePos tmp_pos(target_row, target_pad, target_time); - float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + output_value = chargeMap[tmp_pos].unpack() * inverse_charge; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = normalized_charge; + clustererNN.mInputData_32[write_idx] = output_value; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } } @@ -143,7 +142,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, 
uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); + const uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -151,8 +150,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1))]; - float central_charge = chargeMap[peak].unpack(); - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); + const float central_charge = chargeMap[peak].unpack(); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { From bb795c4170289bae93b19223264dc09aacaebea0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 19 Oct 2025 09:43:08 +0200 Subject: [PATCH 05/16] Removing print statements --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7927dcf572acc..b214a8f549a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -244,10 +244,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } - if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { - printf("Error: Class output index out of bounds! 
%d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", - glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); - } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -337,11 +333,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { - printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", - full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); - } - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; From f7cdc0be08798e9b10a490a8b08697d0826450bc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 Oct 2025 13:08:55 +0100 Subject: [PATCH 06/16] Fixing CI build issue --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index b214a8f549a56..ee0fa217b8095 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -195,7 +195,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { - for (int32_t target_pad = 0; target_pad < 
clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { From 3775044da0baef9edade6b9f2ac01041871b4166 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 21 Nov 2025 13:02:54 +0100 Subject: [PATCH 07/16] Working version of NN CCDB fetching and loading to file --- Common/ML/include/ML/OrtInterface.h | 1 + Common/ML/src/OrtInterface.cxx | 15 ++ .../src/NeuralNetworkClusterizer.cxx | 2 + GPU/GPUTracking/Definitions/GPUSettingsList.h | 10 +- .../include/GPUWorkflow/GPUWorkflowSpec.h | 5 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 168 +++++++++++++----- 6 files changed, 151 insertions(+), 50 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 04a5e0ba5c9fc..1893cf5a90179 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -51,6 +51,7 @@ class OrtModel void initOptions(std::unordered_map optionsMap); void initEnvironment(); void initSession(); + void initSessionFromBuffer(const void* buffer, size_t bufferSize); void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } void resetSession(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index d30d05d1d1a00..8d22d0075b887 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -138,6 +138,21 @@ void OrtModel::initEnvironment() (mPImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events } +void OrtModel::initSessionFromBuffer(const void* buffer, size_t bufferSize) +{ + mPImplOrt->session = std::make_unique(*mPImplOrt->env, + static_cast(buffer), + bufferSize, + mPImplOrt->sessionOptions); + mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); + + setIO(); + + if (mLoggingLevel < 2) { + LOG(info) << "(ORT) Model loaded 
successfully from buffer! (inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; + } +} + void OrtModel::initSession() { if (mAllocateDeviceMemory) { diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx index bfbb7afc946f8..5b0a06086e50d 100644 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -37,6 +37,8 @@ void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map const& tpcsectors, uint64_t tpcSectorMask, std::shared_ptr& ggr, std::function** gPolicyOrder = nullptr); @@ -230,7 +233,7 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; - std::unique_ptr mNNClusterizerSettings; + std::map nnCCDBSettings; Config mSpecConfig; std::shared_ptr mGGR; diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index d3d3eb14869e0..32a2a572523ea 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -133,50 +133,6 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); - GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; - - if (mNNClusterizerSettings.nnLoadFromCCDB) { - LOG(info) << "Loading neural networks from CCDB"; - o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; - nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); - std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, - {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, - {"outputFolder", 
mNNClusterizerSettings.nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; - - std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; - std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); - - if (evalMode[0] == "c1") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; - ccdbSettings["nnCCDBEvalType"] = "classification_c1"; - ccdbSettings["outputFile"] = "net_classification_c1.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } else if (evalMode[0] == "c2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; - ccdbSettings["nnCCDBEvalType"] = "classification_c2"; - ccdbSettings["outputFile"] = "net_classification_c2.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } - - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; - ccdbSettings["nnCCDBEvalType"] = "regression_c1"; - ccdbSettings["outputFile"] = "net_regression_c1.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - if (evalMode[1] == "r2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; - ccdbSettings["nnCCDBEvalType"] = "regression_c2"; - ccdbSettings["outputFile"] = "net_regression_c2.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } - LOG(info) << "Neural network loading done!"; - } // Create configuration object and fill settings mConfig->configGRP.solenoidBzNominalGPU = 0; @@ -185,6 +141,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic) mTFSettings->simStartOrbit = hbfu.getFirstIRofTF(o2::InteractionRecord(0, hbfu.orbitFirstSampled)).orbit; *mConfParam = 
mConfig->ReadConfigurableParam(); + if (mConfParam->display) { mDisplayFrontend.reset(GPUDisplayFrontendInterface::getFrontend(mConfig->configDisplay.displayFrontend.c_str())); mConfig->configProcessing.eventDisplay = mDisplayFrontend.get(); @@ -814,6 +771,68 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // ------------------------------ Actual processing ------------------------------ + if (mSpecConfig.nnLoadFromCCDB) { + LOG(info) << "(NN CLUS) Fetching CCDB calib objects"; + + auto dumpOnnxToFile = [](const char* buffer, std::size_t size, const std::string& path) { + const char* marker = "Accept-Ranges"; + const char* pos = std::search(buffer, buffer + size, marker, marker + std::strlen(marker)); + + // Compute the actual number of bytes to write + std::size_t writeSize = (pos != buffer + size) + ? static_cast(pos - buffer) + : size; + + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open ONNX output file: " + path); + } + + out.write(buffer, static_cast(writeSize)); + if (!out) { + throw std::runtime_error("Failed while writing ONNX data to: " + path); + } + }; + + GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; + std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + + DataRef m; + if (evalMode[0] == "c1") { + m = pc.inputs().get("nn_classification_c1"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_classification_c1 from CCDB to net_classification_c1.onnx"; + } + } else if (evalMode[0] == "c2") { + m = pc.inputs().get("nn_classification_c2"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + 
dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_classification_c2 from CCDB to net_classification_c2.onnx"; + } + } + + m = pc.inputs().get("nn_regression_c1"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_regression_c1 from CCDB to net_regression_c1.onnx"; + } + if (evalMode[1] == "r2") { + m = pc.inputs().get("nn_regression_c2"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_regression_c2 from CCDB to net_regression_c2.onnx"; + } + } + } if ((int32_t)(ptrs.tpcZS != nullptr) + (int32_t)(ptrs.tpcPackedDigits != nullptr && (ptrs.tpcZS == nullptr || ptrs.tpcPackedDigits->tpcDigitsMC == nullptr)) + (int32_t)(ptrs.clustersNative != nullptr) + (int32_t)(ptrs.tpcCompressedClusters != nullptr) != 1) { throw std::runtime_error("Invalid input for gpu tracking"); } @@ -1262,6 +1281,67 @@ Inputs GPURecoWorkflowSpec::inputs() } } + // NN clusterizer + *mConfParam = mConfig->ReadConfigurableParam(); + if (mConfig->configProcessing.nn.nnLoadFromCCDB) { + + LOG(info) << "(NN CLUS) Enabling fetching of TPC NN clusterizer from CCDB"; + mSpecConfig.nnLoadFromCCDB = true; + GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; + + std::map metadata; + metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 + metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = 
nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + + auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { + for (const auto& [key, value] : inputMap) { + if (value != "") { + outputMetadata.push_back({key, value}); + } + } + }; + + std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + std::vector ccdb_metadata; + + auto printSettings = [](const std::map& settings) { + LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; + for (const auto& [key, value] : settings) { + LOG(info) << " " << key << " : " << value; + } + }; + printSettings(metadata); + + if (evalMode[0] == "c1") { + metadata["nnCCDBEvalType"] = "classification_c1"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c1) from CCDB"; + } else if (evalMode[0] == "c2") { + metadata["nnCCDBEvalType"] = "classification_c2"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c2) from CCDB"; + } + + metadata["nnCCDBEvalType"] = "regression_c1"; + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer regression 
(r1) from CCDB"; + + if (evalMode[1] == "r2") { + metadata["nnCCDBEvalType"] = "regression_c2"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r2) from CCDB"; + } + } + return inputs; }; From a963c0191e3b4e37c81e09497512281907e58add Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 21 Nov 2025 16:28:19 +0100 Subject: [PATCH 08/16] Cleanup --- Detectors/TPC/calibration/CMakeLists.txt | 2 - .../TPCCalibration/NeuralNetworkClusterizer.h | 38 -------------- .../src/NeuralNetworkClusterizer.cxx | 50 ------------------- .../GPUTPCNNClusterizerHost.cxx | 2 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 25 ++++------ 5 files changed, 12 insertions(+), 105 deletions(-) delete mode 100644 Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h delete mode 100644 Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 8bcb3254edb32..e5cc25230d2fc 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,7 +25,6 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx - src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -84,7 +83,6 @@ o2_target_root_dictionary(TPCCalibration include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h - include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h 
b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h deleted file mode 100644 index 196bba644714c..0000000000000 --- a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file NeuralNetworkClusterizer.h -/// \brief Fetching neural networks for clusterization from CCDB -/// \author Christian Sonnabend - -#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h -#define AliceO2_TPC_NeuralNetworkClusterizer_h - -#include "CCDB/CcdbApi.h" - -namespace o2::tpc -{ - -class NeuralNetworkClusterizer -{ - public: - NeuralNetworkClusterizer() = default; - void initCcdbApi(std::string url); - void loadIndividualFromCCDB(std::map settings); - - private: - o2::ccdb::CcdbApi ccdbApi; - std::map metadata; - std::map headers; -}; - -} // namespace o2::tpc -#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx deleted file mode 100644 index 5b0a06086e50d..0000000000000 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file NeuralNetworkClusterizer.cxx -/// \brief Fetching neural networks for clusterization from CCDB -/// \author Christian Sonnabend - -#include -#include "TPCCalibration/NeuralNetworkClusterizer.h" - -using namespace o2::tpc; - -void NeuralNetworkClusterizer::initCcdbApi(std::string url) -{ - ccdbApi.init(url); -} - -void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) -{ - metadata["inputDType"] = settings["inputDType"]; - metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN - if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; - } - if (settings["nnCCDBBeamType"] != "") { - metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; - } - - LOG(info) << "(NN CLUS) Retrieving network " << settings["nnCCDBPath"] << " from CCDB (NeuralNetworkClusterizer.cxx)"; - - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables - - if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; - } else { 
- LOG(error) << "Failed to retrieve network from CCDB"; - } -} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 582a0c6d7435a..77d5ee13f85fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -36,7 +36,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); if (settings.nnLoadFromCCDB) { - reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to GPUWorkflowSpec.cxx, otherwise the networks might be loaded from the wrong place if (evalMode[0] == "c1") { class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; } else if (evalMode[0] == "c2") { diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 32a2a572523ea..c5a75dcd762df 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,7 +78,6 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" -#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -804,7 +803,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_classification_c1 from CCDB to net_classification_c1.onnx"; } } else if (evalMode[0] == "c2") { m = pc.inputs().get("nn_classification_c2"); @@ -812,7 +810,6 @@ void 
GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_classification_c2 from CCDB to net_classification_c2.onnx"; } } @@ -821,7 +818,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_regression_c1 from CCDB to net_regression_c1.onnx"; } if (evalMode[1] == "r2") { m = pc.inputs().get("nn_regression_c2"); @@ -829,7 +825,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_regression_c2 from CCDB to net_regression_c2.onnx"; } } } @@ -1292,9 +1287,9 @@ Inputs GPURecoWorkflowSpec::inputs() std::map metadata; metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 - metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN - metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { @@ -1308,13 +1303,15 @@ Inputs 
GPURecoWorkflowSpec::inputs() std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); std::vector ccdb_metadata; - auto printSettings = [](const std::map& settings) { - LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; - for (const auto& [key, value] : settings) { - LOG(info) << " " << key << " : " << value; - } - }; - printSettings(metadata); + if (mConfParam->printSettings) { + auto printSettings = [](const std::map& settings) { + LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; + for (const auto& [key, value] : settings) { + LOG(info) << " " << key << " : " << value; + } + }; + printSettings(metadata); + } if (evalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; From caf20fc089be9e1dc9a6fab02ab90814b2935627 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 21 Nov 2025 15:35:02 +0000 Subject: [PATCH 09/16] Please consider the following formatting changes --- GPU/Workflow/src/GPUWorkflowSpec.cxx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index c5a75dcd762df..a5bdc53911e58 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -779,17 +779,17 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // Compute the actual number of bytes to write std::size_t writeSize = (pos != buffer + size) - ? static_cast(pos - buffer) - : size; + ? 
static_cast(pos - buffer) + : size; std::ofstream out(path, std::ios::binary | std::ios::trunc); if (!out.is_open()) { - throw std::runtime_error("Failed to open ONNX output file: " + path); + throw std::runtime_error("Failed to open ONNX output file: " + path); } out.write(buffer, static_cast(writeSize)); if (!out) { - throw std::runtime_error("Failed while writing ONNX data to: " + path); + throw std::runtime_error("Failed while writing ONNX data to: " + path); } }; @@ -1285,12 +1285,12 @@ Inputs GPURecoWorkflowSpec::inputs() GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; std::map metadata; - metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 - metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 - metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN - metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz - metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 + metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { for (const auto& [key, value] : inputMap) { From 5284b01db9b48d2cd97ad569e378c25eabb267a9 Mon Sep 17 00:00:00 2001 From: Christian 
Sonnabend Date: Sun, 23 Nov 2025 11:52:54 +0100 Subject: [PATCH 10/16] Using char* buffer for model loading --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 7 +- GPU/GPUTracking/DataTypes/GPUDataTypes.h | 4 + .../Global/GPUChainTrackingClusterizer.cxx | 20 ++++- .../include/GPUWorkflow/GPUWorkflowSpec.h | 2 + GPU/Workflow/src/GPUWorkflowSpec.cxx | 77 +++---------------- GPU/Workflow/src/GPUWorkflowTPC.cxx | 70 +++++++++++++++++ 7 files changed, 109 insertions(+), 73 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 1893cf5a90179..987ce8fb4d6dd 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -51,7 +51,7 @@ class OrtModel void initOptions(std::unordered_map optionsMap); void initEnvironment(); void initSession(); - void initSessionFromBuffer(const void* buffer, size_t bufferSize); + void initSessionFromBuffer(const char* buffer, size_t bufferSize); void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } void resetSession(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8d22d0075b887..8f88ab18dacbd 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -138,10 +138,13 @@ void OrtModel::initEnvironment() (mPImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events } -void OrtModel::initSessionFromBuffer(const void* buffer, size_t bufferSize) +void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize) { + mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX"); + mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1"); + mPImplOrt->session = std::make_unique(*mPImplOrt->env, - static_cast(buffer), + buffer, bufferSize, mPImplOrt->sessionOptions); mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h 
b/GPU/GPUTracking/DataTypes/GPUDataTypes.h index 967d6a73914dd..e84ebe7231b34 100644 --- a/GPU/GPUTracking/DataTypes/GPUDataTypes.h +++ b/GPU/GPUTracking/DataTypes/GPUDataTypes.h @@ -182,6 +182,10 @@ struct GPUCalibObjectsTemplate { // use only pointers on PODs or flat objects he typename S::type* dEdxCalibContainer = nullptr; typename S>::type* o2Propagator = nullptr; typename S::type* itsPatternDict = nullptr; + + // NN clusterizer objects + char* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; // [c, r1, r2] networks as char arrays from CCDB + uint32_t nnClusterizerNetworkSizes[3] = {0, 0, 0}; }; typedef GPUCalibObjectsTemplate GPUCalibObjects; // NOTE: These 2 must have identical layout since they are memcopied typedef GPUCalibObjectsTemplate GPUCalibObjectsConst; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bfb0457744ce5..2b4003396e4f7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -639,7 +639,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - if (GetProcessingSettings().nn.applyNNclusterizer) { + if (nn_settings.applyNNclusterizer) { int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; int32_t maxThreads = mRec->getNKernelHostThreads(true); @@ -677,7 +677,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); // } // recreateMemoryAllocator = true; - (nnApplications[lane].mModelClass).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelClass).initSession(); // loads from file + } else { + 
(nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB + } } if (nnApplications[lane].mModelsUsed[1]) { SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId); @@ -688,7 +692,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].mModelReg1).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelReg1).initSession(); // loads from file + } else { + (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB + } } if (nnApplications[lane].mModelsUsed[2]) { SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId); @@ -699,7 +707,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].mModelReg2).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelReg2).initSession(); // loads from file + } else { + (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB + } } if 
(nn_settings.nnClusterizerVerbosity > 0) { LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index 3655ffd2dd0dc..d610269abca81 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -138,6 +138,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task // NN clusterizer bool nnLoadFromCCDB = false; + bool nnDumpToFile = false; + std::vector nnEvalMode; }; GPURecoWorkflowSpec(CompletionPolicyData* policyData, Config const& specconfig, std::vector const& tpcsectors, uint64_t tpcSectorMask, std::shared_ptr& ggr, std::function** gPolicyOrder = nullptr); diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index a5bdc53911e58..11c26c40f3077 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -770,64 +770,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // ------------------------------ Actual processing ------------------------------ - if (mSpecConfig.nnLoadFromCCDB) { - LOG(info) << "(NN CLUS) Fetching CCDB calib objects"; - - auto dumpOnnxToFile = [](const char* buffer, std::size_t size, const std::string& path) { - const char* marker = "Accept-Ranges"; - const char* pos = std::search(buffer, buffer + size, marker, marker + std::strlen(marker)); - - // Compute the actual number of bytes to write - std::size_t writeSize = (pos != buffer + size) - ? 
static_cast(pos - buffer) - : size; - - std::ofstream out(path, std::ios::binary | std::ios::trunc); - if (!out.is_open()) { - throw std::runtime_error("Failed to open ONNX output file: " + path); - } - - out.write(buffer, static_cast(writeSize)); - if (!out) { - throw std::runtime_error("Failed while writing ONNX data to: " + path); - } - }; - - GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; - std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); - - DataRef m; - if (evalMode[0] == "c1") { - m = pc.inputs().get("nn_classification_c1"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); - } - } else if (evalMode[0] == "c2") { - m = pc.inputs().get("nn_classification_c2"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); - } - } - - m = pc.inputs().get("nn_regression_c1"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); - } - if (evalMode[1] == "r2") { - m = pc.inputs().get("nn_regression_c2"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); - } - } - } if ((int32_t)(ptrs.tpcZS != nullptr) + (int32_t)(ptrs.tpcPackedDigits != nullptr && (ptrs.tpcZS == nullptr || ptrs.tpcPackedDigits->tpcDigitsMC == nullptr)) + (int32_t)(ptrs.clustersNative != nullptr) + (int32_t)(ptrs.tpcCompressedClusters != nullptr) != 1) { throw std::runtime_error("Invalid input for gpu tracking"); 
} @@ -1138,6 +1080,12 @@ void GPURecoWorkflowSpec::doCalibUpdates(o2::framework::ProcessingContext& pc, c newCalibValues.tpcTimeBinCut = mConfig->configGRP.tpcCutTimeBin = mTPCCutAtTimeBin; needCalibUpdate = true; } + if (mSpecConfig.nnLoadFromCCDB) { + for (int i = 0; i < 3; i++) { + newCalibObjects.nnClusterizerNetworks[i] = mConfig->configCalib.nnClusterizerNetworks[i]; + newCalibObjects.nnClusterizerNetworkSizes[i] = mConfig->configCalib.nnClusterizerNetworkSizes[i]; + } + } if (needCalibUpdate) { LOG(info) << "Updating GPUReconstruction calibration objects"; mGPUReco->UpdateCalibration(newCalibObjects, newCalibValues); @@ -1282,6 +1230,7 @@ Inputs GPURecoWorkflowSpec::inputs() LOG(info) << "(NN CLUS) Enabling fetching of TPC NN clusterizer from CCDB"; mSpecConfig.nnLoadFromCCDB = true; + mSpecConfig.nnDumpToFile = mConfig->configProcessing.nn.nnCCDBDumpToFile; GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; std::map metadata; @@ -1300,7 +1249,7 @@ Inputs GPURecoWorkflowSpec::inputs() } }; - std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + mSpecConfig.nnEvalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); std::vector ccdb_metadata; if (mConfParam->printSettings) { @@ -1313,29 +1262,25 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if (evalMode[0] == "c1") { + if (mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c1) from CCDB"; - } else if (evalMode[0] == "c2") { + } else if (mSpecConfig.nnEvalMode[0] == "c2") { metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); 
inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c2) from CCDB"; } metadata["nnCCDBEvalType"] = "regression_c1"; metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r1) from CCDB"; - if (evalMode[1] == "r2") { + if (mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r2) from CCDB"; } } diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 6606386819b64..2d7d63e1cf25b 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -405,6 +405,76 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon newCalibObjects.tpcPadGain = mCalibObjects.mTPCPadGainCalib.get(); mustUpdate = true; } + + // NN clusterizer networks + if (mSpecConfig.nnLoadFromCCDB) { + + auto findValidObjectEnd = [](const char* buffer, std::size_t size) { + const char* marker = "Accept-Ranges"; + std::size_t markerLen = std::strlen(marker); + + auto rpos = std::search( + std::make_reverse_iterator(buffer + size), + std::make_reverse_iterator(buffer), + std::make_reverse_iterator(marker + markerLen), + std::make_reverse_iterator(marker)); + + if (rpos == std::make_reverse_iterator(buffer)) { + return size; // Marker not found: keep full buffer + } + + const 
char* pos = rpos.base() - markerLen; // Convert reverse iterator back + return static_cast(pos - buffer); + }; + + auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open output file: " + path); + } + + out.write(buffer, static_cast(validSize)); + if (!out) { + throw std::runtime_error("Failed while writing data to: " + path); + } + }; + + DataRef m; + if (mSpecConfig.nnEvalMode[0] == "c1") { + m = pc.inputs().get("nn_classification_c1"); + mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c1.onnx"); + } + } else if (mSpecConfig.nnEvalMode[0] == "c2") { + m = pc.inputs().get("nn_classification_c2"); + mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c2.onnx"); + } + } + + m = pc.inputs().get("nn_regression_c1"); + mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], 
mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); + } + if (mSpecConfig.nnEvalMode[1] == "r2") { + m = pc.inputs().get("nn_regression_c2"); + mConfig->configCalib.nnClusterizerNetworks[3] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[3], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[3], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); + } + } + } } return mustUpdate; } From ab19782deff6d71ba65fd774b1388991e6ead679 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sun, 23 Nov 2025 10:53:45 +0000 Subject: [PATCH 11/16] Please consider the following formatting changes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 2b4003396e4f7..5daf3043f6c47 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -677,7 +677,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); // } // recreateMemoryAllocator = true; - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelClass).initSession(); // loads from file } else { (nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB @@ -692,7 +692,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // 
(nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg1).initSession(); // loads from file } else { (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB @@ -707,7 +707,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg2).initSession(); // loads from file } else { (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB From 4fed621b94cc7583649b479bee3672c8acbfdb4c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 23 Nov 2025 12:37:09 +0100 Subject: [PATCH 12/16] Bug-fix --- GPU/Workflow/src/GPUWorkflowTPC.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 2d7d63e1cf25b..35103a6da6842 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -459,19 +459,19 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon } m = pc.inputs().get("nn_regression_c1"); - 
mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); + mConfig->configCalib.nnClusterizerNetworks[1] = const_cast(m.payload); size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); + mConfig->configCalib.nnClusterizerNetworkSizes[1] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[1], size); if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[1], mConfig->configCalib.nnClusterizerNetworkSizes[1], "net_regression_c1.onnx"); } if (mSpecConfig.nnEvalMode[1] == "r2") { m = pc.inputs().get("nn_regression_c2"); - mConfig->configCalib.nnClusterizerNetworks[3] = const_cast(m.payload); + mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[3], size); + mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[3], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c2.onnx"); } } } From e7cd6fa5d7c5a6cbebdb2c0ef31b689fa981ac76 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 11:26:01 +0100 Subject: [PATCH 13/16] Working version of CCDB fetching and loading into ROOT class of std::vector --- .../TPC/base/test/testTPCCDBInterface.cxx | 1 - GPU/GPUTracking/CMakeLists.txt | 1 + GPU/GPUTracking/DataTypes/GPUDataTypes.h | 4 +-
.../DataTypes/ORTRootSerializer.cxx | 25 ++ GPU/GPUTracking/DataTypes/ORTRootSerializer.h | 43 ++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- .../GPUTrackingLinkDef_O2_DataTypes.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../utils/convert_onnx_to_root_serialized.C | 220 ++++++++++++++++++ GPU/Workflow/src/GPUWorkflowSpec.cxx | 33 ++- GPU/Workflow/src/GPUWorkflowTPC.cxx | 76 ++---- 11 files changed, 339 insertions(+), 74 deletions(-) create mode 100644 GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx create mode 100644 GPU/GPUTracking/DataTypes/ORTRootSerializer.h create mode 100644 GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C diff --git a/Detectors/TPC/base/test/testTPCCDBInterface.cxx b/Detectors/TPC/base/test/testTPCCDBInterface.cxx index 3074c5e90a00c..5a5384a4134ed 100644 --- a/Detectors/TPC/base/test/testTPCCDBInterface.cxx +++ b/Detectors/TPC/base/test/testTPCCDBInterface.cxx @@ -22,7 +22,6 @@ // o2 includes #include "TPCBase/CDBInterface.h" -#include "TPCBase/CDBInterface.h" #include "TPCBase/CalArray.h" #include "TPCBase/CalDet.h" #include "TPCBase/Mapper.h" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 2a0b9b9edfa09..6dd718f07a9f1 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -209,6 +209,7 @@ set(SRCS_DATATYPES DataTypes/TPCPadBitMap.cxx DataTypes/TPCZSLinkMapping.cxx DataTypes/CalibdEdxContainer.cxx + DataTypes/ORTRootSerializer.cxx DataTypes/CalibdEdxTrackTopologyPol.cxx DataTypes/CalibdEdxTrackTopologySpline.cxx DataTypes/GPUTRDTrackO2.cxx) diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h b/GPU/GPUTracking/DataTypes/GPUDataTypes.h index e84ebe7231b34..8bf8084e048fd 100644 --- a/GPU/GPUTracking/DataTypes/GPUDataTypes.h +++ b/GPU/GPUTracking/DataTypes/GPUDataTypes.h @@ -85,6 +85,7 @@ class Cluster; namespace tpc { class CalibdEdxContainer; +class ORTRootSerializer; } // namespace tpc } // namespace o2 @@ -184,8 +185,7 @@ struct 
GPUCalibObjectsTemplate { // use only pointers on PODs or flat objects he typename S::type* itsPatternDict = nullptr; // NN clusterizer objects - char* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; // [c, r1, r2] networks as char arrays from CCDB - uint32_t nnClusterizerNetworkSizes[3] = {0, 0, 0}; + typename S::type* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; }; typedef GPUCalibObjectsTemplate GPUCalibObjects; // NOTE: These 2 must have identical layout since they are memcopied typedef GPUCalibObjectsTemplate GPUCalibObjectsConst; diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx b/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx new file mode 100644 index 0000000000000..82a8be1fdfec8 --- /dev/null +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx @@ -0,0 +1,25 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file ORTRootSerializer.cxx +/// \author Christian Sonnabend + +#include "ORTRootSerializer.h" +#include + +using namespace o2::tpc; + +/// Initialize the serialization from a char* buffer containing the model +void ORTRootSerializer::setOnnxModel(const char* onnxModel, uint32_t size) +{ + mModelBuffer.resize(size); + std::memcpy(mModelBuffer.data(), onnxModel, size); +} diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h new file mode 100644 index 0000000000000..9952abfd55997 --- /dev/null +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h @@ -0,0 +1,43 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file ORTRootSerializer.h +/// \brief Class to serialize ONNX objects for ROOT snapshots of CCDB objects at runtime +/// \author Christian Sonnabend + +#ifndef ALICEO2_TPC_ORTROOTSERIALIZER_H_ +#define ALICEO2_TPC_ORTROOTSERIALIZER_H_ + +#include "GPUCommonRtypes.h" +#include +#include + +namespace o2::tpc +{ + +class ORTRootSerializer +{ + public: + ORTRootSerializer() = default; + ~ORTRootSerializer() = default; + + void setOnnxModel(const char* onnxModel, uint32_t size); + const char* getONNXModel() const { return mModelBuffer.data(); } + uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } + + private: + std::vector mModelBuffer; ///< buffer for serialization + ClassDefNV(ORTRootSerializer, 1); +}; + +} // namespace o2::tpc + +#endif // ALICEO2_TPC_ORTROOTSERIALIZER_H_ diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index aa9e8a78c4484..06f681dea4c1a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -277,8 +277,8 @@ AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the bo AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") -AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") +AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If 
true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification") diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h index 46fd50464c69b..7bd2c689c5354 100644 --- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h +++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h @@ -43,5 +43,6 @@ #pragma link C++ class o2::tpc::CalibdEdxTrackTopologyPol + ; #pragma link C++ class o2::tpc::CalibdEdxTrackTopologySpline + ; #pragma link C++ struct o2::tpc::CalibdEdxTrackTopologyPolContainer + ; +#pragma link C++ struct o2::tpc::ORTRootSerializer + ; #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 5daf3043f6c47..5426f0eafdad6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -47,6 +47,7 @@ #ifdef GPUCA_HAS_ONNX #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" +#include "ORTRootSerializer.h" #endif #ifdef GPUCA_O2_LIB @@ -680,7 +681,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelClass).initSession(); // loads from file } else { - (nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB + 
(nnApplications[lane].mModelClass).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModelSize()); // loads from CCDB } } if (nnApplications[lane].mModelsUsed[1]) { @@ -695,7 +696,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg1).initSession(); // loads from file } else { - (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB + (nnApplications[lane].mModelReg1).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModelSize()); // loads from CCDB } } if (nnApplications[lane].mModelsUsed[2]) { @@ -710,7 +711,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg2).initSession(); // loads from file } else { - (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB + (nnApplications[lane].mModelReg2).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModelSize()); // loads from CCDB } } if (nn_settings.nnClusterizerVerbosity > 0) { diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C new file mode 100644 index 0000000000000..1970c57353e54 --- /dev/null +++ b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C @@ -0,0 +1,220 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file convert_onnx_to_root_serialized.C +/// \brief Utility functions to be executed as a ROOT macro for uploading ONNX models to CCDB as ROOT serialized objects and vice versa +/// \author Christian Sonnabend + +// Example execution: root -l -b -q '/scratch/csonnabe/MyO2/O2/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C("/scratch/csonnabe/PhD/jobs/clusterization/NN/output/21082025_smallWindow_clean/SC/training_data_21082025_reco_noise_supressed_p3t6_CoGselected/SC/PbPb_24arp2/0_5/class1/regression/399_noMom/network/net_fp16.onnx", "", 1, 1, "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", 1, 4108971600000, "Users/c/csonnabe/TPC/Clusterization", "model.root")' + +#include "ORTRootSerializer.h" +#include "CCDB/CcdbApi.h" +#include "CCDB/CcdbObjectInfo.h" +#include "TFile.h" +#include +#include + +o2::tpc::ORTRootSerializer serializer; + +/// Dumps the char* to a .onnx file -> Directly readable by ONNX runtime or Netron +void dumpOnnxToFile(const char* modelBuffer, uint32_t size, const std::string outputPath) +{ + std::ofstream outFile(outputPath, std::ios::binary | std::ios::trunc); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open output ONNX file: " + outputPath); + } + outFile.write(modelBuffer, static_cast(size)); + if (!outFile) { + throw std::runtime_error("Failed while writing data to: " + outputPath); + } + 
outFile.close(); +} + +/// Initialize the serialization from an ONNX file +void readOnnxModelFromFile(const std::string modelPath) +{ + std::ifstream inFile(modelPath, std::ios::binary | std::ios::ate); + if (!inFile.is_open()) { + throw std::runtime_error("Could not open input ONNX file " + modelPath); + } + std::streamsize size = inFile.tellg(); + std::vector mModelBuffer(size); + inFile.seekg(0, std::ios::beg); + if (!inFile.read(mModelBuffer.data(), size)) { + throw std::runtime_error("Could not read input ONNX file " + modelPath); + } + inFile.close(); + serializer.setOnnxModel(mModelBuffer.data(), static_cast(size)); +} + +/// Initialize the serialization from a ROOT file +void readRootModelFromFile(const std::string rootFilePath, std::string key) +{ + TFile inRootFile(rootFilePath.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + rootFilePath); + } + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + throw std::runtime_error("Could not find " + key + " in ROOT file " + rootFilePath); + } + serializer = *serPtr; + inRootFile.Close(); +} + +/// Serialize the ONNX model to a ROOT object and store to file +void onnxToRoot(std::string infile, std::string outfile, std::string key) +{ + readOnnxModelFromFile(infile); + TFile outRootFile(outfile.c_str(), "RECREATE"); + if (outRootFile.IsZombie()) { + throw std::runtime_error("Could not create output ROOT file " + outfile); + } + outRootFile.WriteObject(&serializer, key.c_str()); + outRootFile.Close(); +} + +/// Deserialize the ONNX model from a ROOT object and store to a .onnx file +void rootToOnnx(std::string infile, std::string outfile, std::string key) +{ + TFile inRootFile(infile.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + infile); + } + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + throw std::runtime_error("Could not find " + key + " in ROOT file " + infile); + } + 
serializer = *serPtr; + + std::ofstream outFile(outfile, std::ios::binary | std::ios::trunc); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open output ONNX file: " + outfile); + } + outFile.write(serializer.getONNXModel(), static_cast(serializer.getONNXModelSize())); + if (!outFile) { + throw std::runtime_error("Failed while writing data to: " + outfile); + } + outFile.close(); + + inRootFile.Close(); +} + +/// Upload the ONNX model to CCDB from an ONNX file +/// !!! Adjust the metadata, path and validity !!! +void uploadToCCDBFromONNX(std::string onnxFile, + const std::map& metadata, + // { // some example metadata entries + // "nnCCDBLayerType": "FC", + // "nnCCDBWithMomentum": "0", + // "inputDType": "FP16", + // "nnCCDBInteractionRate": "500", + // "outputDType": "FP16", + // "nnCCDBEvalType": "regression_c1", + // "nnCCDBBeamType": "pp", + // "partName": "blob", + // "quality": "3" + // } + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) +{ + readOnnxModelFromFile(onnxFile); + + o2::ccdb::CcdbApi api; + api.init(ccdbUrl); + + // build full CCDB path including filename + const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + + api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); +} + +/// Upload the ONNX model to CCDB from a ROOT file +/// !!! Adjust the metadata, path and validity !!! 
+void uploadToCCDBFromROOT(std::string rootFile, + const std::map& metadata, + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) +{ + // read ROOT file, extract ORTRootSerializer object and upload via storeAsTFileAny + TFile inRootFile(rootFile.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + rootFile); + } + + // if objname is empty, fall back to default CCDB object key + const std::string key = objname.empty() ? o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY : objname; + + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + inRootFile.Close(); + throw std::runtime_error("Could not find " + key + " in ROOT file " + rootFile); + } + serializer = *serPtr; + + o2::ccdb::CcdbApi api; + api.init(ccdbUrl); + + // build full CCDB path including filename + const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + + api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); + + inRootFile.Close(); +} + +void convert_onnx_to_root_serialized(const std::string& onnxFile, + const std::string& rootFile, + int mode = 0, + int ccdbUpload = 0, + const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", + long tsMin = 1, + long tsMax = 4108971600000, + std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", + std::string objname = "net_regression_r1.root", + std::string ccdbUrl = "http://alice-ccdb.cern.ch") +{ + // parse metadataStr of the form key=value/key2=value2/... 
+ std::map metadata; + std::size_t start = 0; + while (start < metadataStr.size()) { + auto sep = metadataStr.find('/', start); + auto token = metadataStr.substr(start, sep == std::string::npos ? std::string::npos : sep - start); + if (!token.empty()) { + auto eq = token.find('='); + if (eq != std::string::npos && eq > 0 && eq + 1 < token.size()) { + metadata.emplace(token.substr(0, eq), token.substr(eq + 1)); + } + } + if (sep == std::string::npos) { + break; + } + start = sep + 1; + } + + if (ccdbUpload == 0){ + if (mode == 0) + onnxToRoot(onnxFile, rootFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); + else if (mode == 1) + rootToOnnx(rootFile, onnxFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); + } else if (ccdbUpload == 1){ + if (mode == 0) + uploadToCCDBFromROOT(rootFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); + else if (mode == 1) + uploadToCCDBFromONNX(onnxFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); + } +} diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 11c26c40f3077..c0e46cc7b75f3 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -54,6 +54,7 @@ #include "GPUO2Interface.h" #include "GPUO2InterfaceUtils.h" #include "CalibdEdxContainer.h" +#include "ORTRootSerializer.h" #include "GPUNewCalibValues.h" #include "TPCPadGainCalib.h" #include "TPCZSLinkMapping.h" @@ -1081,9 +1082,24 @@ void GPURecoWorkflowSpec::doCalibUpdates(o2::framework::ProcessingContext& pc, c needCalibUpdate = true; } if (mSpecConfig.nnLoadFromCCDB) { + auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open output file: " + path); + } + + out.write(buffer, static_cast(validSize)); + if (!out) { + throw std::runtime_error("Failed while writing data to: " + path); + } + }; for (int i = 0; i < 3; i++) { 
newCalibObjects.nnClusterizerNetworks[i] = mConfig->configCalib.nnClusterizerNetworks[i]; - newCalibObjects.nnClusterizerNetworkSizes[i] = mConfig->configCalib.nnClusterizerNetworkSizes[i]; + if (mSpecConfig.nnDumpToFile && newCalibObjects.nnClusterizerNetworks[i]) { + std::string path = "tpc_nn_clusterizer_" + std::to_string(i) + ".onnx"; + dumpToFile(newCalibObjects.nnClusterizerNetworks[i]->getONNXModel(), newCalibObjects.nnClusterizerNetworks[i]->getONNXModelSize(), path); + LOG(info) << "Dumped TPC clusterizer NN " << i << " to file " << path; + } } } if (needCalibUpdate) { @@ -1262,25 +1278,26 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if (mSpecConfig.nnEvalMode[0] == "c1") { + if(mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - } else if (mSpecConfig.nnEvalMode[0] == "c2") { + inputs.emplace_back("nn_classification_c1", gDataOriginTPC, "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); + } else if(mSpecConfig.nnEvalMode[0] == "c2") { + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_classification_c2", gDataOriginTPC, "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); } metadata["nnCCDBEvalType"] = "regression_c1"; metadata["nnCCDBLayerType"] = 
nnClusterizerSettings.nnCCDBRegressionLayerType; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_regression_c1", gDataOriginTPC, "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - if (mSpecConfig.nnEvalMode[1] == "r2") { + if(mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_regression_c2", gDataOriginTPC, "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); } } diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 35103a6da6842..ae9aabda1fd74 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -49,6 +49,7 @@ #include "GPUO2Interface.h" #include "GPUO2InterfaceUtils.h" #include "CalibdEdxContainer.h" +#include "ORTRootSerializer.h" #include "GPUNewCalibValues.h" #include "TPCPadGainCalib.h" #include "TPCZSLinkMapping.h" @@ -293,6 +294,18 @@ void GPURecoWorkflowSpec::finaliseCCDBTPC(ConcreteDataMatcher& matcher, void* ob mTPCDeadChannelMapCreator->getDeadChannelMapFEE().getSum(), mTPCDeadChannelMapCreator->getDeadChannelMap().getSum()); } else if (mTPCVDriftHelper->accountCCDBInputs(matcher, obj)) { } else if (mCalibObjects.mFastTransformHelper->accountCCDBInputs(matcher, obj)) { + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C1", 0)) { + mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); + LOG(info) << 
"(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c1"; + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C2", 0)) { + mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c2"; + } else if(matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)){ + mConfig->configCalib.nnClusterizerNetworks[1] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[1])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c1"; + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R2", 0)) { + mConfig->configCalib.nnClusterizerNetworks[2] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[2])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c2"; } } @@ -409,70 +422,15 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon // NN clusterizer networks if (mSpecConfig.nnLoadFromCCDB) { - auto findValidObjectEnd = [](const char* buffer, std::size_t size) { - const char* marker = "Accept-Ranges"; - std::size_t markerLen = std::strlen(marker); - - auto rpos = std::search( - std::make_reverse_iterator(buffer + size), - std::make_reverse_iterator(buffer), - std::make_reverse_iterator(marker + markerLen), - std::make_reverse_iterator(marker)); - - if (rpos == std::make_reverse_iterator(buffer)) { - return size; // Marker not found: keep full buffer - } - - const char* pos = rpos.base() - markerLen; // Convert reverse iterator back - return static_cast(pos - buffer); - }; - - auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { - std::ofstream out(path, std::ios::binary | std::ios::trunc); - if (!out.is_open()) { - throw 
std::runtime_error("Failed to open output file: " + path); - } - - out.write(buffer, static_cast(validSize)); - if (!out) { - throw std::runtime_error("Failed while writing data to: " + path); - } - }; - - DataRef m; if (mSpecConfig.nnEvalMode[0] == "c1") { - m = pc.inputs().get("nn_classification_c1"); - mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c1.onnx"); - } + pc.inputs().get("nn_classification_c1"); } else if (mSpecConfig.nnEvalMode[0] == "c2") { - m = pc.inputs().get("nn_classification_c2"); - mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c2.onnx"); - } + pc.inputs().get("nn_classification_c2"); } - m = pc.inputs().get("nn_regression_c1"); - mConfig->configCalib.nnClusterizerNetworks[1] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[1] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[1], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); - } + pc.inputs().get("nn_regression_c1"); if (mSpecConfig.nnEvalMode[1] == "r2") { - m = pc.inputs().get("nn_regression_c2"); - mConfig->configCalib.nnClusterizerNetworks[2] = 
const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); - } + pc.inputs().get("nn_regression_c2"); } } } From 9ed60e981652ff968c7a563246141e20f8404609 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Nov 2025 10:26:45 +0000 Subject: [PATCH 14/16] Please consider the following formatting changes --- GPU/GPUTracking/DataTypes/ORTRootSerializer.h | 18 ++--- .../utils/convert_onnx_to_root_serialized.C | 72 +++++++++---------- GPU/Workflow/src/GPUWorkflowSpec.cxx | 6 +- GPU/Workflow/src/GPUWorkflowTPC.cxx | 2 +- 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h index 9952abfd55997..24009d4435a96 100644 --- a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h @@ -25,17 +25,17 @@ namespace o2::tpc class ORTRootSerializer { - public: - ORTRootSerializer() = default; - ~ORTRootSerializer() = default; + public: + ORTRootSerializer() = default; + ~ORTRootSerializer() = default; - void setOnnxModel(const char* onnxModel, uint32_t size); - const char* getONNXModel() const { return mModelBuffer.data(); } - uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } + void setOnnxModel(const char* onnxModel, uint32_t size); + const char* getONNXModel() const { return mModelBuffer.data(); } + uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } - private: - std::vector mModelBuffer; ///< buffer for serialization - ClassDefNV(ORTRootSerializer, 1); + private: + std::vector mModelBuffer; ///< buffer for serialization + ClassDefNV(ORTRootSerializer, 1); }; } // 
namespace o2::tpc diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C index 1970c57353e54..b1b8b981393a1 100644 --- a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C +++ b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C @@ -111,23 +111,23 @@ void rootToOnnx(std::string infile, std::string outfile, std::string key) /// Upload the ONNX model to CCDB from an ONNX file /// !!! Adjust the metadata, path and validity !!! void uploadToCCDBFromONNX(std::string onnxFile, - const std::map& metadata, - // { // some example metadata entries - // "nnCCDBLayerType": "FC", - // "nnCCDBWithMomentum": "0", - // "inputDType": "FP16", - // "nnCCDBInteractionRate": "500", - // "outputDType": "FP16", - // "nnCCDBEvalType": "regression_c1", - // "nnCCDBBeamType": "pp", - // "partName": "blob", - // "quality": "3" - // } - long tsMin /* = 1 */, - long tsMax /* = 4108971600000 */, - std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, - std::string objname /* = "net_regression_r1.root" */, - std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) + const std::map& metadata, + // { // some example metadata entries + // "nnCCDBLayerType": "FC", + // "nnCCDBWithMomentum": "0", + // "inputDType": "FP16", + // "nnCCDBInteractionRate": "500", + // "outputDType": "FP16", + // "nnCCDBEvalType": "regression_c1", + // "nnCCDBBeamType": "pp", + // "partName": "blob", + // "quality": "3" + // } + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) { readOnnxModelFromFile(onnxFile); @@ -135,7 +135,7 @@ void uploadToCCDBFromONNX(std::string onnxFile, api.init(ccdbUrl); // build full CCDB path including filename - const std::string fullPath = ccdbPath;//.back() == '/' ? 
(ccdbPath + objname) : (ccdbPath + "/" + objname); + const std::string fullPath = ccdbPath; //.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); } @@ -143,12 +143,12 @@ void uploadToCCDBFromONNX(std::string onnxFile, /// Upload the ONNX model to CCDB from a ROOT file /// !!! Adjust the metadata, path and validity !!! void uploadToCCDBFromROOT(std::string rootFile, - const std::map& metadata, - long tsMin /* = 1 */, - long tsMax /* = 4108971600000 */, - std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, - std::string objname /* = "net_regression_r1.root" */, - std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) + const std::map& metadata, + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) { // read ROOT file, extract ORTRootSerializer object and upload via storeAsTFileAny TFile inRootFile(rootFile.c_str()); @@ -170,7 +170,7 @@ void uploadToCCDBFromROOT(std::string rootFile, api.init(ccdbUrl); // build full CCDB path including filename - const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + const std::string fullPath = ccdbPath; //.back() == '/' ? 
(ccdbPath + objname) : (ccdbPath + "/" + objname); api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); @@ -178,15 +178,15 @@ void uploadToCCDBFromROOT(std::string rootFile, } void convert_onnx_to_root_serialized(const std::string& onnxFile, - const std::string& rootFile, - int mode = 0, - int ccdbUpload = 0, - const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", - long tsMin = 1, - long tsMax = 4108971600000, - std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", - std::string objname = "net_regression_r1.root", - std::string ccdbUrl = "http://alice-ccdb.cern.ch") + const std::string& rootFile, + int mode = 0, + int ccdbUpload = 0, + const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", + long tsMin = 1, + long tsMax = 4108971600000, + std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", + std::string objname = "net_regression_r1.root", + std::string ccdbUrl = "http://alice-ccdb.cern.ch") { // parse metadataStr of the form key=value/key2=value2/... 
std::map metadata; @@ -206,12 +206,12 @@ void convert_onnx_to_root_serialized(const std::string& onnxFile, start = sep + 1; } - if (ccdbUpload == 0){ + if (ccdbUpload == 0) { if (mode == 0) onnxToRoot(onnxFile, rootFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); else if (mode == 1) rootToOnnx(rootFile, onnxFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); - } else if (ccdbUpload == 1){ + } else if (ccdbUpload == 1) { if (mode == 0) uploadToCCDBFromROOT(rootFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); else if (mode == 1) diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index c0e46cc7b75f3..d7ea772c31653 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -1278,11 +1278,11 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if(mSpecConfig.nnEvalMode[0] == "c1") { + if (mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_classification_c1", gDataOriginTPC, "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - } else if(mSpecConfig.nnEvalMode[0] == "c2") { + } else if (mSpecConfig.nnEvalMode[0] == "c2") { metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); @@ -1294,7 +1294,7 @@ Inputs GPURecoWorkflowSpec::inputs() convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c1", gDataOriginTPC, "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - if(mSpecConfig.nnEvalMode[1] == "r2") { + if (mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); 
inputs.emplace_back("nn_regression_c2", gDataOriginTPC, "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index ae9aabda1fd74..13a3c4b6162b8 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -300,7 +300,7 @@ void GPURecoWorkflowSpec::finaliseCCDBTPC(ConcreteDataMatcher& matcher, void* ob } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C2", 0)) { mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c2"; - } else if(matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)){ + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)) { mConfig->configCalib.nnClusterizerNetworks[1] = static_cast(obj); LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[1])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c1"; } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R2", 0)) { From 6cba1f3caa943f252177372caaee05d7922c226f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 11:29:51 +0100 Subject: [PATCH 15/16] Disable dumpToFile by default --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 06f681dea4c1a..28cb2dd6dfbc3 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -285,7 +285,7 @@ AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classificati AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, 
"Forces to use the fillInputNNGPU function") // CCDB AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBDumpToFile, int, 1, "", 0, "If 1, additionally dump fetched CCDB networks to nnLocalFolder") +AddOption(nnCCDBDumpToFile, int, 0, "", 0, "If 1, additionally dump fetched CCDB networks to nnLocalFolder") AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, std::string, "", "", 0, "Distinguishes between the network with and without momentum output for the regression") From 5c6d214522dd0b6a1a0ff4d72039880cdbbe23d8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 21:05:11 +0100 Subject: [PATCH 16/16] Moving macro, adding o2-test --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- macro/CMakeLists.txt | 4 ++++ .../utils => macro}/convert_onnx_to_root_serialized.C | 0 3 files changed, 5 insertions(+), 1 deletion(-) rename {GPU/GPUTracking/utils => macro}/convert_onnx_to_root_serialized.C (100%) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 28cb2dd6dfbc3..dc1742453ef39 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -291,7 +291,7 @@ AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, AddOption(nnCCDBWithMomentum, std::string, "", "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp, pPb") +AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb") AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/macro/CMakeLists.txt b/macro/CMakeLists.txt index 843ad4a3be0ab..b5c51e50d3ffb 100644 --- a/macro/CMakeLists.txt +++ b/macro/CMakeLists.txt @@ -58,6 +58,7 @@ install(FILES CheckDigits_mft.C CreateGRPLHCIFObject.C getTimeStamp.C CreateSampleIRFrames.C + convert_onnx_to_root_serialized.C DESTINATION share/macro/) # FIXME: a lot of macros that are here should really be elsewhere. Those which @@ -149,6 +150,9 @@ o2_add_test_root_macro(checkTOFMatching.C O2::SimulationDataFormat O2::DataFormatsTOF) +o2_add_test_root_macro(convert_onnx_to_root_serialized.C + PUBLIC_LINK_LIBRARIES O2::GlobalTracking) + # FIXME: move to subsystem dir o2_add_test_root_macro(compareTopologyDistributions.C PUBLIC_LINK_LIBRARIES O2::DataFormatsITSMFT diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/macro/convert_onnx_to_root_serialized.C similarity index 100% rename from GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C rename to macro/convert_onnx_to_root_serialized.C