From 82f2153cd68dfd0ae394621cd1df9d0093ace680 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Sun, 8 Jun 2025 14:22:21 +0200
Subject: [PATCH 1/8] First bug-fixes and optimizations for deconvolution flags

---
 GPU/GPUTracking/Definitions/GPUSettingsList.h |  1 +
 .../Global/GPUChainTrackingClusterizer.cxx    |  6 +++--
 .../GPUTPCCFDeconvolution.cxx                 | 16 +++++++++----
 .../TPCClusterFinder/GPUTPCCFDeconvolution.h  |  2 +-
 .../TPCClusterFinder/GPUTPCNNClusterizer.h    |  1 +
 .../GPUTPCNNClusterizerHost.cxx               |  1 +
 .../GPUTPCNNClusterizerKernels.cxx            | 24 ++++++++++---------
 7 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index d8173a5b62a35..e75d5a5890e4b 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -269,6 +269,7 @@ AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1)
 AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
 AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
 AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
+AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")
 AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
 AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
 AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 64e6f5a31aaa7..58b06c95ad26a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -989,8 +989,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           int withMC = (doGPU && propagateMCLabels);
 
           if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
-            runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
+            runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
             DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
+          } else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
+            runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
           }
 
           // float time_clusterizer = 0, time_fill = 0, time_networks = 0;
@@ -1102,7 +1104,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           GPUFatal("Project not compiled with neural network clusterization. Aborting.");
 #endif
         } else {
-          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
+          runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
           DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
           runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
         }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
index 429d51685e504..406f3cb1d6d5a 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
@@ -22,18 +22,19 @@ using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
 template <>
-GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
+GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, bool overwriteCharge)
 {
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
-  GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions);
+  GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, overwriteCharge);
 }
 
 GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem,
                                                        const CfArray2D<uint8_t>& peakMap,
                                                        CfArray2D<PackedCharge>& chargeMap,
                                                        const CfChargePos* positions,
-                                                       const uint32_t digitnum)
+                                                       const uint32_t digitnum,
+                                                       bool overwriteCharge)
 {
   SizeT idx = get_global_id(0);
 
@@ -111,9 +112,14 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t
   peakCount = (peakCount == 0) ? 1 : peakCount;
 
   PackedCharge charge = chargeMap[pos];
-  PackedCharge p(charge.unpack() / peakCount, has3x3, split);
 
-  chargeMap[pos] = p;
+  if(overwriteCharge) {
+    PackedCharge p(charge.unpack() / peakCount, has3x3, split);
+    chargeMap[pos] = p;
+  } else {
+    PackedCharge p(charge.unpack(), has3x3, split);
+    chargeMap[pos] = p;
+  }
 }
 
 GPUdi() uint8_t GPUTPCCFDeconvolution::countPeaksInner(
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
index e971a042e95a4..413d2f087f87a 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
@@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args);
 
  private:
-  static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t);
+  static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t, bool);
 
   static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*);
   static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*);
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index f7c2d13407b0e..980c0977aca65 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -44,6 +44,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   bool mNnClusterizerAddIndexData = true;
   float mNnClassThreshold = 0.01;
   bool mNnSigmoidTrafoClassThreshold = 1;
+  bool mNnClusterizerSetDeconvolutionFlags = true;
   int mNnClusterizerUseCfRegression = 0;
   int mNnClusterizerBatchedMode = 1;
   int mNnClusterizerTotalClusters = 1;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index 90f1d6e27246f..124320396d0d4 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -92,6 +92,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
   clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
   clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
   clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
+  clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
   if (clustererNN.mNnSigmoidTrafoClassThreshold) {
     clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
   } else {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 8ca61602ab4e9..8575b1f47be16 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -126,11 +126,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 
   if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
     uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
-    for (uint16_t i = 0; i < 8; i++) {
-      Delta2 d = cfconsts::InnerNeighbors[i];
-      CfChargePos tmp_pos = peak.delta(d);
-      clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
-      clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
+    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags){ // Only if deconvolution flags are not set
+      for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
+        Delta2 d = cfconsts::InnerNeighbors[i];
+        CfChargePos tmp_pos = peak.delta(d);
+        clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
+      }
+      clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx];
     }
     if (dtype == 0) {
       clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f);
@@ -147,9 +149,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
     if (is_row_boundary) {
       if (dtype == 0) {
-        clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
+        clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
       } else {
-        clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+        clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
       }
     } else {
       int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
@@ -164,15 +166,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
         float central_charge = static_cast<float>(chargeMap[peak].unpack());
         CfChargePos tmp_pos(row + r, pad + p, time + t);
         if (dtype == 0) {
-          clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
+          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
         } else if (dtype == 1) {
-          clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
+          clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
         }
       } else {
         if (dtype == 0) {
-          clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
+          clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
         } else {
-          clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
+          clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
         }
       }
     }

From 227d192e1c1f51cfe5ee3582c730508f11d70aa9 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Sun, 8 Jun 2025 14:28:57 +0200
Subject: [PATCH 2/8] Adding publishing logic for deconvolution flags

---
 .../Global/GPUChainTrackingClusterizer.cxx    |  4 +++
 .../GPUTPCNNClusterizerKernels.cxx            | 27 +++++++++++++++++++
 .../GPUTPCNNClusterizerKernels.h              |  1 +
 3 files changed, 32 insertions(+)

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 58b06c95ad26a..816ee43d50b15 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -1003,6 +1003,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             // auto start0 = std::chrono::high_resolution_clock::now();
             runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data
 
+            if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
+              runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data
+            }
+
             // auto stop0 = std::chrono::high_resolution_clock::now();
             // auto start1 = std::chrono::high_resolution_clock::now();
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 8575b1f47be16..cb0f5e92468d9 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -459,6 +459,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   }
 }
 
+// ---------------------------------
+template <>
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel
+  uint32_t idx = get_global_id(0);
+  auto& clusterer = processors.tpcClusterer[sector];
+  auto& clustererNN = processors.tpcNNClusterer[sector];
+  CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
+  CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];
+
+  for(int i = 0; i < 8; i++) {
+    Delta2 d = cfconsts::InnerNeighbors[i];
+    CfChargePos tmp_pos = peak.delta(d);
+    PackedCharge charge = chargeMap[tmp_pos];
+    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
+    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
+  }
+  for(int i = 0; i < 16; i++) {
+    Delta2 d = cfconsts::OuterNeighbors[i];
+    CfChargePos tmp_pos = peak.delta(d);
+    PackedCharge charge = chargeMap[tmp_pos];
+    clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
+    clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
+  }
+}
+
 // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary
 GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
 {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index dac2bf9554849..179eb483cdf6b 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -65,6 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
     determineClass2Labels = 4,
     publishClass1Regression = 5,
     publishClass2Regression = 6,
+    publishDeconvolutionFlags = 7
   };
 
   template <int32_t iKernel = defaultKernel, typename... Args>

From 57ff901bc19f57f3f32dc4f156f80a8aba13e974 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Sun, 8 Jun 2025 15:18:58 +0200
Subject: [PATCH 3/8] Adjusting kernels.cmake

---
 GPU/GPUTracking/kernels.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index 08d879fbb8e9a..05efbf1d6acdb 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart"            "= TPC
 o2_gpu_add_kernel("GPUTPCCFPeakFinder"                                    "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression"            "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks"                 "= TPCCLUSTERFINDER"                                    LB)
-o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB)
+o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB, bool overwriteCharge)
 o2_gpu_add_kernel("GPUTPCCFClusterizer"                                   "= TPCCLUSTERFINDER"                                    LB int8_t onlyMC)
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets"               "= TPCCLUSTERFINDER")
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten"                     "= TPCCLUSTERFINDER"                                    NO GPUTPCLinearLabels* out)
@@ -134,6 +134,7 @@ o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels"     "= TPC
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels"     "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
 endif()
 
 o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP

From 05a4d3f867fc4427c6331fcb80d7476ddd06d0e7 Mon Sep 17 00:00:00 2001
From: ALICE Action Bot <alibuild@cern.ch>
Date: Sun, 8 Jun 2025 13:40:59 +0000
Subject: [PATCH 4/8] Please consider the following formatting changes

---
 .../TPCClusterFinder/GPUTPCCFDeconvolution.cxx            | 2 +-
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx       | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
index 406f3cb1d6d5a..a93f970de5729 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
@@ -113,7 +113,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t
 
   PackedCharge charge = chargeMap[pos];
 
-  if(overwriteCharge) {
+  if (overwriteCharge) {
     PackedCharge p(charge.unpack() / peakCount, has3x3, split);
     chargeMap[pos] = p;
   } else {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index cb0f5e92468d9..bf30d07ad2e77 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -126,8 +126,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 
   if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
     uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
-    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags){ // Only if deconvolution flags are not set
-      for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
+    if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set
+      for (uint16_t i = 0; i < 8; i++) {                    // This solution needs testing. It is not the same as the deconvolution flags
         Delta2 d = cfconsts::InnerNeighbors[i];
         CfChargePos tmp_pos = peak.delta(d);
         clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
@@ -470,14 +470,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];
 
-  for(int i = 0; i < 8; i++) {
+  for (int i = 0; i < 8; i++) {
     Delta2 d = cfconsts::InnerNeighbors[i];
     CfChargePos tmp_pos = peak.delta(d);
     PackedCharge charge = chargeMap[tmp_pos];
     clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
     clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
   }
-  for(int i = 0; i < 16; i++) {
+  for (int i = 0; i < 16; i++) {
     Delta2 d = cfconsts::OuterNeighbors[i];
     CfChargePos tmp_pos = peak.delta(d);
     PackedCharge charge = chargeMap[tmp_pos];

From 464402feacc581fe44f827d4ea33f9c50967af2d Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Mon, 9 Jun 2025 22:08:02 +0200
Subject: [PATCH 5/8] Bug-fix for time-position and boundary check in
 fillInputNNSingleElement

---
 .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx         | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index bf30d07ad2e77..99fbc1c4bf504 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -158,13 +158,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
       int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
       int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
       int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
-      int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
+      int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time;
 
-      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU);
+      bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);
 
       if (!is_boundary) {
         float central_charge = static_cast<float>(chargeMap[peak].unpack());
-        CfChargePos tmp_pos(row + r, pad + p, time + t);
+        CfChargePos tmp_pos(row + r, pad + p, time_pos);
         if (dtype == 0) {
           clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
         } else if (dtype == 1) {

From e70e4d98bcf9322e1414b95da7ca44954bec4c45 Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Tue, 10 Jun 2025 09:40:52 +0200
Subject: [PATCH 6/8] Fix for kernels.cmake and naming

---
 .../GPUTPCNNClusterizerKernels.cxx             | 10 +++++-----
 GPU/GPUTracking/kernels.cmake                  | 18 +++++++++---------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 99fbc1c4bf504..58dd49630d8e6 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
@@ -111,7 +111,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
@@ -182,7 +182,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
   if (dtype == 0) {
@@ -193,7 +193,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
 }
 
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   auto& clustererNN = processors.tpcNNClusterer[sector];
   uint32_t glo_idx = get_global_id(0);
@@ -461,7 +461,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
 
 // ---------------------------------
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
 {
   // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel
   uint32_t idx = get_global_id(0);
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index 05efbf1d6acdb..c7f56919d2ff8 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart"            "= TPC
 o2_gpu_add_kernel("GPUTPCCFPeakFinder"                                    "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression"            "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks"                 "= TPCCLUSTERFINDER"                                    LB)
-o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB, bool overwriteCharge)
+o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB bool overwriteCharge)
 o2_gpu_add_kernel("GPUTPCCFClusterizer"                                   "= TPCCLUSTERFINDER"                                    LB int8_t onlyMC)
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets"               "= TPCCLUSTERFINDER")
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten"                     "= TPCCLUSTERFINDER"                                    NO GPUTPCLinearLabels* out)
@@ -127,14 +127,14 @@ o2_gpu_add_kernel("GPUTPCCFGather"                                        "="
 o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU"                    "= GLOBALREFIT "                                        LB)
 o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov"            "= GLOBALREFIT "                                        LB)
 if(onnxruntime_FOUND)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer"          "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN"               "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement"  "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels"     "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels"     "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer"          "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN"               "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement"  "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels"     "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels"     "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression"   "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER"                                  LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
 endif()
 
 o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP

From f6a3d5aae9665832fcca897ac377650406f5611e Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Thu, 12 Jun 2025 08:49:04 +0200
Subject: [PATCH 7/8] Changing to uint8_t

---
 GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx | 4 ++--
 GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h   | 2 +-
 GPU/GPUTracking/kernels.cmake                              | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
index a93f970de5729..d6b8703a9b35d 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
@@ -22,7 +22,7 @@ using namespace o2::gpu;
 using namespace o2::gpu::tpccf;
 
 template <>
-GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, bool overwriteCharge)
+GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t overwriteCharge)
 {
   CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
@@ -34,7 +34,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t
                                                        CfArray2D<PackedCharge>& chargeMap,
                                                        const CfChargePos* positions,
                                                        const uint32_t digitnum,
-                                                       bool overwriteCharge)
+                                                       uint8_t overwriteCharge)
 {
   SizeT idx = get_global_id(0);
 
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
index 413d2f087f87a..902e3a28fd21b 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
@@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate
   GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args);
 
  private:
-  static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t, bool);
+  static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t, uint8_t);
 
   static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*);
   static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*);
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index c7f56919d2ff8..2a59f98a6d5b4 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart"            "= TPC
 o2_gpu_add_kernel("GPUTPCCFPeakFinder"                                    "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression"            "= TPCCLUSTERFINDER"                                    LB)
 o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks"                 "= TPCCLUSTERFINDER"                                    LB)
-o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB bool overwriteCharge)
+o2_gpu_add_kernel("GPUTPCCFDeconvolution"                                 "= TPCCLUSTERFINDER"                                    LB uint8_t overwriteCharge)
 o2_gpu_add_kernel("GPUTPCCFClusterizer"                                   "= TPCCLUSTERFINDER"                                    LB int8_t onlyMC)
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets"               "= TPCCLUSTERFINDER")
 o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten"                     "= TPCCLUSTERFINDER"                                    NO GPUTPCLinearLabels* out)

From ff620320d235483df7a90ee69cc2d5917331e3ed Mon Sep 17 00:00:00 2001
From: Christian Sonnabend <sonnabendch@gmail.com>
Date: Thu, 12 Jun 2025 13:50:41 +0200
Subject: [PATCH 8/8] Adding kernel definition

---
 GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
index 7879789bf91c8..8fe8e8ca68e44 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -497,6 +497,7 @@
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
   #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels
+  #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishDeconvolutionFlags GPUCA_LB_GPUTPCNNClusterizerKernels
 
   #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
   #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE