1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -497,6 +497,7 @@
#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels
#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishDeconvolutionFlags GPUCA_LB_GPUTPCNNClusterizerKernels

#define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
#define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE
1 change: 1 addition & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -269,6 +269,7 @@ AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1)
AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed")
AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN")
AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled")
AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF")
AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path")
AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
10 changes: 8 additions & 2 deletions GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -989,8 +989,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
int withMC = (doGPU && propagateMCLabels);

if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
} else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
}

// float time_clusterizer = 0, time_fill = 0, time_networks = 0;
@@ -1001,6 +1003,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
// auto start0 = std::chrono::high_resolution_clock::now();
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data

if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Publishing the deconvolution flags
}

// auto stop0 = std::chrono::high_resolution_clock::now();
// auto start1 = std::chrono::high_resolution_clock::now();

@@ -1102,7 +1108,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
GPUFatal("Project not compiled with neural network clusterization. Aborting.");
#endif
} else {
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
runKernel<GPUTPCCFClusterizer>({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0);
}
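Note (illustrative sketch, not part of the patch): taken together, the hunks above give the deconvolution kernel three modes in the NN clusterizer path. Condensed, using the same names as in the diff:

if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
  // CF regression / CF deconvolution requested: divide the charge among peaks as before
  runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
} else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
  // New mode: compute the split / 3x3 flags only, leave the stored charge untouched
  runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false);
}
// Per NN batch, the flags are then copied into the per-cluster flag array
if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
  runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart);
}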
16 changes: 11 additions & 5 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx
@@ -22,18 +22,19 @@ using namespace o2::gpu
using namespace o2::gpu::tpccf;

template <>
GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t overwriteCharge)
{
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
CfArray2D<uint8_t> isPeakMap(clusterer.mPpeakMap);
GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions);
GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, overwriteCharge);
}

GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem,
const CfArray2D<uint8_t>& peakMap,
CfArray2D<PackedCharge>& chargeMap,
const CfChargePos* positions,
const uint32_t digitnum)
const uint32_t digitnum,
uint8_t overwriteCharge)
{
SizeT idx = get_global_id(0);

@@ -111,9 +112,14 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t
peakCount = (peakCount == 0) ? 1 : peakCount;

PackedCharge charge = chargeMap[pos];
PackedCharge p(charge.unpack() / peakCount, has3x3, split);

chargeMap[pos] = p;
if (overwriteCharge) {
PackedCharge p(charge.unpack() / peakCount, has3x3, split);
chargeMap[pos] = p;
} else {
PackedCharge p(charge.unpack(), has3x3, split);
chargeMap[pos] = p;
}
}

GPUdi() uint8_t GPUTPCCFDeconvolution::countPeaksInner(
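Design note (a sketch, not part of the patch): when overwriteCharge is false the packed charge is rewritten with its original value, so only the has3x3 / split flags change. The two branches above could equivalently be collapsed into

float q = charge.unpack();
// divide the charge among peaks only when overwriteCharge is set; flags are (re)packed in both modes
chargeMap[pos] = PackedCharge(overwriteCharge ? q / peakCount : q, has3x3, split);

which keeps a single store in both modes.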
2 changes: 1 addition & 1 deletion GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h
@@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate
GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args);

private:
static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t);
static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D<uint8_t>&, CfArray2D<PackedCharge>&, const CfChargePos*, const uint32_t, uint8_t);

static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*);
static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*);
1 change: 1 addition & 0 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -44,6 +44,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
bool mNnClusterizerAddIndexData = true;
float mNnClassThreshold = 0.01;
bool mNnSigmoidTrafoClassThreshold = 1;
bool mNnClusterizerSetDeconvolutionFlags = true;
int mNnClusterizerUseCfRegression = 0;
int mNnClusterizerBatchedMode = 1;
int mNnClusterizerTotalClusters = 1;
@@ -92,6 +92,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust
clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
if (clustererNN.mNnSigmoidTrafoClassThreshold) {
clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
} else {
65 changes: 47 additions & 18 deletions GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::run
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
uint32_t glo_idx = get_global_id(0);
auto& clusterer = processors.tpcClusterer[sector];
@@ -111,7 +111,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
uint32_t glo_idx = get_global_id(0);
auto& clusterer = processors.tpcClusterer[sector];
@@ -126,11 +126,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil

if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) {
uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize;
for (uint16_t i = 0; i < 8; i++) {
Delta2 d = cfconsts::InnerNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set
for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags
Delta2 d = cfconsts::InnerNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
}
clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx];
}
if (dtype == 0) {
clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f);
@@ -147,40 +149,40 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fil
bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
if (is_row_boundary) {
if (dtype == 0) {
clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
} else {
clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
}
} else {
int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r);
int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1));
int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset;
int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime;
int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time;

bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU);
bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU);

if (!is_boundary) {
float central_charge = static_cast<float>(chargeMap[peak].unpack());
CfChargePos tmp_pos(row + r, pad + p, time + t);
CfChargePos tmp_pos(row + r, pad + p, time_pos);
if (dtype == 0) {
clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge);
} else if (dtype == 1) {
clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
clustererNN.mInputData_32[glo_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge;
}
} else {
if (dtype == 0) {
clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue));
} else {
clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
clustererNN.mInputData_32[glo_idx] = static_cast<float>(clustererNN.mNnClusterizerBoundaryFillValue);
}
}
}
}
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass1Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
uint32_t glo_idx = get_global_id(0);
if (dtype == 0) {
@@ -191,7 +193,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::det
}

template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart)
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::determineClass2Labels>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
auto& clustererNN = processors.tpcNNClusterer[sector];
uint32_t glo_idx = get_global_id(0);
@@ -457,6 +459,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::pub
}
}

// ---------------------------------
template <>
GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
{
// Implements the same flag-publishing logic as the heuristic clusterizer and the deconvolution kernel
uint32_t idx = get_global_id(0);
auto& clusterer = processors.tpcClusterer[sector];
auto& clustererNN = processors.tpcNNClusterer[sector];
CfArray2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart];

for (int i = 0; i < 8; i++) {
Delta2 d = cfconsts::InnerNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
PackedCharge charge = chargeMap[tmp_pos];
clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit());
clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit());
}
for (int i = 0; i < 16; i++) {
Delta2 d = cfconsts::OuterNeighbors[i];
CfChargePos tmp_pos = peak.delta(d);
PackedCharge charge = chargeMap[tmp_pos];
clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak());
clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak());
}
}
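For orientation (an inferred reading, not something this diff states): each cluster owns a pair of slots in mClusterFlags. Judging from the d.y / d.x selection above, and assuming the Delta2 convention of x = pad offset and y = time offset used elsewhere in the cluster finder, the pair can be read as

// assumption: even slot counts split neighbours offset in time, odd slot those offset in pad
uint8_t splitInTime = clustererNN.mClusterFlags[2 * idx];
uint8_t splitInPad = clustererNN.mClusterFlags[2 * idx + 1];

presumably mirroring the split-in-time / split-in-pad counters of the heuristic clusterizer.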

// The following arithmetic is done because the network is trained with a split between the IROC and the OROC
GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current)
{
@@ -65,6 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
determineClass2Labels = 4,
publishClass1Regression = 5,
publishClass2Regression = 6,
publishDeconvolutionFlags = 7
};

template <int32_t iKernel = defaultKernel, typename... Args>