From 33b5b364fcd0778938e6213f059a5c8b40f30a33 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 13 Mar 2025 09:35:43 +0100 Subject: [PATCH] GPU TPC CF: Split clusterizer CXX functions out into .inc file to be used externally --- .../TPCClusterFinder/GPUTPCCFClusterizer.cxx | 235 +---------------- .../TPCClusterFinder/GPUTPCCFClusterizer.h | 8 +- .../TPCClusterFinder/GPUTPCCFClusterizer.inc | 249 ++++++++++++++++++ 3 files changed, 256 insertions(+), 236 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 1aeae812f5193..7bf53b4878233 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -25,6 +25,8 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; +#include "GPUTPCCFClusterizer.inc" + template <> GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { @@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); -} - -GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) -{ - uint32_t idx = get_global_id(0); - - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. 
- ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; - Charge charge = chargeMap[pos].unpack(); - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(pos, charge)); - - buildCluster( - calib, - chargeMap, - pos, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (idx >= clusternum) { - return; - } - if (fragment.isOverlap(pos.time())) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); - - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - - uint32_t rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterInner( - const GPUSettingsRec& calib, - uint16_t lid, - uint16_t N, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc, - uint8_t* innerAboveThreshold) -{ - uint8_t aboveThreshold = 0; - - GPUCA_UNROLL(U(), U()) - for (uint16_t i = 0; i < N; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - - PackedCharge p = buf[N * lid + i]; - - Charge q = cluster->updateInner(p, d); - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - - aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); - } - - innerAboveThreshold[lid] = aboveThreshold; - - GPUbarrier(); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( - uint16_t lid, - uint16_t N, - uint16_t M, - uint16_t offset, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc) -{ - GPUCA_UNROLL(U(), U()) - for (uint16_t i = offset; i < M + offset; i++) { - PackedCharge p = buf[N * lid + i]; - - Delta2 d = cfconsts::OuterNeighbors[i]; - - Charge q = cluster->updateOuter(p, d); - static_cast(q); // Avoid unused varible warning on GPU. - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - } -} - -GPUdii() void GPUTPCCFClusterizer::buildCluster( - const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, - PackedCharge* buf, - uint8_t* innerAboveThreshold, - ClusterAccumulator* myCluster, - MCLabelAccumulator* labelAcc) -{ - uint16_t ll = get_local_id(0); - - posBcast[ll] = pos; - GPUbarrier(); - - CfUtils::blockLoad( - chargeMap, - SCRATCH_PAD_WORK_GROUP_SIZE, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 8, - cfconsts::InnerNeighbors, - posBcast, - buf); - updateClusterInner( - calib, - ll, - 8, - buf, - pos, - myCluster, - labelAcc, - innerAboveThreshold); - - uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; - - bool inGroup1 = ll < wgSizeHalf; - - uint16_t llhalf = (inGroup1) ? 
ll : (ll - wgSizeHalf); - - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast, - innerAboveThreshold, - buf); - - if (inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } - -#if defined(GPUCA_GPUCODE) - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast + wgSizeHalf, - innerAboveThreshold + wgSizeHalf, - buf); - if (!inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } -#endif -} - -GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) -{ - uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); - if (index < maxElemsPerBucket) { - buckets[maxElemsPerBucket * row + index] = cluster; - } else { - clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); - CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); - } - return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h index 411c38c39459e..79f3325ed9ad2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h @@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); + private: static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc new file mode 100644 index 0000000000000..6545ee0139d4b --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -0,0 +1,249 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCCFClusterizer.cxx +/// \author Felix Weiglhofer + +#ifndef O2_GPU_CLUSTERIZER_INC_H +#define O2_GPU_CLUSTERIZER_INC_H + +GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint32_t clusternum, + uint32_t maxClusterPerRow, + uint32_t* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint32_t* clusterPosInRow) +{ + uint32_t idx = get_global_id(0); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. + ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + Charge charge = chargeMap[pos].unpack(); + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(pos, charge)); + + buildCluster( + calib, + chargeMap, + pos, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (idx >= clusternum) { + return; + } + if (fragment.isOverlap(pos.time())) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + + uint32_t rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + pos.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; + } + + CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterInner( + const GPUSettingsRec& calib, + uint16_t lid, + uint16_t N, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc, + uint8_t* innerAboveThreshold) +{ + uint8_t aboveThreshold = 0; + + GPUCA_UNROLL(U(), U()) + for (uint16_t i = 0; i < N; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + + PackedCharge p = buf[N * lid + i]; + + Charge q = cluster->updateInner(p, d); + + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + + aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); + } + + innerAboveThreshold[lid] = aboveThreshold; + + GPUbarrier(); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( + uint16_t lid, + uint16_t N, + uint16_t M, + uint16_t offset, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc) +{ + GPUCA_UNROLL(U(), U()) + for (uint16_t i = offset; i < M + offset; i++) { + PackedCharge p = buf[N * lid + i]; + + Delta2 d = cfconsts::OuterNeighbors[i]; + + Charge q = cluster->updateOuter(p, d); + static_cast(q); // Avoid unused varible warning on GPU. 
+ + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCCFClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uint8_t* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + uint16_t ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} + +#endif // O2_GPU_CLUSTERIZER_INC_H
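
For reference, the include pattern this patch establishes: GPUTPCCFClusterizer.cxx now pulls the function bodies in via `#include "GPUTPCCFClusterizer.inc"` placed after its using-namespace declarations, and the new .inc is wrapped in the O2_GPU_CLUSTERIZER_INC_H guard so it can be included from other translation units as well. A minimal sketch of how an external compilation unit might consume the .inc the same way is given below; the file name ExternalClusterizer.cxx and the surrounding build context are assumptions for illustration, not part of this PR.

// ExternalClusterizer.cxx -- hypothetical external consumer of the new .inc file.
// This mirrors the include pattern GPUTPCCFClusterizer.cxx uses in this patch;
// whether the .inc may be compiled in more than one translation unit of a given
// build depends on how the GPUd()/GPUdii() macros expand there.

#include "GPUTPCCFClusterizer.h"   // class declaration (computeClustersImpl, buildCluster, sortIntoBuckets, ...)

using namespace o2::gpu;           // the .inc defines members of o2::gpu::GPUTPCCFClusterizer
using namespace o2::gpu::tpccf;

#include "GPUTPCCFClusterizer.inc" // function bodies; double inclusion in one TU is caught by O2_GPU_CLUSTERIZER_INC_H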