From 33b5b364fcd0778938e6213f059a5c8b40f30a33 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Thu, 13 Mar 2025 09:35:43 +0100 Subject: [PATCH] GPU TPC CF: Split clusterizer CXX functions out into .inc file to be used externally --- .../TPCClusterFinder/GPUTPCCFClusterizer.cxx | 235 +---------------- .../TPCClusterFinder/GPUTPCCFClusterizer.h | 8 +- .../TPCClusterFinder/GPUTPCCFClusterizer.inc | 249 ++++++++++++++++++ 3 files changed, 256 insertions(+), 236 deletions(-) create mode 100644 GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx index 1aeae812f5193..7bf53b4878233 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.cxx @@ -25,6 +25,8 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; +#include "GPUTPCCFClusterizer.inc" + template <> GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, int8_t onlyMC) { @@ -34,235 +36,4 @@ GPUdii() void GPUTPCCFClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); -} - -GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, - processorType& clusterer, - const CfFragment& fragment, - GPUSharedMemory& smem, - const Array2D& chargeMap, - const ChargePos* filteredPeakPositions, - const GPUSettingsRec& calib, - MCLabelAccumulator* labelAcc, - uint32_t clusternum, - uint32_t maxClusterPerRow, - uint32_t* clusterInRow, - tpc::ClusterNative* clusterByRow, - uint32_t* clusterPosInRow) -{ - uint32_t idx = get_global_id(0); - - // For certain configurations dummy work items are added, so the total - // number of work items is dividable by 64. - // These dummy items also compute the last cluster but discard the result. 
- ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; - Charge charge = chargeMap[pos].unpack(); - - ClusterAccumulator pc; - CPU_ONLY(labelAcc->collect(pos, charge)); - - buildCluster( - calib, - chargeMap, - pos, - smem.posBcast, - smem.buf, - smem.innerAboveThreshold, - &pc, - labelAcc); - - if (idx >= clusternum) { - return; - } - if (fragment.isOverlap(pos.time())) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); - - tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); - - if (rejectCluster) { - if (clusterPosInRow) { - clusterPosInRow[idx] = maxClusterPerRow; - } - return; - } - - uint32_t rowIndex = 0; - if (clusterByRow != nullptr) { - rowIndex = sortIntoBuckets( - clusterer, - myCluster, - pos.row(), - maxClusterPerRow, - clusterInRow, - clusterByRow); - if (clusterPosInRow != nullptr) { - clusterPosInRow[idx] = rowIndex; - } - } else if (clusterPosInRow) { - rowIndex = clusterPosInRow[idx]; - } - - CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterInner( - const GPUSettingsRec& calib, - uint16_t lid, - uint16_t N, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc, - uint8_t* innerAboveThreshold) -{ - uint8_t aboveThreshold = 0; - - GPUCA_UNROLL(U(), U()) - for (uint16_t i = 0; i < N; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - - PackedCharge p = buf[N * lid + i]; - - Charge q = cluster->updateInner(p, d); - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - - aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); - } - - innerAboveThreshold[lid] = aboveThreshold; - - GPUbarrier(); -} - -GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( - uint16_t lid, - uint16_t N, - uint16_t M, - uint16_t offset, - const PackedCharge* buf, - const ChargePos& pos, - ClusterAccumulator* cluster, - MCLabelAccumulator* labelAcc) -{ - GPUCA_UNROLL(U(), U()) - for (uint16_t i = offset; i < M + offset; i++) { - PackedCharge p = buf[N * lid + i]; - - Delta2 d = cfconsts::OuterNeighbors[i]; - - Charge q = cluster->updateOuter(p, d); - static_cast(q); // Avoid unused varible warning on GPU. - - CPU_ONLY(labelAcc->collect(pos.delta(d), q)); - } -} - -GPUdii() void GPUTPCCFClusterizer::buildCluster( - const GPUSettingsRec& calib, - const Array2D& chargeMap, - ChargePos pos, - ChargePos* posBcast, - PackedCharge* buf, - uint8_t* innerAboveThreshold, - ClusterAccumulator* myCluster, - MCLabelAccumulator* labelAcc) -{ - uint16_t ll = get_local_id(0); - - posBcast[ll] = pos; - GPUbarrier(); - - CfUtils::blockLoad( - chargeMap, - SCRATCH_PAD_WORK_GROUP_SIZE, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 8, - cfconsts::InnerNeighbors, - posBcast, - buf); - updateClusterInner( - calib, - ll, - 8, - buf, - pos, - myCluster, - labelAcc, - innerAboveThreshold); - - uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; - - bool inGroup1 = ll < wgSizeHalf; - - uint16_t llhalf = (inGroup1) ? 
ll : (ll - wgSizeHalf); - - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast, - innerAboveThreshold, - buf); - - if (inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } - -#if defined(GPUCA_GPUCODE) - CfUtils::condBlockLoad( - chargeMap, - wgSizeHalf, - SCRATCH_PAD_WORK_GROUP_SIZE, - ll, - 0, - 16, - cfconsts::OuterNeighbors, - posBcast + wgSizeHalf, - innerAboveThreshold + wgSizeHalf, - buf); - if (!inGroup1) { - updateClusterOuter( - llhalf, - 16, - 16, - 0, - buf, - pos, - myCluster, - labelAcc); - } -#endif -} - -GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) -{ - uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); - if (index < maxElemsPerBucket) { - buckets[maxElemsPerBucket * row + index] = cluster; - } else { - clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); - CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); - } - return index; -} +} \ No newline at end of file diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h index 411c38c39459e..79f3325ed9ad2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.h @@ -59,14 +59,14 @@ class GPUTPCCFClusterizer : public GPUKernelTemplate static GPUd() void computeClustersImpl(int32_t, int32_t, int32_t, int32_t, processorType&, const CfFragment&, GPUSharedMemory&, const Array2D&, const ChargePos*, const GPUSettingsRec&, MCLabelAccumulator*, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*, uint32_t*); + static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); + + static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); + private: static GPUd() void updateClusterInner(const GPUSettingsRec&, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*, uint8_t*); static GPUd() void updateClusterOuter(uint16_t, uint16_t, uint16_t, uint16_t, const PackedCharge*, const ChargePos&, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() void buildCluster(const GPUSettingsRec&, const Array2D&, ChargePos, ChargePos*, PackedCharge*, uint8_t*, ClusterAccumulator*, MCLabelAccumulator*); - - static GPUd() uint32_t sortIntoBuckets(processorType&, const tpc::ClusterNative&, uint32_t, uint32_t, uint32_t*, tpc::ClusterNative*); }; } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc new file mode 100644 index 0000000000000..6545ee0139d4b --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFClusterizer.inc @@ -0,0 +1,249 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCCFClusterizer.cxx +/// \author Felix Weiglhofer + +#ifndef O2_GPU_CLUSTERIZER_INC_H +#define O2_GPU_CLUSTERIZER_INC_H + +GPUdii() void GPUTPCCFClusterizer::computeClustersImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, + processorType& clusterer, + const CfFragment& fragment, + GPUSharedMemory& smem, + const Array2D& chargeMap, + const ChargePos* filteredPeakPositions, + const GPUSettingsRec& calib, + MCLabelAccumulator* labelAcc, + uint32_t clusternum, + uint32_t maxClusterPerRow, + uint32_t* clusterInRow, + tpc::ClusterNative* clusterByRow, + uint32_t* clusterPosInRow) +{ + uint32_t idx = get_global_id(0); + + // For certain configurations dummy work items are added, so the total + // number of work items is dividable by 64. + // These dummy items also compute the last cluster but discard the result. + ChargePos pos = filteredPeakPositions[CAMath::Min(idx, clusternum - 1)]; + Charge charge = chargeMap[pos].unpack(); + + ClusterAccumulator pc; + CPU_ONLY(labelAcc->collect(pos, charge)); + + buildCluster( + calib, + chargeMap, + pos, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &pc, + labelAcc); + + if (idx >= clusternum) { + return; + } + if (fragment.isOverlap(pos.time())) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + pc.finalize(pos, charge, fragment.start, clusterer.Param().tpcGeometry); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(pos, charge, myCluster, clusterer.Param()); + + if (rejectCluster) { + if (clusterPosInRow) { + clusterPosInRow[idx] = maxClusterPerRow; + } + return; + } + + uint32_t rowIndex = 0; + if (clusterByRow != nullptr) { + rowIndex = sortIntoBuckets( + clusterer, + myCluster, + pos.row(), + maxClusterPerRow, + clusterInRow, + clusterByRow); + if (clusterPosInRow != nullptr) { + clusterPosInRow[idx] = rowIndex; + } + } else if (clusterPosInRow) { + rowIndex = clusterPosInRow[idx]; + } + + CPU_ONLY(labelAcc->commit(pos.row(), rowIndex, maxClusterPerRow)); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterInner( + const GPUSettingsRec& calib, + uint16_t lid, + uint16_t N, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc, + uint8_t* innerAboveThreshold) +{ + uint8_t aboveThreshold = 0; + + GPUCA_UNROLL(U(), U()) + for (uint16_t i = 0; i < N; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + + PackedCharge p = buf[N * lid + i]; + + Charge q = cluster->updateInner(p, d); + + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + + aboveThreshold |= (uint8_t(q > calib.tpc.cfInnerThreshold) << i); + } + + innerAboveThreshold[lid] = aboveThreshold; + + GPUbarrier(); +} + +GPUdii() void GPUTPCCFClusterizer::updateClusterOuter( + uint16_t lid, + uint16_t N, + uint16_t M, + uint16_t offset, + const PackedCharge* buf, + const ChargePos& pos, + ClusterAccumulator* cluster, + MCLabelAccumulator* labelAcc) +{ + GPUCA_UNROLL(U(), U()) + for (uint16_t i = offset; i < M + offset; i++) { + PackedCharge p = buf[N * lid + i]; + + Delta2 d = cfconsts::OuterNeighbors[i]; + + Charge q = cluster->updateOuter(p, d); + static_cast(q); // Avoid unused varible warning on GPU. 
+ + CPU_ONLY(labelAcc->collect(pos.delta(d), q)); + } +} + +GPUdii() void GPUTPCCFClusterizer::buildCluster( + const GPUSettingsRec& calib, + const Array2D& chargeMap, + ChargePos pos, + ChargePos* posBcast, + PackedCharge* buf, + uint8_t* innerAboveThreshold, + ClusterAccumulator* myCluster, + MCLabelAccumulator* labelAcc) +{ + uint16_t ll = get_local_id(0); + + posBcast[ll] = pos; + GPUbarrier(); + + CfUtils::blockLoad( + chargeMap, + SCRATCH_PAD_WORK_GROUP_SIZE, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 8, + cfconsts::InnerNeighbors, + posBcast, + buf); + updateClusterInner( + calib, + ll, + 8, + buf, + pos, + myCluster, + labelAcc, + innerAboveThreshold); + + uint16_t wgSizeHalf = (SCRATCH_PAD_WORK_GROUP_SIZE + 1) / 2; + + bool inGroup1 = ll < wgSizeHalf; + + uint16_t llhalf = (inGroup1) ? ll : (ll - wgSizeHalf); + + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast, + innerAboveThreshold, + buf); + + if (inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } + +#if defined(GPUCA_GPUCODE) + CfUtils::condBlockLoad( + chargeMap, + wgSizeHalf, + SCRATCH_PAD_WORK_GROUP_SIZE, + ll, + 0, + 16, + cfconsts::OuterNeighbors, + posBcast + wgSizeHalf, + innerAboveThreshold + wgSizeHalf, + buf); + if (!inGroup1) { + updateClusterOuter( + llhalf, + 16, + 16, + 0, + buf, + pos, + myCluster, + labelAcc); + } +#endif +} + +GPUd() uint32_t GPUTPCCFClusterizer::sortIntoBuckets(processorType& clusterer, const tpc::ClusterNative& cluster, uint32_t row, uint32_t maxElemsPerBucket, uint32_t* elemsInBucket, tpc::ClusterNative* buckets) +{ + uint32_t index = CAMath::AtomicAdd(&elemsInBucket[row], 1u); + if (index < maxElemsPerBucket) { + buckets[maxElemsPerBucket * row + index] = cluster; + } else { + clusterer.raiseError(GPUErrors::ERROR_CF_ROW_CLUSTER_OVERFLOW, clusterer.mISector * 1000 + row, index, maxElemsPerBucket); + CAMath::AtomicExch(&elemsInBucket[row], maxElemsPerBucket); + } + return index; +} + +#endif // O2_GPU_CLUSTERIZER_INC_H
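
For reference, the include pattern this patch establishes: GPUTPCCFClusterizer.cxx now pulls the function bodies in via `#include "GPUTPCCFClusterizer.inc"` placed after its using-namespace declarations, and the new .inc is wrapped in the O2_GPU_CLUSTERIZER_INC_H guard so it can be included from other translation units as well. A minimal sketch of how an external compilation unit might consume the .inc the same way is given below; the file name ExternalClusterizer.cxx and the surrounding build context are assumptions for illustration, not part of this PR.

// ExternalClusterizer.cxx -- hypothetical external consumer of the new .inc file.
// This mirrors the include pattern GPUTPCCFClusterizer.cxx uses in this patch;
// whether the .inc may be compiled in more than one translation unit of a given
// build depends on how the GPUd()/GPUdii() macros expand there.

#include "GPUTPCCFClusterizer.h"   // class declaration (computeClustersImpl, buildCluster, sortIntoBuckets, ...)

using namespace o2::gpu;           // the .inc defines members of o2::gpu::GPUTPCCFClusterizer
using namespace o2::gpu::tpccf;

#include "GPUTPCCFClusterizer.inc" // function bodies; double inclusion in one TU is caught by O2_GPU_CLUSTERIZER_INC_H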