From 290ba0e44c75d6f58ed030fd4f3ed70e96eed753 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 12 Jul 2025 03:22:17 +0200 Subject: [PATCH 01/14] First version of lookup tables --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 + .../TPCClusterFinder/GPUTPCNNClusterizer.h | 53 +++++++++++-------- .../GPUTPCNNClusterizerHost.cxx | 42 ++++++++++++++- .../GPUTPCNNClusterizerHost.h | 2 + .../GPUTPCNNClusterizerKernels.cxx | 43 ++++++--------- prodtests/full-system-test/dpl-workflow.sh | 2 +- 6 files changed, 94 insertions(+), 50 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index da37c0771fe84..282abe98a80bb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -61,6 +61,8 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } if (mNnClusterizerTotalClusters > 0) { computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters); + computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize); + computePointerWithAlignment(mem, mIndexLookup, mIndexLookupSize); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 980c0977aca65..2564515f58551 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -37,31 +37,42 @@ class GPUTPCNNClusterizer : public GPUProcessor // Neural network clusterization - int mNnClusterizerSizeInputRow = 3; - int mNnClusterizerSizeInputPad = 3; - int mNnClusterizerSizeInputTime = 3; - int mNnClusterizerElementSize = -1; - bool mNnClusterizerAddIndexData = true; + int32_t mNnClusterizerSizeInputRow = 3; + int32_t mNnClusterizerSizeInputPad = 3; + int32_t mNnClusterizerSizeInputTime = 3; + int32_t mNnClusterizerChargeArraySize = -1; + int32_t mNnClusterizerElementSize = -1; + 
int8_t mNnClusterizerAddIndexData = 1; float mNnClassThreshold = 0.01; - bool mNnSigmoidTrafoClassThreshold = 1; - bool mNnClusterizerSetDeconvolutionFlags = true; - int mNnClusterizerUseCfRegression = 0; - int mNnClusterizerBatchedMode = 1; - int mNnClusterizerTotalClusters = 1; - int mNnClusterizerVerbosity = 0; - int mNnClusterizerBoundaryFillValue = -1; - int mNnClusterizerModelClassNumOutputNodes = -1; - int mNnClusterizerModelReg1NumOutputNodes = -1; - int mNnClusterizerModelReg2NumOutputNodes = -1; - int mNnInferenceInputDType = 0; // 0: float16, 1: float32 - int mNnInferenceOutputDType = 0; // 0: float16, 1: float32 - int mISector = -1; - int mDeviceId = -1; + int8_t mNnSigmoidTrafoClassThreshold = 1; + int8_t mNnClusterizerSetDeconvolutionFlags = 1; + int32_t mNnClusterizerUseCfRegression = 0; + int32_t mNnClusterizerBatchedMode = 1; + int32_t mNnClusterizerTotalClusters = 1; + int32_t mNnClusterizerVerbosity = 0; + int32_t mNnClusterizerBoundaryFillValue = -1; + int32_t mNnClusterizerModelClassNumOutputNodes = -1; + int32_t mNnClusterizerModelReg1NumOutputNodes = -1; + int32_t mNnClusterizerModelReg2NumOutputNodes = -1; + int32_t mNnInferenceInputDType = 0; // 0: float16, 1: float32 + int32_t mNnInferenceOutputDType = 0; // 0: float16, 1: float32 + int32_t mISector = -1; + int32_t mDeviceId = -1; + + // Boundary lookup table + int32_t mBoundaryMapSizeRow = 0; + int32_t mBoundaryMapSizePerRow = 0; + int32_t mBoundaryMapSize = 0; + int8_t* mIsBoundary = nullptr; + + // Index lookup table + int32_t mIndexLookupSize = 0; + int32_t* mIndexLookup = nullptr; // Memory allocation for neural network - bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr - int* mOutputDataClass = nullptr; + int8_t* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int32_t* mOutputDataClass = nullptr; // FP32 float* mInputData_32 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 124320396d0d4..c394d9b0766fc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -19,6 +19,8 @@ #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" #include "GPUReconstruction.h" +#include "GPUTPCGeometry.h" +#include "DataFormatsTPC/Constants.h" #ifdef GPUCA_HAS_ONNX #include @@ -87,8 +89,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); + clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad); + clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; - clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; @@ -114,6 +119,41 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1]; } } + createBoundary(clustererNN); + createIndexLookup(clustererNN); +} + +void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { + // Call after init of the clustererNN elements + clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mNnClusterizerSizeInputPad; + for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { + for (int p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) { + int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; + clustererNN.mIsBoundary[i] = 1; + if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { + if ((r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) || + (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow))) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + } + if (clustererNN.mIsBoundary[i] == 1) { + break; // No need to check further pads in this row + } + } + } + } +} + +void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) { + for(int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++){ + int32_t r = CAMath::Floor(i / ((2 * 
clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; + int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); + int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; + int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; + clustererNN.mIndexLookup[3*i] = r; + clustererNN.mIndexLookup[3*i + 1] = p; + clustererNN.mIndexLookup[3*i + 2] = t; + } } // MockedOrtAllocator implementation to be able to use volatile assignment diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a4449165261be..ed3c80320b632 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,6 +49,8 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void createBoundary(GPUTPCNNClusterizer&); + void createIndexLookup(GPUTPCNNClusterizer&); // ONNX void directOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8cdc0684ad588..8a7453f7be3fc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,37 +155,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); - int32_t r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; 
- bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - if (is_row_boundary) { + int32_t idxLookup = 3*transient_index; + int32_t r = clustererNN.mIndexLookup[idxLookup], p = clustererNN.mIndexLookup[idxLookup + 1], t = clustererNN.mIndexLookup[idxLookup + 2] + time; + int32_t current_row = row + r, current_pad = pad + p; + int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, current_row); + int32_t isBoundaryIndex = (current_row + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + current_pad + clustererNN.mNnClusterizerSizeInputPad; + + if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { + float central_charge = static_cast(chargeMap[peak].unpack()); + CfChargePos tmp_pos(current_row, current_pad + pad_offset, t); if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); - } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); - int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); - int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset; - int32_t time_pos = (rest_1 % (2 * 
clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time; - - bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU); - - if (!is_boundary) { - float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(row + r, pad + p, time_pos); - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; - } + if (dtype == 0) { + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); - } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } } diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index 202352730ddc7..df87018b1013a 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -74,7 +74,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -GPU_CONFIG_KEY= +#GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From ed459ea8ea7d011059a2f84aff4dfaf44b5ce025 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 12 Jul 2025 03:48:49 +0200 Subject: [PATCH 02/14] Simplifying computations + bug-fixes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 5 +++-- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 ++++----- prodtests/full-system-test/dpl-workflow.sh | 2 +- 3 files changed, 
8 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index c394d9b0766fc..50b3fb15f6294 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -132,9 +132,10 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; clustererNN.mIsBoundary[i] = 1; if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { - if ((r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) || - (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow))) { + if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); } if (clustererNN.mIsBoundary[i] == 1) { break; // No need to check further pads in this row diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8a7453f7be3fc..962d651bad97e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -156,15 +156,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); 
int32_t idxLookup = 3*transient_index; - int32_t r = clustererNN.mIndexLookup[idxLookup], p = clustererNN.mIndexLookup[idxLookup + 1], t = clustererNN.mIndexLookup[idxLookup + 2] + time; - int32_t current_row = row + r, current_pad = pad + p; + int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, current_row); - int32_t isBoundaryIndex = (current_row + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + current_pad + clustererNN.mNnClusterizerSizeInputPad; + int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + p + clustererNN.mNnClusterizerSizeInputPad; if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(current_row, current_pad + pad_offset, t); + CfChargePos tmp_pos(r, p + pad_offset, t); if (dtype == 0) { clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index df87018b1013a..202352730ddc7 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -74,7 +74,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -#GPU_CONFIG_KEY= +GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From 1ac8c2cd240a2063e3e729904b7297db09402b3f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 13 Jul 2025 10:54:18 +0200 Subject: [PATCH 
03/14] Fixes for indexing and offsets --- .../Global/GPUChainTrackingClusterizer.cxx | 2 ++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 26 +++++++------- .../GPUTPCNNClusterizerKernels.cxx | 34 ++++++++++++------- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 25bfe37f0db30..beac893e0072c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -706,6 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); + nnApplications[lane].createBoundary(clustererNNShadow); + nnApplications[lane].createIndexLookup(clustererNNShadow); }); if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 2564515f58551..086b3c2211b5b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -61,8 +61,9 @@ class GPUTPCNNClusterizer : public GPUProcessor // Boundary lookup table int32_t mBoundaryMapSizeRow = 0; - int32_t mBoundaryMapSizePerRow = 0; + int32_t mBoundaryMapSizePadsPerRow = 0; int32_t mBoundaryMapSize = 0; + int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset int8_t* mIsBoundary = nullptr; // Index lookup table diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 
50b3fb15f6294..d9bf511b2c724 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -91,8 +91,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.mBoundaryMapSize = (3*clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW)*(GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mNnClusterizerSizeInputPad); - clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerElementSize; // local row, pad, time coordinate from flat index + clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow*clustererNN.mBoundaryMapSizePadsPerRow; + clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -119,27 +122,22 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerModelReg2NumOutputNodes = mModelReg2.getNumOutputNodes()[0][1]; } } - createBoundary(clustererNN); - createIndexLookup(clustererNN); } void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { // Call after init of the clustererNN elements - clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mBoundaryMapSizePerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mNnClusterizerSizeInputPad; for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { - for (int p = 0; p < clustererNN.mBoundaryMapSizePerRow; p++) { - int32_t i = r * clustererNN.mBoundaryMapSizePerRow + p; + int8_t skipCheckInRow = 0; + for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { + int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; clustererNN.mIsBoundary[i] = 1; - if (p >= clustererNN.mNnClusterizerSizeInputPad || r >= clustererNN.mNnClusterizerSizeInputRow) { + if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= 
clustererNN.mNnClusterizerSizeInputRow)) { if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mNnClusterizerSizeInputPad) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); - } - if (clustererNN.mIsBoundary[i] == 1) { - break; // No need to check further pads in this row + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); } + skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row } } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 962d651bad97e..ef8c649fd977e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -121,9 +121,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + + uint32_t glo_idx = get_global_id(0); uint32_t 
base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize); uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize); @@ -153,17 +154,22 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); } - } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) { + } else if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) { int32_t time = static_cast(peak.time()); int32_t idxLookup = 3*transient_index; int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePerRow + p + clustererNN.mNnClusterizerSizeInputPad; + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); + p += pad_offset; + int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + p + clustererNN.mBoundaryPadding; if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(r, p + pad_offset, t); + CfChargePos tmp_pos(r, p, t); + // if ((glo_idx % (clustererNN.mNnClusterizerElementSize*1000)) == (int)((clustererNN.mNnClusterizerChargeArraySize-1)/2.f)){ + // printf("glo_idx: %d, r: %d, p: %d, t: %d, tmp_pos: (%d, %d, %d), charge: %f, central_charge: %f\n", + // glo_idx, clustererNN.mIndexLookup[idxLookup], clustererNN.mIndexLookup[idxLookup + 1], clustererNN.mIndexLookup[idxLookup + 2], tmp_pos.row(), tmp_pos.pad(), tmp_pos.time(), chargeMap[tmp_pos].unpack(), central_charge); + // } if (dtype == 0) { 
clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { @@ -489,24 +495,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + return 0; // Short-circuit for negative rows + } else { + return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); + } } -GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t global_shift) +GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset) { - return (row > 62 ? global_shift : 0); + return (row > 62 ? offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t global_shift) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. 
Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - global_shift))); + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { + return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); } else { return true; } From fdf6ef88ed36887d4dbf57cd030cd1930b895a7e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 14 Jul 2025 01:16:31 +0200 Subject: [PATCH 04/14] Adjusting CPU kernel --- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 27 ---------------- .../GPUTPCNNClusterizerKernels.cxx | 31 ++++++++++++------- 2 files changed, 19 insertions(+), 39 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 282abe98a80bb..35cecb458c7a3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -67,33 +67,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } -// std::vector GPUTPCNNClusterizer::pointerSizes() { -// std::vector sizes(7, -1); -// if (mNnClusterizerBatchedMode > 0) { -// if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { -// sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16 -// } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { -// sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32 -// } -// sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags -// if (mNnClusterizerModelClassNumOutputNodes > 0) { -// sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities -// } -// if (!mNnClusterizerUseCfRegression) { -// if (mNnClusterizerModelReg1NumOutputNodes > 0) { -// sizes[4] = mNnClusterizerBatchedMode * 
mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1 -// } -// if (mNnClusterizerModelReg2NumOutputNodes > 0) { -// sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2 -// } -// } -// } -// if (mNnClusterizerTotalClusters > 0) { -// sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass -// } -// return sizes; -// } - void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ef8c649fd977e..af194b9809e44 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -61,25 +61,31 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); - int32_t pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int32_t row_pos = row + r; + for (int32_t p = (-clustererNN.mNnClusterizerSizeInputPad + pad_offset); p <= (clustererNN.mNnClusterizerSizeInputPad + pad_offset); p++) { + int32_t pad_pos = pad + p; for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { int32_t time_pos = time + t; - if (!is_boundary && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { - CfChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; + int32_t isBoundaryIndex = (row_pos + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + pad_pos + clustererNN.mBoundaryPadding; + if (!clustererNN.mIsBoundary[isBoundaryIndex] && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { + CfChargePos tmp_pos(row_pos, pad_pos, time_pos); + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set + if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = 
clustererNN.mClusterFlags[2 * glo_idx]; + } } if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + // if(CAMath::Abs(static_cast(clustererNN.mInputData_16[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-6) { + // printf("Warning: (Charge) Charge difference at idx %d, batchStart %d, maxClusters %d, sector %d, row %d (%d), pad %d (%d), time %d (%d): %f / %f\n", glo_idx, batchStart, clusterer.mPmemory->counters.nClusters - 1, sector, row_pos, r, pad_pos, p, time_pos, t, static_cast(clustererNN.mInputData_16[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); + // } } else if (dtype == 1) { clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } @@ -507,16 +513,17 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? offset : 0); } +// Legacy. Deprecated. GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); + return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row)))); } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. 
Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + return ((pad < 0) || (pad >= static_cast(GPUTPCGeometry::NPads(row - offset)))); } else { return true; } From 7604fab521a503ddf2fc337108a5d8c23f06800c Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sun, 13 Jul 2025 23:18:51 +0000 Subject: [PATCH 05/14] Please consider the following formatting changes --- .../GPUTPCNNClusterizerHost.cxx | 26 ++++++++++--------- .../GPUTPCNNClusterizerKernels.cxx | 6 ++--- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index d9bf511b2c724..b8ddcb296a414 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,9 +93,9 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2*clustererNN.mBoundaryPadding; - clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow*clustererNN.mBoundaryMapSizePadsPerRow; - clustererNN.mIndexLookupSize = 3*clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; + clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -124,9 +124,10 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { +void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) +{ // Call after init of the clustererNN elements - for(int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { + for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { int8_t skipCheckInRow = 0; for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; @@ -134,8 +135,8 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { if (r < (GPUTPCGeometry::EndIROC() + 
clustererNN.mNnClusterizerSizeInputRow)) { clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); - } else if (r >= (GPUTPCGeometry::EndIROC() + 2*clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2*clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2*clustererNN.mNnClusterizerSizeInputRow))); + } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { + clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); } skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row } @@ -143,15 +144,16 @@ void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) { } } -void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) { - for(int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++){ +void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) +{ + for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; - clustererNN.mIndexLookup[3*i] = 
r; - clustererNN.mIndexLookup[3*i + 1] = p; - clustererNN.mIndexLookup[3*i + 2] = t; + clustererNN.mIndexLookup[3 * i] = r; + clustererNN.mIndexLookup[3 * i + 1] = p; + clustererNN.mIndexLookup[3 * i + 2] = t; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index af194b9809e44..ee4f10160e5ea 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -75,7 +75,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { CfChargePos tmp_pos(row_pos, pad_pos, time_pos); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; @@ -162,7 +162,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); - int32_t idxLookup = 3*transient_index; + int32_t idxLookup = 3 * transient_index; int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); @@ -501,7 +501,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current > o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return 
(int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 0c2ac6a5444e31e5735f86ca159f4400bf320b84 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 14 Jul 2025 09:16:54 +0200 Subject: [PATCH 06/14] Fix for row-number access --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b8ddcb296a414..b9b4619687ed4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,7 +93,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + 2 * clustererNN.mBoundaryPadding; clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ee4f10160e5ea..7f8e2528b6561 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -501,7 +501,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 33c8278acf49d1bd462cfe7f2c43a72385866680 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 14 Jul 2025 07:17:37 +0000 Subject: [PATCH 07/14] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b9b4619687ed4..669e4214b2756 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -93,7 +93,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW-1) + 2 * clustererNN.mBoundaryPadding; + clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; From eecb8a8e3f944b663b419c7a5dae39cd7c0486c3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 00:57:13 +0200 Subject: [PATCH 08/14] Improve kernel speed by ~15%. 
Next test: for-loop in pad direction for coallesced access --- .../Global/GPUChainTrackingClusterizer.cxx | 4 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 - .../TPCClusterFinder/GPUTPCNNClusterizer.h | 21 +- .../GPUTPCNNClusterizerHost.cxx | 80 ++++---- .../GPUTPCNNClusterizerKernels.cxx | 184 +++++++++++------- .../GPUTPCNNClusterizerKernels.h | 2 +- 6 files changed, 173 insertions(+), 120 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index beac893e0072c..13455efe6cb47 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -706,8 +706,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); - nnApplications[lane].createBoundary(clustererNNShadow); - nnApplications[lane].createIndexLookup(clustererNNShadow); + // nnApplications[lane].createBoundary(clustererNNShadow); + // nnApplications[lane].createIndexLookup(clustererNNShadow); }); if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 35cecb458c7a3..3dd8b0d621a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -61,8 +61,6 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } if (mNnClusterizerTotalClusters > 0) { computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters); - computePointerWithAlignment(mem, mIsBoundary, mBoundaryMapSize); - computePointerWithAlignment(mem, 
mIndexLookup, mIndexLookupSize); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 086b3c2211b5b..0bffc525ac8cf 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -59,16 +59,22 @@ class GPUTPCNNClusterizer : public GPUProcessor int32_t mISector = -1; int32_t mDeviceId = -1; + // GPU optimizations + uint32_t mNnClusterizerFullRowSize = 0; + uint32_t mNnClusterizerFullPadSize = 0; + uint32_t mNnClusterizerFullTimeSize = 0; + uint32_t mNnClusterizerPadTimeSize = 0; + // Boundary lookup table - int32_t mBoundaryMapSizeRow = 0; - int32_t mBoundaryMapSizePadsPerRow = 0; - int32_t mBoundaryMapSize = 0; - int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset - int8_t* mIsBoundary = nullptr; + // int32_t mBoundaryMapSizeRow = 0; + // int32_t mBoundaryMapSizePadsPerRow = 0; + // int32_t mBoundaryMapSize = 0; + // int32_t mBoundaryPadding = 11; // Padding on each side of the boundary map to account for pad_offset + // int8_t* mIsBoundary = nullptr; // Index lookup table - int32_t mIndexLookupSize = 0; - int32_t* mIndexLookup = nullptr; + // int32_t mIndexLookupSize = 0; + // int32_t* mIndexLookup = nullptr; // Memory allocation for neural network @@ -83,6 +89,7 @@ class GPUTPCNNClusterizer : public GPUProcessor // FP16 OrtDataType::Float16_t* mInputData_16 = nullptr; + OrtDataType::Float16_t* mInputData_16_Test = nullptr; OrtDataType::Float16_t* mModelProbabilities_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 669e4214b2756..17a55ed0df1e3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -89,13 +89,17 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; - clustererNN.mNnClusterizerChargeArraySize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)); + clustererNN.mNnClusterizerFullRowSize = 2 * settings.nnClusterizerSizeInputRow + 1; + clustererNN.mNnClusterizerFullPadSize = 2 * settings.nnClusterizerSizeInputPad + 1; + clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1; + clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. 
N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work - clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; - clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; - clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index + // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; + // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work + // clustererNN.mBoundaryMapSizePadsPerRow = GPUTPCGeometry::NPads(o2::tpc::constants::MAXGLOBALPADROW - 1) + 2 * clustererNN.mBoundaryPadding; + // clustererNN.mBoundaryMapSize = clustererNN.mBoundaryMapSizeRow * clustererNN.mBoundaryMapSizePadsPerRow; + // clustererNN.mIndexLookupSize = 3 * clustererNN.mNnClusterizerChargeArraySize; // local row, pad, time shift from flat index clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; @@ -124,38 +128,38 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) -{ - // Call after init of the clustererNN elements - for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { - int8_t skipCheckInRow = 0; - for (int p = 0; p < clustererNN.mBoundaryMapSizePadsPerRow; p++) { - int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + 
p; - clustererNN.mIsBoundary[i] = 1; - if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { - if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); - } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { - clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); - } - skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row - } - } - } -} - -void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) -{ - for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { - int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; - int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); - int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; - int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; - clustererNN.mIndexLookup[3 * i] = r; - clustererNN.mIndexLookup[3 * i + 1] = p; - clustererNN.mIndexLookup[3 * i + 2] = t; - } -} +// void GPUTPCNNClusterizerHost::createBoundary(GPUTPCNNClusterizer& clustererNN) +// { +// // Call after init of the clustererNN elements +// for (int r = 0; r < clustererNN.mBoundaryMapSizeRow; r++) { +// int8_t skipCheckInRow = 0; +// for (int p = 0; p < 
clustererNN.mBoundaryMapSizePadsPerRow; p++) { +// int32_t i = r * clustererNN.mBoundaryMapSizePadsPerRow + p; +// clustererNN.mIsBoundary[i] = 1; +// if (!skipCheckInRow && (p >= clustererNN.mBoundaryPadding || r >= clustererNN.mNnClusterizerSizeInputRow)) { +// if (r < (GPUTPCGeometry::EndIROC() + clustererNN.mNnClusterizerSizeInputRow)) { +// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - clustererNN.mNnClusterizerSizeInputRow))); +// } else if (r >= (GPUTPCGeometry::EndIROC() + 2 * clustererNN.mNnClusterizerSizeInputRow) && r < (o2::tpc::constants::MAXGLOBALPADROW + 2 * clustererNN.mNnClusterizerSizeInputRow)) { +// clustererNN.mIsBoundary[i] = (int32_t)((p - clustererNN.mBoundaryPadding) >= static_cast(GPUTPCGeometry::NPads(r - 2 * clustererNN.mNnClusterizerSizeInputRow))); +// } +// skipCheckInRow = (clustererNN.mIsBoundary[i] == 1); // No need to check further pads in this row +// } +// } +// } +// } + +// void GPUTPCNNClusterizerHost::createIndexLookup(GPUTPCNNClusterizer& clustererNN) +// { +// for (int32_t i = 0; i < clustererNN.mNnClusterizerChargeArraySize; i++) { +// int32_t r = CAMath::Floor(i / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; +// int32_t rest_1 = i % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); +// int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad; +// int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; +// clustererNN.mIndexLookup[3 * i] = r; +// clustererNN.mIndexLookup[3 * i + 1] = p; +// clustererNN.mIndexLookup[3 * i + 2] = t; +// } +// } // MockedOrtAllocator implementation to be able to use volatile assignment struct MockedOrtAllocator : OrtAllocator { diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7f8e2528b6561..8f1c8269339f6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -50,53 +50,57 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadfragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } + template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + + if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, 
clustererNN.mNnClusterizerSizeInputRow); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); - int32_t row_pos = row + r; - for (int32_t p = (-clustererNN.mNnClusterizerSizeInputPad + pad_offset); p <= (clustererNN.mNnClusterizerSizeInputPad + pad_offset); p++) { - int32_t pad_pos = pad + p; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); + for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { int32_t time_pos = time + t; - int32_t isBoundaryIndex = (row_pos + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + pad_pos + clustererNN.mBoundaryPadding; - if (!clustererNN.mIsBoundary[isBoundaryIndex] && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) { - CfChargePos tmp_pos(row_pos, pad_pos, time_pos); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set - if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; - } - } - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - // 
if(CAMath::Abs(static_cast(clustererNN.mInputData_16[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-6) { - // printf("Warning: (Charge) Charge difference at idx %d, batchStart %d, maxClusters %d, sector %d, row %d (%d), pad %d (%d), time %d (%d): %f / %f\n", glo_idx, batchStart, clusterer.mPmemory->counters.nClusters - 1, sector, row_pos, r, pad_pos, p, time_pos, t, static_cast(clustererNN.mInputData_16[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); - // } - } else if (dtype == 1) { - clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; - } - } else { + if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { clustererNN.mInputData_32[write_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } + } else { + CfChargePos tmp_pos(row + r, pad + p, time + t); + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } } + // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { 
+ // printf("Warning: Input data mismatch at index %d, %d - row, pad, time: %d, %d, %d : %f -> %f\n", glo_idx, glo_idx + batchStart, r, p, t, + // static_cast(clustererNN.mInputData_16_Test[write_idx]), static_cast(clustererNN.mInputData_16[write_idx])); + // } write_idx++; } } @@ -127,66 +131,107 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { + uint32_t glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; - uint32_t glo_idx = get_global_id(0); - uint32_t base_idx = CAMath::Floor(glo_idx / clustererNN.mNnClusterizerElementSize); + // Optimized division using bit operations + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerElementSize; uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerElementSize); + // Early exit for out-of-bounds threads + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); + + // Use dedicated neural network shared memory arrays for warp-level caching + // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()); - - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) { - uint32_t top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize; - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { // Only if deconvolution flags are not set - clustererNN.mClusterFlags[2 * base_idx] = 0; - 
clustererNN.mClusterFlags[2 * base_idx + 1] = 0; - for (uint16_t i = 0; i < 8; i++) { // This solution needs testing. It is not the same as the deconvolution flags - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - clustererNN.mClusterFlags[2 * base_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx + 1] = clustererNN.mClusterFlags[2 * base_idx]; - } + float central_charge = static_cast(chargeMap[peak].unpack()); + int32_t row = static_cast(peak.row()); + int32_t pad = static_cast(peak.pad()); + int32_t time = static_cast(peak.time()); + + // Handle index data with fewer branches + if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerChargeArraySize) { + uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index; + int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize; + + float index_values[3] = { + sector / 36.f, + row / 152.f, + static_cast(pad) / GPUTPCGeometry::NPads(row) + }; + if (dtype == 0) { - clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.mInputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.mInputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[output_idx] = (OrtDataType::Float16_t)index_values[data_idx]; } else { - clustererNN.mInputData_32[top_idx - 3] = sector / 36.f; - clustererNN.mInputData_32[top_idx - 2] = row / 152.f; - clustererNN.mInputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[output_idx] = index_values[data_idx]; } - } else if ((int32_t)transient_index < clustererNN.mNnClusterizerChargeArraySize) { - int32_t time = static_cast(peak.time()); - int32_t idxLookup = 3 * transient_index; - int32_t r = clustererNN.mIndexLookup[idxLookup] + row, p = 
clustererNN.mIndexLookup[idxLookup + 1] + pad, t = clustererNN.mIndexLookup[idxLookup + 2] + time; - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, r); - p += pad_offset; - int32_t isBoundaryIndex = (r + row_offset + clustererNN.mNnClusterizerSizeInputRow) * clustererNN.mBoundaryMapSizePadsPerRow + p + clustererNN.mBoundaryPadding; - - if (!clustererNN.mIsBoundary[isBoundaryIndex] && (t >= 0) && (t < TPC_MAX_FRAGMENT_LEN_GPU)) { - float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(r, p, t); - // if ((glo_idx % (clustererNN.mNnClusterizerElementSize*1000)) == (int)((clustererNN.mNnClusterizerChargeArraySize-1)/2.f)){ - // printf("glo_idx: %d, r: %d, p: %d, t: %d, tmp_pos: (%d, %d, %d), charge: %f, central_charge: %f\n", - // glo_idx, clustererNN.mIndexLookup[idxLookup], clustererNN.mIndexLookup[idxLookup + 1], clustererNN.mIndexLookup[idxLookup + 2], tmp_pos.row(), tmp_pos.pad(), tmp_pos.time(), chargeMap[tmp_pos].unpack(), central_charge); - // } - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + + // Handle deconvolution flags only once per cluster (last thread in element) + if (data_idx == 2 && !clustererNN.mNnClusterizerSetDeconvolutionFlags) { + uint8_t cluster_flags = 0; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); } - } else { + clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + } + return; + } + + // Main data processing - optimize index calculations + if ((int32_t)transient_index < 
clustererNN.mNnClusterizerChargeArraySize) { + // Optimize 3D index calculation + int32_t r_local = (transient_index / clustererNN.mNnClusterizerPadTimeSize) - clustererNN.mNnClusterizerSizeInputRow; + int32_t pad_time_slice = (transient_index % clustererNN.mNnClusterizerPadTimeSize); + int32_t p_local = (pad_time_slice / clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputPad; + int32_t t_local = (pad_time_slice % clustererNN.mNnClusterizerFullPadSize) - clustererNN.mNnClusterizerSizeInputTime; + + // Early boundary check for row + int32_t target_row = row + r_local; + int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + + if (is_row_boundary) { + // Use boundary fill value + float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)boundary_val; } else { - clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = boundary_val; } + return; + } + + // Calculate offsets + int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t target_pad = pad + p_local + pad_offset; + int32_t target_time = time + t_local; + + // Optimized boundary check + int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); + + float output_value; + if (is_boundary) { + output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + } else { + // Coalesced memory access - create position and read charge + CfChargePos 
tmp_pos(target_row, target_pad, target_time); + output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge + } + + // Write output with reduced branching + if (dtype == 0) { + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[glo_idx] = output_value; } } } @@ -501,7 +546,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= o2::tpc::constants::MAXGLOBALPADROW) { + if(row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); @@ -513,7 +558,6 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? offset : 0); } -// Legacy. Deprecated. GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) { if (pad < 0 || row < 0) { // Faster short-circuit diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index a3858d47eb99b..5659c61894c85 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -65,7 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7 + publishDeconvolutionFlags = 7, }; template From 3447927ba0ff0aafb1b790cd08d85a005a0b64c2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 09:58:15 +0200 Subject: [PATCH 09/14] IMproving kernel speed by 30% compared to original version. 
Next try: for-loop over row dimension as access is somewhat coalesced too --- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 2 + .../GPUTPCNNClusterizerKernels.cxx | 86 ++++++++++--------- 3 files changed, 50 insertions(+), 41 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0bffc525ac8cf..7c22d8123fdec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -64,6 +64,8 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerFullPadSize = 0; uint32_t mNnClusterizerFullTimeSize = 0; uint32_t mNnClusterizerPadTimeSize = 0; + uint32_t mNnClusterizerRowTimeSize = 0; + uint32_t mNnClusterizerRowTimeSizeFull = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; @@ -89,7 +91,6 @@ class GPUTPCNNClusterizer : public GPUProcessor // FP16 OrtDataType::Float16_t* mInputData_16 = nullptr; - OrtDataType::Float16_t* mInputData_16_Test = nullptr; OrtDataType::Float16_t* mModelProbabilities_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr; OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 17a55ed0df1e3..fd56d49de7921 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -94,6 +94,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerFullTimeSize = 2 * settings.nnClusterizerSizeInputTime + 1; clustererNN.mNnClusterizerChargeArraySize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * 
clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; + clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8f1c8269339f6..7765315c8f692 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -137,8 +137,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -156,9 +156,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); // Handle index data with fewer branches - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerChargeArraySize) { - uint32_t output_idx = base_idx * clustererNN.mNnClusterizerElementSize + transient_index; - int32_t data_idx = transient_index - clustererNN.mNnClusterizerChargeArraySize; + if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerRowTimeSize) { + int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; float index_values[3] = 
{ sector / 36.f, @@ -167,9 +167,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)boundary_val; - } else { - clustererNN.mInputData_32[glo_idx] = boundary_val; - } - return; - } - // Calculate offsets int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t target_pad = pad + p_local + pad_offset; - int32_t target_time = time + t_local; + for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + if (is_row_boundary) { + // Use boundary fill value + float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + } else { + clustererNN.mInputData_32[write_idx] = boundary_val; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + continue; + } - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); + // Calculate target pad and time + int32_t target_pad = pad + p_local; + int32_t target_time = time + t_local; - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by 
central charge - } + // Optimized boundary check + int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= TPC_MAX_FRAGMENT_LEN_GPU); - // Write output with reduced branching - if (dtype == 0) { - clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[glo_idx] = output_value; + float output_value; + if (is_boundary) { + output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + } else { + // Coalesced memory access - create position and read charge + CfChargePos tmp_pos(target_row, target_pad, target_time); + output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge + } + + // Write output with reduced branching + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position } } } From 6ef38a2fab6dc4bb9b2a9bd67b5596cd3992be15 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 16 Jul 2025 07:58:53 +0000 Subject: [PATCH 10/14] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7765315c8f692..1431367812e8c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -50,7 +50,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadfragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, 
clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } - template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { @@ -163,8 +162,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row) - }; + static_cast(pad) / GPUTPCGeometry::NPads(row)}; if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; @@ -191,7 +189,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= o2::tpc::constants::MAXGLOBALPADROW) { + if (row_current < 0 || row_current >= o2::tpc::constants::MAXGLOBALPADROW) { return 0; // Short-circuit for negative rows } else { return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); From 069a7e9c8cec0d7271c538c6e9eca49bb7a12451 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Jul 2025 17:50:58 +0200 Subject: [PATCH 11/14] Minor improvements for MC handling --- .../GPUTPCNNClusterizerKernels.cxx | 40 +++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1431367812e8c..f8cf0dc6a0127 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -291,9 +291,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters; uint32_t full_glo_idx = glo_idx + batchStart; - if (full_glo_idx >= maxClusterNum) { - return; - } int32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg1NumOutputNodes; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -302,6 +299,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= maxClusterNum) 
{ + if (withMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak, central_charge)); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + peak, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + return; + } + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.mNnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); @@ -389,6 +404,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; float central_charge = static_cast(chargeMap[peak].unpack()); @@ -397,6 +413,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= maxClusterNum) { + if (withMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(peak, central_charge)); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + peak, + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + return; + } + uint32_t model_output_index = glo_idx * clustererNN.mNnClusterizerModelReg2NumOutputNodes; if (clustererNN.mOutputDataClass[full_glo_idx] > 0) { From 4949b55f8030e9f12bcef34a88e99bf8de69ea63 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 17 Jul 2025 23:46:02 +0200 Subject: [PATCH 12/14] Beautifications to trigger the CI --- .../GPUTPCNNClusterizerKernels.cxx | 74 ++++++++++++------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index f8cf0dc6a0127..8940d40bbb574 100644 --- 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -53,47 +53,58 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); - auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + uint32_t glo_idx = get_global_id(0); if (glo_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { return; } - uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; // Potential optimization: Either choose mNnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + uint32_t write_idx = glo_idx * clustererNN.mNnClusterizerElementSize; CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)]; + int32_t row = static_cast(peak.row()); + int32_t pad = static_cast(peak.pad()); + int32_t time = static_cast(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { - bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || 
((row + r) < 0); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { - int32_t time_pos = time + t; - if (is_boundary || (time_pos < 0) || (time_pos >= TPC_MAX_FRAGMENT_LEN_GPU)) { - // Filling boundary just to make sure that no values are left unintentionally + for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { + int32_t target_row = row + r; + bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + + for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { + int32_t target_pad = pad + p; + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + + for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { + int32_t target_time = time + t; + + if (is_boundary || target_time < 0 || target_time >= TPC_MAX_FRAGMENT_LEN_GPU) { + // Fill boundary value + float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; } else { - clustererNN.mInputData_32[write_idx] = 
static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[write_idx] = boundary_value; } } else { - CfChargePos tmp_pos(row + r, pad + p, time + t); - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + CfChargePos tmp_pos(target_row, target_pad, target_time); + float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && r == 0 && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; } + if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else if (dtype == 1) { - clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + } else { + clustererNN.mInputData_32[write_idx] = normalized_charge; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -104,21 +115,28 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); + if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm; + 
clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm; + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm; } else { - clustererNN.mInputData_32[write_idx] = sector / 36.f; - clustererNN.mInputData_32[write_idx + 1] = row / 152.f; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx] = sector_norm; + clustererNN.mInputData_32[write_idx + 1] = row_norm; + clustererNN.mInputData_32[write_idx + 2] = pad_norm; } } + if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) { clustererNN.mClusterFlags[2 * glo_idx] = 0; clustererNN.mClusterFlags[2 * glo_idx + 1] = 0; - for (uint16_t i = 0; i < 8; i++) { + + for (uint16_t i = 0; i < 8; ++i) { Delta2 d = cfconsts::InnerNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); From f4729fd69b1d767964aa32c36e1a25fb83643753 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Jul 2025 08:31:47 +0200 Subject: [PATCH 13/14] Compile-fix --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8940d40bbb574..395230dab6bf1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -65,7 +65,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, clusterer.mPmemory->counters.nClusters - 1)]; + CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; int32_t row = 
static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); From 2af6060fd2b24a166b290b332b637a5fa5041139 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 18 Jul 2025 10:43:19 +0200 Subject: [PATCH 14/14] Fix int32_t error in fullCI build --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 395230dab6bf1..4cd0c094398df 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -173,7 +173,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); // Handle index data with fewer branches - if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index >= clustererNN.mNnClusterizerRowTimeSize) { + if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; @@ -203,7 +203,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread