From 9c8984c53d01d851f4294d5a68d6a8bbefd39295 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 00:55:54 +0200 Subject: [PATCH 1/6] Improve GPU filling kernel speed --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 181 +++++++++--------- .../GPUTPCNNClusterizerKernels.h | 4 +- 5 files changed, 104 insertions(+), 90 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0d466f13e5ef..c4b7b2e47cbd9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,6 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; + int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1035,7 +1036,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + countLoops++; + } } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1138,6 +1142,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } + LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9553437765c..b7bc1575d349a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -71,6 +71,7 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerPadTimeSize = 0; uint32_t mNnClusterizerRowTimeSize = 0; uint32_t mNnClusterizerRowTimeSizeFull = 0; + uint32_t mNnClusterizerRowTimeSizeThreads = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ae833ace2f648..582a0c6d7435a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -98,6 +98,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNN.mNnClusterizerRowTimeSizeThreads = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 1 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 55fefa7dcf149..1c76fc3f1fa57 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,15 +72,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { int32_t target_row = row + r; bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(target_row + row_offset); for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { int32_t target_time = time + t; @@ -143,125 +147,119 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) { + if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) { return; } - uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull; - uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull); + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads; + uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads); // Early exit for out-of-bounds threads - if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { - return; - } + // if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + // return; + // } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); // Use dedicated neural network shared memory arrays for warp-level caching // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - float central_charge = static_cast(chargeMap[peak].unpack()); + float central_charge = chargeMap[peak].unpack(); int32_t row = static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); + const int32_t npads_row = GPUTPCGeometry::NPads(row); + // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; - - float index_values[3] = { - static_cast(sector) / o2::tpc::constants::MAXSECTOR, - static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - static_cast(pad) / GPUTPCGeometry::NPads(row)}; - + // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; +// + // float index_values[3] = { + // static_cast(sector) / o2::tpc::constants::MAXSECTOR, + // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, + // static_cast(pad) / GPUTPCGeometry::NPads(row)}; +// + // if (dtype == 0) { + // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + // } else { + // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; + // } +// + // // Handle deconvolution flags only once per cluster (last thread in element) + // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { + // 
uint8_t cluster_flags = 0; + // for (uint16_t i = 0; i < 8; i++) { + // Delta2 d = cfconsts::InnerNeighbors[i]; + // CfChargePos tmp_pos = peak.delta(d); + // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); + // } + // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + // } + // return; + uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { - clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - } - - // Handle deconvolution flags only once per cluster (last thread in element) - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - uint8_t cluster_flags = 0; - for (uint16_t i = 0; i < 8; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; + clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } - return; } // Main data processing - optimize index calculations if (transient_index < clustererNN.mNnClusterizerRowTimeSize) { // Optimize 3D index calculation - int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; - int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow; - int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; - int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime; + const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; + const int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx; // Early boundary check for row - int32_t target_row = row + r_local; - int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow; + const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide // Calculate offsets - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= 
clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t boundary_row = target_row + row_offset; + const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); + const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; + + float output_value = boundary_val; + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + output_value = boundary_val; + } else { + const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); + if (!is_boundary) { + CfChargePos pos(target_row, target_pad, target_time); + // one load + one multiply + output_value = chargeMap[pos].unpack() * inverse_central_charge; } else { - clustererNN.mInputData_32[write_idx] = boundary_val; + output_value = boundary_val; } - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position - continue; - } - - // Calculate target pad and time - int32_t target_pad = pad + p_local; - int32_t target_time = time + t_local; - - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge } - - // Write output with reduced branching if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { clustererNN.mInputData_32[write_idx] = output_value; } - // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) { - // printf("Error: Write index out of bounds (central array)! 
%d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) { - // if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) { - // printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) { - // printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // } - - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + write_idx += clustererNN.mNnClusterizerFullTimeSize; } } } @@ -275,6 +273,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } + if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { + printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", + glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); + } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -364,6 +366,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { + printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", + full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); + } + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; @@ -737,16 +744,16 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? 
offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference) { - if (pad < 0 || row < 0) { // Faster short-circuit + if (pad < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network + return (pad >= npads_row); + } else if (row < iroc_row) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + } else if (row < maxrow) { + return (pad >= npads_reference); } else { return true; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index cd3d7783771fe..9353722568b1f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -57,7 +57,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7, + publishDeconvolutionFlags = 7 }; template @@ -66,7 +66,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate private: static GPUd() int32_t padOffset(int32_t, int32_t); static GPUd() int32_t rowOffset(int32_t, int32_t); - static GPUd() bool isBoundary(int32_t, int32_t, int32_t); + static GPUd() bool isBoundary(int32_t, int32_t, int32_t, int32_t, int32_t, int32_t); static GPUd() bool isBoundaryPublish(int32_t, int32_t, float&, float&); }; From a075c43cd3da71e56da1cd96769100c68c6c6c11 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 18:33:48 +0200 Subject: [PATCH 2/6] Adjusting parameter bounds and additional GPU kernel optimizations --- .../Definitions/GPUDefParametersDefaults.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../GPUTPCNNClusterizerKernels.cxx | 104 +++++++----------- 3 files changed, 40 insertions(+), 73 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 29aa3808506dc..648482304aca2 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -482,7 +482,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels - #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU 1024 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c4b7b2e47cbd9..96ee665bda4eb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,7 +642,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1036,10 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); - countLoops++; - } + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1142,7 +1138,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } - LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1c76fc3f1fa57..3a27b8740e46d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { @@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()); int32_t time = static_cast(peak.time()); - const int32_t npads_row = GPUTPCGeometry::NPads(row); - // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; -// - // float index_values[3] = { - // static_cast(sector) / o2::tpc::constants::MAXSECTOR, - // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - // static_cast(pad) / GPUTPCGeometry::NPads(row)}; -// - // if (dtype == 0) { - // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; - // } else { - // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - // } -// - // // Handle deconvolution flags only once per cluster (last thread in element) - // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - // uint8_t cluster_flags = 0; - // for (uint16_t i = 0; i < 8; i++) { - // Delta2 d = cfconsts::InnerNeighbors[i]; - // CfChargePos tmp_pos = peak.delta(d); - // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - // } - // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; - // } - // return; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; + const int32_t npads = GPUTPCGeometry::NPads(row); if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; } } @@ -220,46 +193,45 @@ GPUdii() 
void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; - const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide - - // Calculate offsets - // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; - const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t boundary_row = target_row + row_offset; - const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); - const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; - - float output_value = boundary_val; - - const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; - const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; - - for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { - if (is_row_boundary) { - output_value = boundary_val; - } else { - const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); - if (!is_boundary) { + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; + + if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { + for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; + } + return; + } else { + // Calculate offsets + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t npads = GPUTPCGeometry::NPads(target_row); + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { + if (target_pad >= npads || target_pad < 0 || is_time_boundary) { + output_value = clustererNN.mNnClusterizerBoundaryFillValue; + } else { CfChargePos pos(target_row, target_pad, target_time); // one load + one multiply output_value = chargeMap[pos].unpack() * inverse_central_charge; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - output_value = boundary_val; + clustererNN.mInputData_32[write_idx] = output_value; } + write_idx += clustererNN.mNnClusterizerFullTimeSize; } - if (dtype == 0) { - 
clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[write_idx] = output_value; - } - write_idx += clustererNN.mNnClusterizerFullTimeSize; + return; } } } From 587c3e6fadd812b0489891c28759dc3d6865e036 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 21:12:47 +0200 Subject: [PATCH 3/6] Adding back if statement for early exit --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 3a27b8740e46d..0f000451794ec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,9 +155,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { - // return; - // } + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); From 6e432576d34b871974fdb236bc9c6bc75619fcb8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 22:19:01 +0200 Subject: [PATCH 4/6] const'ing + fixing CPU kernel --- .../Global/GPUChainTrackingClusterizer.cxx | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 63 +++++++++---------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 96ee665bda4eb..1740e525937f3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1035,7 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 0f000451794ec..7927dcf572acc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } @@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, 
(uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); - float central_charge = static_cast(chargeMap[peak].unpack()); - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); + const float central_charge = static_cast(chargeMap[peak].unpack()); + const float inverse_charge = 1.f / central_charge; + + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { - int32_t target_row = row + r; - bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset); + for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) { + uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row)); + const int32_t npads_reference = is_boundary ? 
0 : GPUTPCGeometry::NPads(target_row - row_offset); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { - int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); - - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { - int32_t target_time = time + t; + for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) { + is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); + for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) { if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) { // Fill boundary value - float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + output_value = clustererNN.mNnClusterizerBoundaryFillValue; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = boundary_value; + clustererNN.mInputData_32[write_idx] = output_value; } } else { CfChargePos tmp_pos(target_row, target_pad, target_time); - float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + output_value = chargeMap[tmp_pos].unpack() * inverse_charge; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = normalized_charge; + clustererNN.mInputData_32[write_idx] = output_value; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } } @@ -143,7 +142,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); + const uint32_t glo_idx = get_global_id(0); auto& clusterer = 
processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -151,8 +150,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1))]; - float central_charge = chargeMap[peak].unpack(); - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); + const float central_charge = chargeMap[peak].unpack(); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { From bb795c4170289bae93b19223264dc09aacaebea0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 19 Oct 2025 09:43:08 +0200 Subject: [PATCH 5/6] Removing print statements --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7927dcf572acc..b214a8f549a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -244,10 +244,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } - if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { - printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", - glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); - } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -337,11 +333,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { - printf("Error: Global index out of bounds! 
%d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", - full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); - } - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; From f7cdc0be08798e9b10a490a8b08697d0826450bc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 Oct 2025 13:08:55 +0100 Subject: [PATCH 6/6] Fixing CI build issue --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index b214a8f549a56..ee0fa217b8095 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -195,7 +195,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { - for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else {
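
Note on the resulting fill-kernel layout (an illustrative standalone sketch, not code from the patches: FillGeom, fillCell, and charge() are names invented here, and the real kernel additionally handles float16 output, index data, and deconvolution flags). After this series, fillInputNNGPU assigns one thread per (row, time) cell of a cluster's input window, plus one extra thread per cluster for the three index values (hence mNnClusterizerRowTimeSizeThreads = mNnClusterizerRowTimeSize + 1 when index data is enabled). Each thread loops over the pad dimension, stepping its write index by mNnClusterizerFullTimeSize, and normalizes charges with one precomputed reciprocal multiply per element instead of a divide:

    #include <cstdint>

    // Geometry of one cluster's input window; mirrors the mNnClusterizer*
    // fields of the patches with simplified names.
    struct FillGeom {
      int32_t fullTimeSize;       // 2 * sizeInputTime + 1
      int32_t fullPadSize;        // 2 * sizeInputPad + 1
      int32_t padTimeSize;        // fullPadSize * fullTimeSize
      int32_t rowTimeSize;        // fullRowSize * fullTimeSize
      int32_t rowTimeSizeThreads; // rowTimeSize + 1 when index data is appended
      int32_t elementSize;        // charge window + 3 optional index values
    };

    // One logical GPU thread: fills every pad of one (row, time) cell of one
    // cluster. charge(p) stands in for the chargeMap lookup / boundary fill.
    void fillCell(uint32_t glo_idx, const FillGeom& g, float centralCharge,
                  float (*charge)(int32_t), float* input)
    {
      const uint32_t base_idx = glo_idx / g.rowTimeSizeThreads;             // cluster within batch
      const uint32_t transient = glo_idx - base_idx * g.rowTimeSizeThreads; // cell within cluster
      if (transient >= (uint32_t)g.rowTimeSize) {
        return; // the one extra thread per cluster writes the index values instead
      }
      const int32_t row_idx = transient / g.fullTimeSize;
      const int32_t time_idx = transient - row_idx * g.fullTimeSize;
      const float invCentral = 1.f / centralCharge; // one divide per cell, multiplies thereafter
      int32_t write_idx = base_idx * g.elementSize + row_idx * g.padTimeSize + time_idx;
      for (int32_t p = 0; p < g.fullPadSize; ++p) {
        input[write_idx] = charge(p) * invCentral; // boundary handling omitted here
        write_idx += g.fullTimeSize;               // same time bin, next pad
      }
    }

With this mapping, threads adjacent in glo_idx write adjacent time bins, so the global-memory stores stay coalesced, while the per-row boundary quantities are computed once per cell rather than once per element.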