From 82f2153cd68dfd0ae394621cd1df9d0093ace680 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 8 Jun 2025 14:22:21 +0200 Subject: [PATCH 1/8] First bug-fixes and optimizations for deconvolution flags --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 6 +++-- .../GPUTPCCFDeconvolution.cxx | 16 +++++++++---- .../TPCClusterFinder/GPUTPCCFDeconvolution.h | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 24 ++++++++++--------- 7 files changed, 32 insertions(+), 19 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index d8173a5b62a35..e75d5a5890e4b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -269,6 +269,7 @@ AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") +AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 64e6f5a31aaa7..58b06c95ad26a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -989,8 +989,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int withMC = (doGPU && propagateMCLabels); if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false); } // float time_clusterizer = 0, time_fill = 0, time_networks = 0; @@ -1102,7 +1104,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif } else { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index 429d51685e504..406f3cb1d6d5a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -22,18 +22,19 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) +GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, bool overwriteCharge) { CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions); + GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, overwriteCharge); } GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, const CfArray2D& peakMap, CfArray2D& chargeMap, const CfChargePos* positions, - const uint32_t digitnum) + const uint32_t digitnum, + bool overwriteCharge) { SizeT idx = get_global_id(0); @@ -111,9 +112,14 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t peakCount = (peakCount == 0) ? 1 : peakCount; PackedCharge charge = chargeMap[pos]; - PackedCharge p(charge.unpack() / peakCount, has3x3, split); - chargeMap[pos] = p; + if(overwriteCharge) { + PackedCharge p(charge.unpack() / peakCount, has3x3, split); + chargeMap[pos] = p; + } else { + PackedCharge p(charge.unpack(), has3x3, split); + chargeMap[pos] = p; + } } GPUdi() uint8_t GPUTPCCFDeconvolution::countPeaksInner( diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h index e971a042e95a4..413d2f087f87a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h @@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args); private: - static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t); + static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t, bool); static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*); static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index f7c2d13407b0e..980c0977aca65 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -44,6 +44,7 @@ class GPUTPCNNClusterizer : public GPUProcessor bool mNnClusterizerAddIndexData = true; float mNnClassThreshold = 0.01; bool mNnSigmoidTrafoClassThreshold = 1; + bool mNnClusterizerSetDeconvolutionFlags = true; int mNnClusterizerUseCfRegression = 0; int mNnClusterizerBatchedMode = 1; int mNnClusterizerTotalClusters = 1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 90f1d6e27246f..124320396d0d4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -92,6 +92,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags; if (clustererNN.mNnSigmoidTrafoClassThreshold) { clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8ca61602ab4e9..8575b1f47be16 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -126,11 +126,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); if (is_row_boundary) { if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } else { int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); @@ -164,15 +166,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[peak].unpack()); CfChargePos tmp_pos(row + r, pad + p, time + t); if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } } From 227d192e1c1f51cfe5ee3582c730508f11d70aa9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 8 Jun 2025 14:28:57 +0200 Subject: [PATCH 2/8] Adding publishing logic for deconvolution flags --- .../Global/GPUChainTrackingClusterizer.cxx | 4 +++ .../GPUTPCNNClusterizerKernels.cxx | 27 +++++++++++++++++++ .../GPUTPCNNClusterizerKernels.h | 1 + 3 files changed, 32 insertions(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 58b06c95ad26a..816ee43d50b15 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1003,6 +1003,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data + if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data + } + // auto stop0 = std::chrono::high_resolution_clock::now(); // auto start1 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8575b1f47be16..cb0f5e92468d9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -459,6 +459,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel + uint32_t idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart]; + + for(int i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + PackedCharge charge = chargeMap[tmp_pos]; + clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit()); + clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit()); + } + for(int i = 0; i < 16; i++) { + Delta2 d = cfconsts::OuterNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + PackedCharge charge = chargeMap[tmp_pos]; + clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak()); + clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak()); + } +} + // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index dac2bf9554849..179eb483cdf6b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -65,6 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, + publishDeconvolutionFlags = 7 }; template From 57ff901bc19f57f3f32dc4f156f80a8aba13e974 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 8 Jun 2025 15:18:58 +0200 Subject: [PATCH 3/8] Adjusting kernels.cmake --- GPU/GPUTracking/kernels.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 08d879fbb8e9a..05efbf1d6acdb 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPC o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB, bool overwriteCharge) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) @@ -134,6 +134,7 @@ o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPC o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) endif() o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP From 05a4d3f867fc4427c6331fcb80d7476ddd06d0e7 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sun, 8 Jun 2025 13:40:59 +0000 Subject: [PATCH 4/8] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCCFDeconvolution.cxx | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index 406f3cb1d6d5a..a93f970de5729 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -113,7 +113,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t PackedCharge charge = chargeMap[pos]; - if(overwriteCharge) { + if (overwriteCharge) { PackedCharge p(charge.unpack() / peakCount, has3x3, split); chargeMap[pos] = p; } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index cb0f5e92468d9..bf30d07ad2e77 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -126,8 +126,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart]; - for(int i = 0; i < 8; i++) { + for (int i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); PackedCharge charge = chargeMap[tmp_pos]; clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit()); clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit()); } - for(int i = 0; i < 16; i++) { + for (int i = 0; i < 16; i++) { Delta2 d = cfconsts::OuterNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); PackedCharge charge = chargeMap[tmp_pos]; From 464402feacc581fe44f827d4ea33f9c50967af2d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 9 Jun 2025 22:08:02 +0200 Subject: [PATCH 5/8] Bug-fix for time-position and boundary check in fillInputSingleElement --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index bf30d07ad2e77..99fbc1c4bf504 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -158,13 +158,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= TPC_MAX_FRAGMENT_LEN_GPU); + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU); if (!is_boundary) { float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(row + r, pad + p, time + t); + CfChargePos tmp_pos(row + r, pad + p, time_pos); if (dtype == 0) { clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { From e70e4d98bcf9322e1414b95da7ca44954bec4c45 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 10 Jun 2025 09:40:52 +0200 Subject: [PATCH 6/8] Fix for kernels.cmake and naming --- .../GPUTPCNNClusterizerKernels.cxx | 10 +++++----- GPU/GPUTracking/kernels.cmake | 18 +++++++++--------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 99fbc1c4bf504..58dd49630d8e6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -111,7 +111,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -182,7 +182,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); if (dtype == 0) { @@ -193,7 +193,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { auto& clustererNN = processors.tpcNNClusterer[sector]; uint32_t glo_idx = get_global_id(0); @@ -461,7 +461,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel uint32_t idx = get_global_id(0); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 05efbf1d6acdb..c7f56919d2ff8 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPC o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB, bool overwriteCharge) +o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB bool overwriteCharge) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) @@ -127,14 +127,14 @@ o2_gpu_add_kernel("GPUTPCCFGather" "=" o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT " LB) o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB) if(onnxruntime_FOUND) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) endif() o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP From f6a3d5aae9665832fcca897ac377650406f5611e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 12 Jun 2025 08:49:04 +0200 Subject: [PATCH 7/8] Changing to uint8_t --- GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx | 4 ++-- GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h | 2 +- GPU/GPUTracking/kernels.cmake | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index a93f970de5729..d6b8703a9b35d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, bool overwriteCharge) +GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t overwriteCharge) { CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); @@ -34,7 +34,7 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t CfArray2D& chargeMap, const CfChargePos* positions, const uint32_t digitnum, - bool overwriteCharge) + uint8_t overwriteCharge) { SizeT idx = get_global_id(0); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h index 413d2f087f87a..902e3a28fd21b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h @@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args); private: - static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t, bool); + static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t, uint8_t); static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*); static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*); diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index c7f56919d2ff8..2a59f98a6d5b4 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPC o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB bool overwriteCharge) +o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB uint8_t overwriteCharge) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) From ff620320d235483df7a90ee69cc2d5917331e3ed Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 12 Jun 2025 13:50:41 +0200 Subject: [PATCH 8/8] Adding kernel definition --- GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h | 1 + 1 file changed, 1 insertion(+) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 7879789bf91c8..8fe8e8ca68e44 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -497,6 +497,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishDeconvolutionFlags GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE