From 9ab32454718d771993286e7fe252362521e697a9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 12 Sep 2025 00:39:37 +0200 Subject: [PATCH 1/4] Bug-fix for MC labels --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 15ee6b6119022..59a355780d9ee 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -46,7 +46,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; int8_t isAccepted = (clustererNN.mNnClusterizerUseClassification ? (clustererNN.mOutputDataClass[CAMath::Min(glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] > 0) : 1); GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, reinterpret_cast(smem), chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow, isAccepted); } @@ -464,7 +464,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= maxClusterNum) { From b756dcc19b4e75bb7fccbfc042318e61370c79e9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 12 Sep 2025 00:42:17 +0200 Subject: [PATCH 2/4] Switch on timers on CPU for the first four lanes --- .../Global/GPUChainTrackingClusterizer.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fd3699ae4d125..5c2a13df1a2b7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1052,7 +1052,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // NN evaluations if(clustererNNShadow.mNnClusterizerUseClassification) { - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); @@ -1066,13 +1066,13 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); } } - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane]->Stop(); } // doGPU || lane<4 -> only for GPU or first 4 CPU lanes (to limit number of concurrent timers). At least gives some statistics for CPU time... if (nn_settings.nnClusterizerVerbosity > 3) { LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN classification inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } if (!clustererNNShadow.mNnClusterizerUseCfRegression) { - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); @@ -1086,9 +1086,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); } } - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 1]->Stop(); } if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) { - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); @@ -1102,7 +1102,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); } } - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && (doGPU || lane < 4)) { nnTimers[3*lane + 2]->Stop(); } } if (nn_settings.nnClusterizerVerbosity > 3) { LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with NN regression inference. Loop=" << batch << ". (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; From b5ace3556bd784ef76c15a21fbd6c5003023d0f0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 17 Sep 2025 23:41:37 +0200 Subject: [PATCH 3/4] Improved boundary checking for input filling and cluster publishing --- .../Global/GPUChainTrackingClusterizer.cxx | 6 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 4 + .../GPUTPCNNClusterizerHost.cxx | 5 +- .../GPUTPCNNClusterizerHost.h | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 156 +++++++++++++++--- .../GPUTPCNNClusterizerKernels.h | 1 + 6 files changed, 147 insertions(+), 27 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 4a7552069c61e..619940ff6d3dd 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -709,6 +709,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; } }); + const int16_t maxFragmentLen = GetProcessingSettings().overrideClusterizerFragmentLen; + const uint32_t maxAllowedTimebin = param().par.continuousTracking ? std::max(param().continuousMaxTimeBin, maxFragmentLen) : TPC_MAX_TIME_BIN_TRIGGERED; for (int32_t sector = 0; sector < NSECTORS; sector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; @@ -716,12 +718,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.mDeviceId = deviceId; clustererNN.mISector = sector; clustererNN.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; - nnApplications[lane].initClusterizer(nn_settings, clustererNN); + nnApplications[lane].initClusterizer(nn_settings, clustererNN, maxFragmentLen, maxAllowedTimebin); if (doGPU) { clustererNNShadow.mDeviceId = deviceId; clustererNNShadow.mISector = sector; clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; - nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); + nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow, maxFragmentLen, maxAllowedTimebin); } if (nn_settings.nnClusterizerVerbosity > 2) { LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Processor initialized. Sector " << sector << ", lane " << lane << ", max clusters " << clustererNN.mNnClusterizerTotalClusters << " (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index a6b0b081fc3dd..0b9553437765c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -60,6 +60,10 @@ class GPUTPCNNClusterizer : public GPUProcessor int32_t mISector = -1; int32_t mDeviceId = -1; + // charge array boundaries + int32_t maxFragmentLen = -1; + int32_t maxAllowedTimebin = -1; // == tpcMaxTimeBin + // GPU optimizations uint32_t mNnClusterizerFullRowSize = 0; uint32_t mNnClusterizerFullPadSize = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ad635c15b9256..ae833ace2f648 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -21,6 +21,7 @@ #include "GPUReconstruction.h" #include "GPUTPCGeometry.h" #include "DataFormatsTPC/Constants.h" +#include "clusterFinderDefs.h" #ifdef GPUCA_HAS_ONNX #include @@ -84,7 +85,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set } } -void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN, int32_t maxFragmentLen, int32_t maxAllowedTimebin) { clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; @@ -109,6 +110,8 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification; clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags; + clustererNN.maxFragmentLen = maxFragmentLen == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxFragmentLen; + clustererNN.maxAllowedTimebin = maxAllowedTimebin == -1 ? TPC_MAX_FRAGMENT_LEN_GPU : maxAllowedTimebin; if (clustererNN.mNnSigmoidTrafoClassThreshold) { clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 414c4539a33c1..8f8465d5dca34 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -48,7 +48,7 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); } void init(const GPUSettingsProcessingNNclusterizer&, bool = false); - void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&, int32_t = -1, int32_t = -1); void createBoundary(GPUTPCNNClusterizer&); void createIndexLookup(GPUTPCNNClusterizer&); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 59a355780d9ee..cd8a73542f0e1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -85,7 +85,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= TPC_MAX_FRAGMENT_LEN_GPU) { + if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) { // Fill boundary value float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); if (dtype == 0) { @@ -229,7 +229,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= TPC_MAX_FRAGMENT_LEN_GPU); + int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); float output_value; if (is_boundary) { @@ -340,7 +340,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(full_glo_idx, maxClusterNum - 1)]; + uint32_t peakIndex = CAMath::Min(full_glo_idx, maxClusterNum - 1); + CfChargePos peak = clusterer.mPfilteredPeakPositions[peakIndex]; float central_charge = static_cast(chargeMap[peak].unpack()); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); @@ -365,11 +366,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(peak, central_charge)); @@ -390,37 +388,113 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread UpdateClusterError2ByState bool notSinglePad = false, notSingleTime = false; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); - notSinglePad |= (d.x != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0); - notSingleTime |= (d.y != 0) && (static_cast(chargeMap[tmp_pos].unpack()) > 0); + float v = static_cast(chargeMap[tmp_pos].unpack()); + notSinglePad |= (d.x != 0) && (v > 0.f); + notSingleTime |= (d.y != 0) && (v > 0.f); } + float publishPadPosition = 0.f, publishTimePosition = 0.f; if (dtype == 0) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(); + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(), - static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(), + publishPadPosition, notSinglePad ? clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat() : 0.f, - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(), + (clusterer.mPmemory->fragment).start + publishTimePosition, notSingleTime ? clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat() : 0.f, clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); - } else if (dtype == 1) { + } else { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], - static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index], + publishPadPosition, notSinglePad ? clustererNN.mOutputDataReg1_32[model_output_index + 2] : 0.f, - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1], + (clusterer.mPmemory->fragment).start + publishTimePosition, notSingleTime ? clustererNN.mOutputDataReg1_32[model_output_index + 3] : 0.f, clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } + // if (boundaryFlag != 0) { // Prints the entire NN input for the given index + // // Build a simple buffer manually (float with 3 decimals) + // const int MAX_CHARS = 4096; + // char buffer[MAX_CHARS]; + // int pos = 0; +// + // auto appendChar = [&](char c) { + // if (pos < MAX_CHARS - 1) buffer[pos++] = c; + // }; + // auto appendStr = [&](const char* s) { + // while (*s && pos < MAX_CHARS - 1) buffer[pos++] = *s++; + // }; + // auto appendUInt = [&](uint32_t v) { + // char tmp[16]; int tp = 0; + // if (v == 0) { appendChar('0'); return; } + // while (v && tp < 16) { tmp[tp++] = char('0' + (v % 10)); v /= 10; } + // while (tp--) appendChar(tmp[tp]); + // }; + // auto appendInt = [&](int v) { + // if (v < 0) { appendChar('-'); v = -v; } + // appendUInt((uint32_t)v); + // }; + // auto appendFloat = [&](float f) { + // if (f < 0) { appendChar('-'); f = -f; } + // int ip = (int)f; + // float frac = f - (float)ip; + // appendInt(ip); + // appendChar('.'); + // for (int i = 0; i < 3; i++) { + // frac *= 10.f; + // int d = (int)frac; + // appendChar((char)('0' + (d < 0 ? 0 : (d > 9 ? 9 : d)))); + // frac -= d; + // if (frac < 0) frac = 0; + // } + // }; +// + // appendStr("(NN CLUS) DEBUG: Boundary cluster detected (sector "); + // appendUInt(sector); + // appendStr(", row "); + // appendUInt(peak.row()); + // appendStr(", pad "); + // appendFloat(publishPadPosition); + // appendStr(", time "); + // appendFloat(publishTimePosition); + // appendStr(") [glo_idx="); + // appendUInt(glo_idx); + // appendStr(" elemSize="); + // appendInt(clustererNN.mNnClusterizerElementSize); + // appendStr(" dtype="); + // appendInt(dtype); + // appendStr("] INPUT:"); +// + // int elemSize = clustererNN.mNnClusterizerElementSize; + // int baseIdx = glo_idx * elemSize; +// + // int maxPrint = elemSize; + // for (int i = 0; i < maxPrint; ++i) { + // appendChar(' '); + // float v = (dtype == 0) ? clustererNN.mInputData_16[baseIdx + i].ToFloat() + // : clustererNN.mInputData_32[baseIdx + i]; + // appendFloat(v); + // if (pos > (MAX_CHARS - 32)) { appendStr(" ..."); break; } + // } +// + // buffer[pos] = 0; + // printf("%s\n", buffer); + // } + tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (clustererNN.mNnClusterizerUseClassification) { - rejectCluster |= (clustererNN.mOutputDataClass[CAMath::Min(full_glo_idx, (uint32_t)clusterer.mPmemory->counters.nClusters - 1)] <= 0); + rejectCluster |= (clustererNN.mOutputDataClass[peakIndex] <= 0); } if (rejectCluster) { if (clusterer.mPclusterPosInRow) { @@ -509,19 +583,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(), - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(), + publishPadPosition, clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + publishTimePosition, clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(), clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8], - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index], + publishPadPosition, clustererNN.mOutputDataReg2_32[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + publishTimePosition, clustererNN.mOutputDataReg2_32[model_output_index + 6], clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); @@ -558,18 +639,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(); + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(); + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(), - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(), + publishPadPosition, clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(), + (clusterer.mPmemory->fragment).start + publishTimePosition, clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(), clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { + publishPadPosition = static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1]; + publishTimePosition = static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3]; + isBoundaryPublish(full_glo_idx, static_cast(peak.row()), publishPadPosition, publishTimePosition); pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9], - static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1], + publishPadPosition, clustererNN.mOutputDataReg2_32[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3], + (clusterer.mPmemory->fragment).start + publishTimePosition, clustererNN.mOutputDataReg2_32[model_output_index + 7], clustererNN.mClusterFlags[2 * glo_idx], clustererNN.mClusterFlags[2 * glo_idx + 1]); @@ -664,3 +751,26 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int return true; } } + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time) +{ + if(pad < 0) { + // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU); + pad = 0.f; + return true; + } else if (pad >= static_cast(GPUTPCGeometry::NPads(row))) { + // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad >= static_cast(GPUTPCGeometry::NPads(row): row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU); + pad = static_cast(GPUTPCGeometry::NPads(row) - 1); + return true; + } else if (time < 0) { + // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU); + time = 0.f; + return true; + } else if (time >= TPC_MAX_FRAGMENT_LEN_GPU) { + // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, time >= TPC_MAX_FRAGMENT_LEN_GPU: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU); + time = static_cast(TPC_MAX_FRAGMENT_LEN_GPU - 1); + return true; + } else { + return false; + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 9c93726a097b7..cd3d7783771fe 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -67,6 +67,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate static GPUd() int32_t padOffset(int32_t, int32_t); static GPUd() int32_t rowOffset(int32_t, int32_t); static GPUd() bool isBoundary(int32_t, int32_t, int32_t); + static GPUd() bool isBoundaryPublish(int32_t, int32_t, float&, float&); }; } // namespace o2::gpu From 82d76e57f871866476a630f36bcc4ace6e18260d Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 17 Sep 2025 21:42:23 +0000 Subject: [PATCH 4/4] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index cd8a73542f0e1..55fefa7dcf149 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -427,7 +427,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (MAX_CHARS - 32)) { appendStr(" ..."); break; } // } -// + // // buffer[pos] = 0; // printf("%s\n", buffer); // } @@ -754,7 +754,7 @@ GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int GPUd() bool GPUTPCNNClusterizerKernels::isBoundaryPublish(int32_t idx, int32_t row, float& pad, float& time) { - if(pad < 0) { + if (pad < 0) { // printf("(NN CLUS) WARNING: Boundary detected, idx = %d, pad < 0: row %d, pad %f (%d, %d), time %f (%d, %d)\n", idx, row, pad, 0, static_cast(GPUTPCGeometry::NPads(row)), time, 0, TPC_MAX_FRAGMENT_LEN_GPU); pad = 0.f; return true;