From 5ef448cc7716f8f4d0e176f86095ce9094bb0d27 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 28 Jun 2025 20:17:35 +0200 Subject: [PATCH 01/15] Adding first version of kernel timers --- .../Global/GPUChainTrackingClusterizer.cxx | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 816ee43d50b15..93ab7fff4a334 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -641,6 +641,34 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; + // Maximum of 4 lanes supported + HighResTimer* nnTimers[12] = { + &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 0), + &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 1), + &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 2), + &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 3), + &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 4), + &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 5), + &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 6), + &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 7), + &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 8), + &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9), + &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10), + &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11) + }; + HighResTimer* nnFillInputTimers[4] { + &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_0_", 0), + &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_1_", 1), + &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_2_", 2), + &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3) + }; + HighResTimer* nnPublishingTimers[4] { + &getTimer("GPUTPCNNClusterizer_publish_0_", 0), + &getTimer("GPUTPCNNClusterizer_publish_1_", 1), + &getTimer("GPUTPCNNClusterizer_publish_2_", 2), + &getTimer("GPUTPCNNClusterizer_publish_3_", 3) + }; + if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; @@ -1001,7 +1029,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); // auto start0 = std::chrono::high_resolution_clock::now(); + if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Start(); } runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data + if(GetProcessingSettings().debugLevel >= 1) { nnFillInputTimers[lane]->Stop(); } if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data @@ -1011,6 +1041,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { 
(nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); @@ -1024,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); } } + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Stop(); } if (!clustererNNShadow.mNnClusterizerUseCfRegression) { + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); @@ -1038,7 +1071,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); } } + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Stop(); } if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) { + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); @@ -1052,12 +1087,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); } } + if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Stop(); } } } // auto stopNNs = std::chrono::high_resolution_clock::now(); // Publishing kernels + if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); } if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { @@ -1069,6 +1106,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } + if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); } // for(int i = 0; i < iSize; ++i) { // if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) { @@ -1090,7 +1128,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (clustererNNShadow.mNnClusterizerUseCfRegression) { // auto start1 = std::chrono::high_resolution_clock::now(); + if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Start(); } runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + if(GetProcessingSettings().debugLevel >= 1) { nnPublishingTimers[lane]->Stop(); } // auto stop1 = std::chrono::high_resolution_clock::now(); // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } From fcf46d9aa604a1cc58d0672c4ae0745c22ffee48 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 2 Jul 2025 16:52:34 +0200 Subject: [PATCH 02/15] Removing GPU_CONFIG_KEY from dpl-workflow.sh to set 
my own values --- prodtests/full-system-test/dpl-workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index 202352730ddc7..8c7e9cfd63f40 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -74,7 +74,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -GPU_CONFIG_KEY= +# GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From 7f7000263bb15f79bafab89bdcfa8c20e6d11a94 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 19 Jul 2025 11:11:00 +0200 Subject: [PATCH 03/15] Bug fixes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 91ca0fdd36072..28e17f44d774b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1023,7 +1023,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU()) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 4cd0c094398df..4ec0269da3bf9 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -335,7 +335,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread Date: Sat, 19 Jul 2025 11:17:50 +0200 Subject: [PATCH 04/15] undoing changes in dpl-workflow.sh --- prodtests/full-system-test/dpl-workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index fafa78c40a2e4..486be1a9160cd 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -76,7 +76,7 @@ fi GPU_INPUT=zsraw GPU_OUTPUT=tracks,clusters GPU_CONFIG= -# GPU_CONFIG_KEY= +GPU_CONFIG_KEY= TOF_CONFIG= TOF_INPUT=raw TOF_OUTPUT=clusters From 05a64bbf0a1c929e744d5cf10077f8e16a6cef5e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Jul 2025 11:34:17 +0200 Subject: [PATCH 05/15] Furhter fixes and beautifications --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 36 ++++++++++--------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 24 ++++++------- 5 files changed, 32 insertions(+), 31 deletions(-) diff --git 
a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 51a0add7dbeea..0b080bf54ca5c 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -275,6 +275,7 @@ AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters wil AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") +AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification") // CCDB AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 28e17f44d774b..c8d7b13839b8d 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1034,23 +1034,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } // NN evaluations - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Start(); } - if (clustererNNShadow.mNnInferenceInputDType == 0) { - if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); - } - } else if (clustererNNShadow.mNnInferenceInputDType == 1) { - if (clustererNNShadow.mNnInferenceOutputDType == 0) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); - } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { - (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); + if(clustererNNShadow.mNnClusterizerUseClassification) { + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Start(); } + if (clustererNNShadow.mNnInferenceInputDType == 0) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); + } + } else if (clustererNNShadow.mNnInferenceInputDType == 1) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, 
clustererNNShadow.mModelProbabilities_32); + } } } - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); } if (!clustererNNShadow.mNnClusterizerUseCfRegression) { - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Start(); } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); @@ -1064,9 +1066,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); } } - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 1]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Stop(); } if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) { - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Start(); } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { if (clustererNNShadow.mNnInferenceOutputDType == 0) { (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); @@ -1080,7 +1082,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); } } - if(GetProcessingSettings().debugLevel >= 1) { nnTimers[3*lane + 2]->Stop(); } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 2]->Stop(); } } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 7c22d8123fdec..69972c8a0651c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -43,6 +43,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int32_t mNnClusterizerChargeArraySize = -1; int32_t mNnClusterizerElementSize = -1; int8_t mNnClusterizerAddIndexData = 1; + int8_t mNnClusterizerUseClassification = 1; float mNnClassThreshold = 0.01; int8_t mNnSigmoidTrafoClassThreshold = 1; int8_t mNnClusterizerSetDeconvolutionFlags = 1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index fd56d49de7921..59969348ac42f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -106,6 +106,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification; clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags; if (clustererNN.mNnSigmoidTrafoClassThreshold) { clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - 
settings.nnClassThreshold)); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 4ec0269da3bf9..dd33a72f79389 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -117,18 +117,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm; - clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm; - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.mInputData_32[write_idx] = sector_norm; - clustererNN.mInputData_32[write_idx + 1] = row_norm; - clustererNN.mInputData_32[write_idx + 2] = pad_norm; + clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; + clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } @@ -178,8 +174,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR, + static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, static_cast(pad) / GPUTPCGeometry::NPads(row)}; if (dtype == 0) { @@ -339,7 +335,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { + if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerUseClassification <= 0)) { ClusterAccumulator pc; @@ -451,7 +447,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 0) { + if ((clustererNN.mOutputDataClass[full_glo_idx] > 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) { ClusterAccumulator pc; From 8f06331b9fbfbc6087b28a10e43d0ca259273387 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 21 Jul 2025 09:36:14 +0000 Subject: [PATCH 06/15] Please consider the following formatting changes --- .../Global/GPUChainTrackingClusterizer.cxx | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c8d7b13839b8d..b83670a0e7bbf 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -654,20 +654,17 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 8), &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9), &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10), - &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11) - }; - HighResTimer* nnFillInputTimers[4] { + &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11)}; + HighResTimer* nnFillInputTimers[4]{ &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_0_", 0), &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_1_", 1), &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_2_", 2), - &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3) - }; - HighResTimer* 
nnPublishingTimers[4] { + &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3)}; + HighResTimer* nnPublishingTimers[4]{ &getTimer("GPUTPCNNClusterizer_publish_0_", 0), &getTimer("GPUTPCNNClusterizer_publish_1_", 1), &getTimer("GPUTPCNNClusterizer_publish_2_", 2), - &getTimer("GPUTPCNNClusterizer_publish_3_", 3) - }; + &getTimer("GPUTPCNNClusterizer_publish_3_", 3)}; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; From 152f459441b167294047ac7fc7b5f6577ed1cd06 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Jul 2025 12:52:47 +0200 Subject: [PATCH 07/15] Removing unused timers --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b83670a0e7bbf..1fc3fbd9c1162 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -655,16 +655,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9), &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10), &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11)}; - HighResTimer* nnFillInputTimers[4]{ - &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_0_", 0), - &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_1_", 1), - &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_2_", 2), - &getTimer("GPUTPCNNClusterizer_fillInputNNSingleElement_3_", 3)}; - HighResTimer* nnPublishingTimers[4]{ - &getTimer("GPUTPCNNClusterizer_publish_0_", 0), - &getTimer("GPUTPCNNClusterizer_publish_1_", 1), - &getTimer("GPUTPCNNClusterizer_publish_2_", 2), - &getTimer("GPUTPCNNClusterizer_publish_3_", 3)}; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; From 5801e3a1c2558b98e4a2f6cc4f608b57110478b9 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Jul 2025 13:39:17 +0200 Subject: [PATCH 08/15] Moving Stop() of classification timer --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 1fc3fbd9c1162..28541b3ab6e92 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1036,8 +1036,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); } } + if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); } } - if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane]->Stop(); } if (!clustererNNShadow.mNnClusterizerUseCfRegression) { if(GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3*lane + 1]->Start(); } if (clustererNNShadow.mNnInferenceInputDType == 0) { From f4dcbaada692245924c46051aa1b0154ffb1aee7 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 21 Jul 2025 13:42:30 +0200 Subject: [PATCH 09/15] Adding force method to fill input like it is done on GPU --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h 
b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 0b080bf54ca5c..a1e259d269356 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -276,6 +276,7 @@ AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regress AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification") +AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, "Forces to use the fillInputNNGPU function") // CCDB AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 28541b3ab6e92..ee5c6b48500f1 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1008,7 +1008,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); // Filling the data - if (mRec->IsGPU()) { + if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { From e6482abc59f2efcc8c519cb7c322e48b543bcb7c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 23 Jul 2025 09:17:42 +0200 Subject: [PATCH 10/15] Removing unnecessary static asserts --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index 5659c61894c85..7469754594124 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -38,12 +38,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate { public: // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below - static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); - static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); - static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); - 
static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); - static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); - static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer)); static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer); struct GPUSharedMemory { // Regular cluster finder From f2d2b868b4064b7cb87aed1ac114eb74cf0885c2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 25 Jul 2025 16:14:18 +0200 Subject: [PATCH 11/15] Adding deterministic mode (unfortunately that did not make it deterministic on GPU -> general problem with ONNX) --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 8 +++++++- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 0c498e33d2e2c..04a5e0ba5c9fc 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -116,7 +116,7 @@ class OrtModel int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings - bool mInitialized = false; + bool mInitialized = false, mDeterministicMode = false; std::string mModelPath, mEnvName = "", mDeviceType = "CPU", mThreadAffinity = ""; // device options should be cpu, rocm, migraphx, cuda int32_t mIntraOpNumThreads = 1, mInterOpNumThreads = 1, mDeviceId = -1, mEnableProfiling = 0, mLoggingLevel = 0, mAllocateDeviceMemory = 0, mEnableOptimizations = 0; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 1cd9913efb6aa..cd1cc2ebdc3d9 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -68,8 +68,10 @@ void OrtModel::initOptions(std::unordered_map optionsM mEnableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); mEnableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); mEnvName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference"); + mDeterministicMode = (optionsMap.contains("deterministic-compute") ? 
std::stoi(optionsMap["deterministic-compute"]) : 0); - if (mDeviceType == "CPU") { + if (mDeviceType == "CPU") + { (mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads); (mPImplOrt->sessionOptions).SetInterOpNumThreads(mInterOpNumThreads); if (mIntraOpNumThreads > 1 || mInterOpNumThreads > 1) { @@ -99,6 +101,10 @@ void OrtModel::initOptions(std::unordered_map optionsM (mPImplOrt->sessionOptions).DisableProfiling(); } + if (mDeterministicMode > 0) { + (mPImplOrt->sessionOptions).AddConfigEntry("session_options.use_deterministic_compute", "1"); + } + (mPImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(mEnableOptimizations)); (mPImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(mLoggingLevel)); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index a1e259d269356..65697b7f7c08b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -256,6 +256,7 @@ AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Spe AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") +AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime where possible.
Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208") AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 59969348ac42f..15bfc38336215 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -54,6 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, + {"deterministic-compute", std::to_string(settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025) {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, From f03fdc3ff2c9021c864768074d640083fae3f475 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 25 Jul 2025 14:15:34 +0000 Subject: [PATCH 12/15] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index cd1cc2ebdc3d9..58d80eb9c0bf0 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -70,8 +70,7 @@ void OrtModel::initOptions(std::unordered_map optionsM mEnvName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference"); mDeterministicMode = (optionsMap.contains("deterministic-compute") ? 
std::stoi(optionsMap["deterministic-compute"]) : 0); - if (mDeviceType == "CPU") - { + if (mDeviceType == "CPU") { (mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads); (mPImplOrt->sessionOptions).SetInterOpNumThreads(mInterOpNumThreads); if (mIntraOpNumThreads > 1 || mInterOpNumThreads > 1) { From 52235a38ea0727fa762c075061042f6fc12ed009 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 28 Jul 2025 00:24:37 +0200 Subject: [PATCH 13/15] Adjusting for comment --- .../Global/GPUChainTrackingClusterizer.cxx | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index ee5c6b48500f1..d665cfe8168be 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,19 +642,21 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; // Maximum of 4 lanes supported - HighResTimer* nnTimers[12] = { - &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 0), - &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 1), - &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 2), - &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 3), - &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 4), - &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 5), - &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 6), - &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 7), - &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 8), - &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9), - &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10), - &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11)}; + HighResTimer* nnTimers[12]; + if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().debugLevel >= 1) { + nnTimers[0] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 0); + nnTimers[1] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 1); + nnTimers[2] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 2); + nnTimers[3] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 3); + nnTimers[4] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 4); + nnTimers[5] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 5); + nnTimers[6] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 6); + nnTimers[7] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 7); + nnTimers[8] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 8); + nnTimers[9] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9); + nnTimers[10] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10); + nnTimers[11] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11); + } if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; From 8c87d3783b1a8fbbd1a898ab7ae00a69840cfdb1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 28 Jul 2025 11:23:38 +0200 Subject: [PATCH 14/15] Adding deterministic mode --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 4 ++-- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index d665cfe8168be..07b332db1fc12 100644 --- 
a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -664,7 +664,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int32_t maxThreads = mRec->getNKernelHostThreads(true); // bool recreateMemoryAllocator = false; mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { - nnApplications[lane].init(nn_settings); + nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction); if (nnApplications[lane].mModelsUsed[0]) { SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId); (nnApplications[lane].mModelClass).setDeviceId(deviceId); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 15bfc38336215..7223e06fe2a55 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -28,7 +28,7 @@ using namespace o2::gpu; -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths_local; @@ -54,7 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, - {"deterministic-compute", std::to_string(settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025) + {"deterministic-compute", std::to_string(useDeterministicMode? 
1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025) {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index ed3c80320b632..414c4539a33c1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -45,9 +45,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); } - void init(const GPUSettingsProcessingNNclusterizer&); + void init(const GPUSettingsProcessingNNclusterizer&, bool = false); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void createBoundary(GPUTPCNNClusterizer&); void createIndexLookup(GPUTPCNNClusterizer&); From f8139a7b568440dca108ec0f1c9038199dc876e5 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Mon, 28 Jul 2025 09:24:15 +0000 Subject: [PATCH 15/15] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 7223e06fe2a55..ad635c15b9256 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -54,7 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, - {"deterministic-compute", std::to_string(useDeterministicMode? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025) + {"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025) {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)},
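
// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patches above): a minimal, standalone C++
// illustration of the per-lane kernel-timer layout introduced in PATCH 01 and
// refined in PATCHES 05-08 and 13. Only the indexing scheme (three timers per
// lane, nnTimers[3 * lane + step]) and the debugLevel >= 1 / doGPU gating
// mirror the patches; TimerStub stands in for o2::gpu::HighResTimer and
// runInference() is a placeholder for the ONNX model evaluations.
#include <array>
#include <chrono>

struct TimerStub { // stand-in for HighResTimer (Start/Stop accumulate wall time)
  std::chrono::steady_clock::time_point t0{};
  double totalSeconds = 0.;
  void Start() { t0 = std::chrono::steady_clock::now(); }
  void Stop() { totalSeconds += std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count(); }
};

constexpr int kMaxLanes = 4;      // "Maximum of 4 lanes supported"
constexpr int kStepsPerLane = 3;  // classification, regression 1, regression 2
std::array<TimerStub, kMaxLanes * kStepsPerLane> nnTimers;

void runInference(int /*step*/) { /* placeholder for mModelClass / mModelReg1 / mModelReg2 */ }

void evaluateLane(int lane, int debugLevel, bool doGPU)
{
  for (int step = 0; step < kStepsPerLane; ++step) {
    if (debugLevel >= 1 && doGPU) { nnTimers[kStepsPerLane * lane + step].Start(); }
    runInference(step); // each lane owns its own three timers, so concurrent lanes do not interfere
    if (debugLevel >= 1 && doGPU) { nnTimers[kStepsPerLane * lane + step].Stop(); }
  }
}
// ---------------------------------------------------------------------------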
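
// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patches above): a hedged illustration of how
// the "deterministic-compute" option added in PATCHES 11-15 maps onto the ONNX
// Runtime C++ API. The config key "session_options.use_deterministic_compute"
// and SetIntraOpNumThreads are taken from the patches themselves; the function
// name, model path and environment handling here are illustrative only, and, as
// the TODO in PATCH 11 notes, this does not guarantee bit-exact results on GPU.
#include <onnxruntime_cxx_api.h>

Ort::Session makeSession(Ort::Env& env, const char* modelPath, bool deterministic, int intraOpThreads)
{
  Ort::SessionOptions opts;
  opts.SetIntraOpNumThreads(intraOpThreads); // corresponds to nnInferenceIntraOpNumThreads
  if (deterministic) {
    // same config entry the patch sets when deterministic compute is requested
    opts.AddConfigEntry("session_options.use_deterministic_compute", "1");
  }
  return Ort::Session(env, modelPath, opts); // env must outlive the returned session
}
// ---------------------------------------------------------------------------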