diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index 0c498e33d2e2c..04a5e0ba5c9fc 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -116,7 +116,7 @@ class OrtModel
   int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs

   // Environment settings
-  bool mInitialized = false;
+  bool mInitialized = false, mDeterministicMode = false;
   std::string mModelPath, mEnvName = "", mDeviceType = "CPU", mThreadAffinity = ""; // device options should be cpu, rocm, migraphx, cuda
   int32_t mIntraOpNumThreads = 1, mInterOpNumThreads = 1, mDeviceId = -1, mEnableProfiling = 0, mLoggingLevel = 0, mAllocateDeviceMemory = 0, mEnableOptimizations = 0;
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 1cd9913efb6aa..58d80eb9c0bf0 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -68,6 +68,7 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
   mEnableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0);
   mEnableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0);
   mEnvName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference");
+  mDeterministicMode = (optionsMap.contains("deterministic-compute") ? std::stoi(optionsMap["deterministic-compute"]) : 0);

   if (mDeviceType == "CPU") {
     (mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads);
@@ -99,6 +100,10 @@ void OrtModel::initOptions(std::unordered_map<std::string, std::string> optionsMap)
     (mPImplOrt->sessionOptions).DisableProfiling();
   }

+  if (mDeterministicMode > 0) {
+    (mPImplOrt->sessionOptions).AddConfigEntry("session_options.use_deterministic_compute", "1");
+  }
+
   (mPImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(mEnableOptimizations));
   (mPImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(mLoggingLevel));
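Note: on the ONNX Runtime side this switch is a plain session-options config entry. A minimal standalone sketch, independent of the O2 wrapper (the config key is the one used above; pinning the thread counts to 1 is an extra assumption for reproducibility, not part of this patch):

```cpp
#include <onnxruntime_cxx_api.h>

// Request deterministic kernels from ONNX Runtime via the generic config-entry API.
Ort::SessionOptions makeDeterministicSessionOptions()
{
  Ort::SessionOptions opts;
  opts.AddConfigEntry("session_options.use_deterministic_compute", "1");
  opts.SetIntraOpNumThreads(1); // single-threaded execution removes one more source of run-to-run variation
  opts.SetInterOpNumThreads(1);
  return opts;
}
```

As the TODO in GPUTPCNNClusterizerHost.cxx further down notes, this requests determinism but does not guarantee it for every kernel / execution-provider combination.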
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 51a0add7dbeea..65697b7f7c08b 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -256,6 +256,7 @@ AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Spe
 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
+AddOption(nnInferenceUseDeterministicCompute, int, 0, "", 0, "Enables deterministic compute in ONNX Runtime where possible. Can be [0, 1] -> see https://github.com/microsoft/onnxruntime/blob/3b97d79b3c12dbf93aa0d563f345714596dc8ab6/onnxruntime/core/framework/session_options.h#L208")
 AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime")
 AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data")
 AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >=3: All debugs")
@@ -275,6 +276,8 @@ AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters wil
 AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).")
 AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatenation of modes, e.g. c1:r1 (classification class 1, regression class 1)")
+AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters; else only the regression output is used and no clusters are rejected by classification")
+AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, "Forces use of the fillInputNNGPU kernel (element-wise input fill), even when running on CPU")
 // CCDB
 AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1, networks are fetched from CCDB, else locally")
 AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder into which the networks will be fetched")
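The new options follow the existing AddOption pattern and are read downstream through the processing settings, as the chain code below does. A short usage sketch (field names from this patch; the settings type is templated here so the snippet stands alone):

```cpp
// Hedged sketch: deciding the input-fill path from the new option. NnSettings
// stands in for GPUSettingsProcessingNNclusterizer from the O2 tree.
template <class NnSettings>
bool useGpuStyleFill(const NnSettings& nn, bool isGPU)
{
  // Element-wise fill either on an actual GPU or when forced via the new option.
  return isGPU || nn.nnClusterizerForceGpuInputFill != 0;
}
```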
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 13455efe6cb47..07b332db1fc12 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -641,13 +641,30 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
   GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes];

+  // 3 timers per lane (classification, regression 1, regression 2); a maximum of 4 lanes is supported
+  HighResTimer* nnTimers[12];
+  if (GetProcessingSettings().nn.applyNNclusterizer && GetProcessingSettings().debugLevel >= 1) {
+    nnTimers[0] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 0);
+    nnTimers[1] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 1);
+    nnTimers[2] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 2);
+    nnTimers[3] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 3);
+    nnTimers[4] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 4);
+    nnTimers[5] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 5);
+    nnTimers[6] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 6);
+    nnTimers[7] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 7);
+    nnTimers[8] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 8);
+    nnTimers[9] = &getTimer("GPUTPCNNClusterizer_ONNXClassification_0_", 9);
+    nnTimers[10] = &getTimer("GPUTPCNNClusterizer_ONNXRegression_1_", 10);
+    nnTimers[11] = &getTimer("GPUTPCNNClusterizer_ONNXRegression2_2_", 11);
+  }
+
   if (GetProcessingSettings().nn.applyNNclusterizer) {
     int32_t deviceId = -1;
     int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
     int32_t maxThreads = mRec->getNKernelHostThreads(true);
     // bool recreateMemoryAllocator = false;
     mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
-      nnApplications[lane].init(nn_settings);
+      nnApplications[lane].init(nn_settings, GetProcessingSettings().deterministicGPUReconstruction);
       if (nnApplications[lane].mModelsUsed[0]) {
         SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId);
         (nnApplications[lane].mModelClass).setDeviceId(deviceId);
@@ -993,9 +1010,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

         // Filling the data
-        if (mRec->IsGPU()) {
+        if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) {
           // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
-          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
+          runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
         } else {
           // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
           runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
@@ -1006,20 +1023,25 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }

         // NN evaluations
-        if (clustererNNShadow.mNnInferenceInputDType == 0) {
-          if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-            (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
-          } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-            (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
-          }
-        } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
-          if (clustererNNShadow.mNnInferenceOutputDType == 0) {
-            (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
-          } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
-            (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
+        if (clustererNNShadow.mNnClusterizerUseClassification) {
+          if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane]->Start(); }
+          if (clustererNNShadow.mNnInferenceInputDType == 0) {
+            if (clustererNNShadow.mNnInferenceOutputDType == 0) {
+              (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16);
+            } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+              (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32);
+            }
+          } else if (clustererNNShadow.mNnInferenceInputDType == 1) {
+            if (clustererNNShadow.mNnInferenceOutputDType == 0) {
+              (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16);
+            } else if (clustererNNShadow.mNnInferenceOutputDType == 1) {
+              (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32);
+            }
           }
+          if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane]->Stop(); }
         }
         if (!clustererNNShadow.mNnClusterizerUseCfRegression) {
+          if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane + 1]->Start(); }
           if (clustererNNShadow.mNnInferenceInputDType == 0) {
             if (clustererNNShadow.mNnInferenceOutputDType == 0) {
               (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16);
@@ -1033,7 +1055,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
               (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32);
             }
           }
+          if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane + 1]->Stop(); }
           if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) {
+            if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane + 2]->Start(); }
             if (clustererNNShadow.mNnInferenceInputDType == 0) {
               if (clustererNNShadow.mNnInferenceOutputDType == 0) {
                 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16);
@@ -1047,6 +1071,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
                 (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32);
               }
             }
+            if (GetProcessingSettings().debugLevel >= 1 && doGPU) { nnTimers[3 * lane + 2]->Stop(); }
           }
         }
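The two input-fill paths trade parallelism against redundant work, as the inline comments above state. A schematic CPU-side illustration (function names hypothetical; the real work happens in the fillInputNNGPU / fillInputNNCPU kernels launched above):

```cpp
#include <cstddef>

// GPU-style fill: one thread per tensor element; trivially parallel, but each
// element recomputes its addressing from scratch.
void fillElementWise(float* input, size_t nCandidates, size_t elementSize)
{
  for (size_t idx = 0; idx < nCandidates * elementSize; ++idx) { // idx = one GPU thread
    input[idx] = 0.f; // placeholder for the per-element charge/index computation
  }
}

// CPU-style fill: one iteration per candidate fills its whole input window, so
// per-candidate state (row, pad, time offsets) is computed once and reused.
void fillPerCandidate(float* input, size_t nCandidates, size_t elementSize)
{
  for (size_t c = 0; c < nCandidates; ++c) {
    for (size_t e = 0; e < elementSize; ++e) {
      input[c * elementSize + e] = 0.f; // placeholder
    }
  }
}
```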
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index 7c22d8123fdec..69972c8a0651c 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -43,6 +43,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int32_t mNnClusterizerChargeArraySize = -1;
   int32_t mNnClusterizerElementSize = -1;
   int8_t mNnClusterizerAddIndexData = 1;
+  int8_t mNnClusterizerUseClassification = 1;
   float mNnClassThreshold = 0.01;
   int8_t mNnSigmoidTrafoClassThreshold = 1;
   int8_t mNnClusterizerSetDeconvolutionFlags = 1;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index fd56d49de7921..ad635c15b9256 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -28,7 +28,7 @@

 using namespace o2::gpu;

-void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
+void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode)
 {
   std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
   std::vector<std::string> reg_model_paths_local;
@@ -54,6 +54,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
     {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
     {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
     {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
+    {"deterministic-compute", std::to_string(useDeterministicMode ? 1 : settings.nnInferenceUseDeterministicCompute)}, // TODO: This unfortunately doesn't guarantee determinism (25.07.2025)
     {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)},
     {"profiling-output-path", settings.nnInferenceOrtProfilingPath},
     {"logging-level", std::to_string(settings.nnInferenceVerbosity)},
@@ -106,6 +107,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN)
   clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode;
   clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue;
   clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold;
+  clustererNN.mNnClusterizerUseClassification = settings.nnClusterizerUseClassification;
   clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags;
   if (clustererNN.mNnSigmoidTrafoClassThreshold) {
     clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold));
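The threshold handling in initClusterizer avoids a per-candidate sigmoid: when nnSigmoidTrafoClassThreshold is set, the cut is mapped once through the inverse sigmoid, so raw network outputs can be compared directly. The identity behind the std::log expression above:

```cpp
#include <cmath>

// sigmoid(x) >= t  <=>  x >= log(t / (1 - t))   for t in (0, 1)
inline float logitThreshold(float t)
{
  return std::log(t / (1.f - t));
}
// e.g. logitThreshold(0.5f) == 0.f: a 0.5 probability cut becomes a cut at raw output 0.
```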
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index ed3c80320b632..414c4539a33c1 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -45,9 +45,9 @@ class GPUTPCNNClusterizerHost
 {
  public:
   GPUTPCNNClusterizerHost() = default;
-  GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); }
+  GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, bool useDeterministicMode = false) { init(settings, useDeterministicMode); }

-  void init(const GPUSettingsProcessingNNclusterizer&);
+  void init(const GPUSettingsProcessingNNclusterizer&, bool = false);
   void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
   void createBoundary(GPUTPCNNClusterizer&);
   void createIndexLookup(GPUTPCNNClusterizer&);
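The kernel changes below inline the index features instead of storing them in temporaries. Each feature is normalized to [0, 1) before being appended to the input tensor; a standalone sketch with the O2 constants spelled out (MAXSECTOR = 36, MAXGLOBALPADROW = 152; the pad count varies per row):

```cpp
// Hedged standalone version of the index-data normalization used in the kernels.
inline void indexFeatures(int sector, int row, int pad, float padsInRow, float out[3])
{
  out[0] = static_cast<float>(sector) / 36.f;   // o2::tpc::constants::MAXSECTOR
  out[1] = static_cast<float>(row) / 152.f;     // o2::tpc::constants::MAXGLOBALPADROW
  out[2] = static_cast<float>(pad) / padsInRow; // GPUTPCGeometry::NPads(row)
}
```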
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 4cd0c094398df..dd33a72f79389 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -117,18 +117,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<...>
-      float sector_norm = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
-      float row_norm = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
-      float pad_norm = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
       if (dtype == 0) {
-        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)sector_norm;
-        clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)row_norm;
-        clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)pad_norm;
+        clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR);
+        clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW);
+        clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / GPUTPCGeometry::NPads(row));
       } else {
-        clustererNN.mInputData_32[write_idx] = sector_norm;
-        clustererNN.mInputData_32[write_idx + 1] = row_norm;
-        clustererNN.mInputData_32[write_idx + 2] = pad_norm;
+        clustererNN.mInputData_32[write_idx] = static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR;
+        clustererNN.mInputData_32[write_idx + 1] = static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW;
+        clustererNN.mInputData_32[write_idx + 2] = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
       }
     }
@@ -178,8 +174,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<...>
+      ...static_cast<float>(sector) / o2::tpc::constants::MAXSECTOR,
+        static_cast<float>(row) / o2::tpc::constants::MAXGLOBALPADROW, static_cast<float>(pad) / GPUTPCGeometry::NPads(row)};
       if (dtype == 0) {
@@ -335,11 +331,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<...>
-      if (...= 1)) {
+      if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerUseClassification <= 0)) {
         ClusterAccumulator pc;
@@ -451,7 +447,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<...>
-      if (clustererNN.mOutputDataClass[full_glo_idx] > 0) {
+      if ((clustererNN.mOutputDataClass[full_glo_idx] > 0) || (clustererNN.mNnClusterizerUseClassification <= 0)) {
         ClusterAccumulator pc;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index 5659c61894c85..7469754594124 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -38,12 +38,6 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 {
  public:
   // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
   static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer);

   struct GPUSharedMemory
   {
     // Regular cluster finder
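For clarity, the publishing condition added in the Kernels.cxx hunks above reduces to a single gate (helper name hypothetical):

```cpp
#include <cstdint>

// With classification disabled (<= 0) every candidate is published and only the
// regression output is used; otherwise the predicted class must be positive.
inline bool publishCluster(int8_t useClassification, int32_t outputClass)
{
  return (outputClass > 0) || (useClassification <= 0);
}
```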