diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index 93549178848ca..cbd8501f9898f 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -84,7 +84,7 @@ class OrtModel
   // Environment settings
   bool mInitialized = false;
-  std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
+  std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda
   int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0;

   std::string printShape(const std::vector<int64_t>&);
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index fc784dd14d2dc..5e9f1a8b0a5b6 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map<std::string, std::string> optionsMap)
   if (!optionsMap["model-path"].empty()) {
     modelPath = optionsMap["model-path"];
     device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU");
-    dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float");
     deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0);
     allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0);
     intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0);
@@ -226,19 +225,18 @@ template std::vector OrtModel::inference
 template <class I, class O>
 void OrtModel::inference(I* input, size_t input_size, O* output)
 {
-  std::vector<int64_t> inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]};
+  std::vector<int64_t> inputShape{input_size, (int64_t)mInputShapes[0][1]};
   Ort::Value inputTensor = Ort::Value(nullptr);
   if constexpr (std::is_same_v<I, OrtDataType::Float16_t>) {
-    inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast<Ort::Float16_t*>(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   } else {
-    inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size());
+    inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size());
   }
-  std::vector<int64_t> outputShape{inputShape[0], mOutputShapes[0][1]};
-  size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]);
-  Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size());
+  std::vector<int64_t> outputShape{input_size, mOutputShapes[0][1]};
+  Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size());

-  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here
+  (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here
 }

 template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*);
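The change to OrtModel::inference(I*, size_t, O*) above redefines the size argument: it is now the number of input rows (the batch dimension), and the flat element counts are derived internally as input_size * mInputShapes[0][1] and input_size * mOutputShapes[0][1]. A hedged caller sketch, not part of the patch (it assumes an initialized o2::ml::OrtModel and an available inference<float, float> instantiation):

    #include <vector>
    #include "ML/OrtInterface.h"

    // Illustrative only: nIn/nOut stand for mInputShapes[0][1] and mOutputShapes[0][1].
    void runBatched(o2::ml::OrtModel& model, std::vector<float>& in, std::vector<float>& out, size_t nRows)
    {
      // Preconditions: in.size() == nRows * nIn and out.size() == nRows * nOut.
      model.inference(in.data(), nRows, out.data()); // pass the row count, not the flat element count
    }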
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 40a7fc71cbb4d..83f6e320b8f5b 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -228,7 +228,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural
 AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)")
 AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id")
 AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference")
-AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16
+AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the input datatype for inference (FP32 (default), FP16)") // fp32 or fp16
+AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the output datatype for inference (FP32 (default), FP16)") // fp32 or fp16
 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.")
 AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347")
@@ -249,6 +250,16 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c
 AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.")
 AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path")
 AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).")
+// CCDB
+AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1, networks are fetched from CCDB, otherwise they are loaded from local files")
+AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched")
+AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks")
+AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatenation of modes, e.g. c1:r1 (classification class 1, regression class 1)")
+AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the regression networks with and without momentum output")
+AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between networks with different layer types. Options: FC, CNN")
+AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between networks with different layer types. Options: FC, CNN")
+AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp")
+AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].")
 AddHelp("help", 'h')
 EndConfig()

diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index a48050a6cacbc..1638e134a4d6a 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
   }

 #ifdef GPUCA_HAS_ONNX
+  const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
+  GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; technically ONNX Run() is thread-safe at inference time since it is read-only
   if (GetProcessingSettings().nn.applyNNclusterizer) {
-    uint32_t maxClusters = -1;
+    uint32_t maxClusters = 0;
+    nnApplication.init(nn_settings);
     for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
       maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters);
     }
     for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) {
       GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
-      const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
       clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression;
       clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow;
       clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad;
@@ -639,8 +641,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       } else {
         clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity;
       }
-      clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos;
-      GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+      clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos;
+      nnApplication.initClusterizer(nn_settings, clustererNN);
       AllocateRegisteredMemory(clustererNN.mMemoryId);
     }
   }
@@ -916,7 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 #ifdef GPUCA_HAS_ONNX
         GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector];
         const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn;
-        GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN);
+        int withMC = (doGPU && propagateMCLabels);

         if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) {
           runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}});
@@ -929,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));

             auto start0 = std::chrono::high_resolution_clock::now();
-            runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data
+            runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data
             auto stop0 = std::chrono::high_resolution_clock::now();

             auto start1 = std::chrono::high_resolution_clock::now();
-            nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype);
+            nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType);
             if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) {
-              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
             } else {
-              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels
+              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels
             }

             if (!clustererNN.nnClusterizerUseCfRegression) {
-              nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype);
-              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1
-              if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) {
-                nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype);
-                runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2
+              nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType);
+              runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1
+              if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) {
+                nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType);
+                runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2
               }
             }
             auto stop1 = std::chrono::high_resolution_clock::now();
@@ -955,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           auto start1 = std::chrono::high_resolution_clock::now();
           if (clustererNN.nnClusterizerUseCfRegression) {
-            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
+            runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0
           }
           auto stop1 = std::chrono::high_resolution_clock::now();
           time_clusterizer += std::chrono::duration_cast<std::chrono::nanoseconds>(stop1 - start1).count() / 1e9;
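For reference, the batching used in the loop above can be summarized with a small standalone sketch (an assumption-labeled illustration, not code from the patch): clusters are processed in chunks of nnClusterizerBatchedMode, and the last chunk is truncated to what remains.

    #include <algorithm>
    #include <cstddef>

    // Size of the chunk starting at batchStart, mirroring the CAMath::Min expression above.
    size_t batchSize(size_t nClusters, size_t batchedMode, size_t batchStart)
    {
      return std::min(batchedMode, nClusters - batchStart);
    }
    // Example: nClusters = 10, batchedMode = 4 -> chunks of 4, 4 and 2 at batchStart = 0, 4, 8.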
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
index 6a9b6f546ae07..cc3f29434615f 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx
@@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {}

 void* GPUTPCNNClusterizer::setIOPointers(void* mem)
 {
-  if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) {
-    computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
-  } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) {
-    computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
-  }
-  computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
-  computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
-  if (nnClusterizerModelClassNumOutputNodes > 0) {
-    computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
-  }
-  if (!nnClusterizerUseCfRegression) {
-    if (nnClusterizerModelReg1NumOutputNodes > 0) {
-      computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
+  if (nnClusterizerBatchedMode > 0) {
+    if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize);
+    } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) {
+      computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize);
     }
-    if (nnClusterizerModelReg2NumOutputNodes > 0) {
-      computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
+    computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode);
+    computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode);
+    computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode);
+    if (nnClusterizerModelClassNumOutputNodes > 0) {
+      computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes);
     }
+    if (!nnClusterizerUseCfRegression) {
+      if (nnClusterizerModelReg1NumOutputNodes > 0) {
+        computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes);
+      }
+      if (nnClusterizerModelReg2NumOutputNodes > 0) {
+        computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes);
+      }
+    }
+  }
+  if (nnClusterizerTotalClusters > 0) {
+    computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters);
   }
   return mem;
 }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
index ea6340dfd48bc..0b9e3a6572684 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h
@@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int nnClusterizerSizeInputTime = 3;
   int nnClusterizerElementSize = -1;
   bool nnClusterizerAddIndexData = true;
-  float nnClassThreshold = 0.16;
+  float nnClassThreshold = 0.01;
   bool nnSigmoidTrafoClassThreshold = 1;
   int nnClusterizerUseCfRegression = 0;
   int nnClusterizerBatchedMode = 1;
@@ -54,11 +54,10 @@ class GPUTPCNNClusterizer : public GPUProcessor
   int nnClusterizerModelClassNumOutputNodes = -1;
   int nnClusterizerModelReg1NumOutputNodes = -1;
   int nnClusterizerModelReg2NumOutputNodes = -1;
-  int nnClusterizerDtype = 0; // 0: float16, 1: float32
+  int nnInferenceInputDType = 0; // 0: float16, 1: float32
   int mISector = -1;

   // Memory allocation for neural network
-  uint class2_elements = 0;
   float* inputData32 = nullptr;
   OrtDataType::Float16_t* inputData16 = nullptr;
   float* outputDataClass = nullptr;
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
index 5002c63524020..35db3f2107e7d 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx
@@ -12,21 +12,99 @@
 /// \file GPUTPCNNClusterizerHost.cxx
 /// \author Christian Sonnabend

+#include <CommonUtils/StringUtils.h>
+
 #include "GPUTPCNNClusterizerHost.h"
 #include "GPUTPCNNClusterizer.h"
+#include "CCDB/CcdbApi.h"
 #include "GPUSettings.h"
 #include "ML/3rdparty/GPUORTFloat16.h"

 using namespace o2::gpu;

-GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer)
+GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings)
+{
+  init(settings);
+}
+
+void GPUTPCNNClusterizerHost::loadFromCCDB(std::map<std::string, std::string> settings)
+{
+  o2::ccdb::CcdbApi ccdbApi;
+  ccdbApi.init(settings["nnCCDBURL"]);
+
+  metadata["inputDType"] = settings["inputDType"];
+  metadata["outputDType"] = settings["outputDType"];
+  metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"];         // classification_1C, classification_2C, regression_1C, regression_2C
+  metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model
+  metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"];       // FC, CNN
+  if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) {
+    metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"];
+  }
+  if (settings["nnCCDBBeamType"] != "") {
+    metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"];
+  }
+
+  bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]);
+  // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables
+
+  if (retrieveSuccess) {
+    LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"];
+  } else {
+    LOG(error) << "Failed to retrieve network from CCDB";
+  }
+}
+
+void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings)
 {
+  std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath;
+  std::vector<std::string> reg_model_paths;
+
+  if (settings.nnLoadFromCCDB) {
+    std::map<std::string, std::string> ccdbSettings = {
+      {"nnCCDBURL", settings.nnCCDBURL},
+      {"nnCCDBPath", settings.nnCCDBPath},
+      {"inputDType", settings.nnInferenceInputDType},
+      {"outputDType", settings.nnInferenceOutputDType},
+      {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)},
+      {"nnCCDBBeamType", settings.nnCCDBBeamType},
+      {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}};
+
+    std::string nnFetchFolder = "";
+    std::vector<std::string> fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':');
+    std::map<std::string, std::string> networkRetrieval = ccdbSettings;
+
+    if (fetchMode[0] == "c1") {
+      networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType;
+      networkRetrieval["nnCCDBEvalType"] = "classification_c1";
+      networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx";
+      loadFromCCDB(networkRetrieval);
+    } else if (fetchMode[0] == "c2") {
+      networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType;
+      networkRetrieval["nnCCDBEvalType"] = "classification_c2";
+      networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx";
+      loadFromCCDB(networkRetrieval);
+    }
+    class_model_path = networkRetrieval["outputFile"]; // Setting the local path from which the model will be initialized
+
+    networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType;
+    networkRetrieval["nnCCDBEvalType"] = "regression_c1";
+    networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx";
+    loadFromCCDB(networkRetrieval);
+    reg_model_path = networkRetrieval["outputFile"];
+    if (fetchMode[1] == "r2") {
+      networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType;
+      networkRetrieval["nnCCDBEvalType"] = "regression_c2";
+      networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx";
+      loadFromCCDB(networkRetrieval);
+      reg_model_path += ":" + networkRetrieval["outputFile"];
+    }
+  }
+
   OrtOptions = {
-    {"model-path", settings.nnClassificationPath},
+    {"model-path", class_model_path},
     {"device", settings.nnInferenceDevice},
     {"device-id", std::to_string(settings.nnInferenceDeviceId)},
     {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
-    {"dtype", settings.nnInferenceDtype},
     {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)},
     {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)},
     {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)},
@@ -35,31 +113,40 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl
     {"logging-level", std::to_string(settings.nnInferenceVerbosity)}};

   model_class.init(OrtOptions);
-  clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1];

-  reg_model_paths = splitString(settings.nnRegressionPath, ":");
+  reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':');

   if (!settings.nnClusterizerUseCfRegression) {
     if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) {
       OrtOptions["model-path"] = reg_model_paths[0];
       model_reg_1.init(OrtOptions);
-      clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];
     } else {
       OrtOptions["model-path"] = reg_model_paths[0];
       model_reg_1.init(OrtOptions);
-      clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];

       OrtOptions["model-path"] = reg_model_paths[1];
       model_reg_2.init(OrtOptions);
+    }
+  }
+}
+
+void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer)
+{
+  clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1];
+  if (!settings.nnClusterizerUseCfRegression) {
+    if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) {
+      clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];
+    } else {
+      clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1];
       clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1];
     }
   }
 }

-void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype)
+void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype)
 {
   if (dtype == 0) {
-    model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output);
+    model.inference(clustererNN.inputData16, size, output);
   } else {
-    model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output);
+    model.inference(clustererNN.inputData32, size, output);
   }
 }
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
index 7efa0edecb893..210d5f94dd503 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h
@@ -37,7 +37,11 @@ class GPUTPCNNClusterizerHost
 {
  public:
   GPUTPCNNClusterizerHost() = default;
-  GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
+  GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&);
+
+  void init(const GPUSettingsProcessingNNclusterizer&);
+  void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&);
+  void loadFromCCDB(std::map<std::string, std::string>);

   void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype);

@@ -46,21 +50,8 @@ class GPUTPCNNClusterizerHost
   std::vector<std::string> reg_model_paths;

  private:
-  // Avoid including CommonUtils/StringUtils.h
-  std::vector<std::string> splitString(const std::string& input, const std::string& delimiter)
-  {
-    std::vector<std::string> tokens;
-    std::size_t pos = 0;
-    std::size_t found;
-
-    while ((found = input.find(delimiter, pos)) != std::string::npos) {
-      tokens.push_back(input.substr(pos, found - pos));
-      pos = found + delimiter.length();
-    }
-    tokens.push_back(input.substr(pos));
-
-    return tokens;
-  }
+  std::map<std::string, std::string> metadata;
+  std::map<std::string, std::string> headers;
 }; // class GPUTPCNNClusterizerHost

 } // namespace o2::gpu
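The host-side refactor above splits model loading from per-sector configuration. A hedged usage sketch of the intended call order (mirroring GPUChainTrackingClusterizer.cxx; the o2::gpu qualification and the surrounding function are assumptions for illustration):

    #include "GPUTPCNNClusterizerHost.h"
    #include "GPUTPCNNClusterizer.h"
    #include "GPUSettings.h"

    void setupNNClusterizers(const o2::gpu::GPUSettingsProcessingNNclusterizer& settings,
                             o2::gpu::GPUTPCNNClusterizer* clusterers, int nSectors)
    {
      o2::gpu::GPUTPCNNClusterizerHost nnApplication; // one set of ONNX sessions, shared across sectors
      nnApplication.init(settings);                   // loads the models, optionally fetching them from CCDB
      for (int i = 0; i < nSectors; i++) {
        nnApplication.initClusterizer(settings, clusterers[i]); // propagates the model output-node counts
      }
    }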
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index 379ea27443fea..73051bd8477fd 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -35,7 +35,7 @@ using namespace o2::gpu::tpccf;

 // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer
 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
 {
   uint glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
@@ -45,91 +45,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread
   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAcc(clusterer));
-  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;

   o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new;
   GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow);
 }

 template <>
 GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
-{
-  GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart);
-}
-
-template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
-{
-  uint glo_idx = get_global_id(0);
-  processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold);
-}
-
-template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
-{
-  auto& clusterer = processors.tpcNNClusterer[sector];
-  uint glo_idx = get_global_id(0);
-  uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes;
-  float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
-  uint class_label = 0;
-  for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) {
-    if (pIdx == elem_iterator) {
-      current_max_prob = clusterer.modelProbabilities[pIdx];
-    } else {
-      class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label);
-    }
-  }
-  // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
-  clusterer.outputDataClass[glo_idx + batchStart] = class_label;
-}
-
-template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
-{
-  uint glo_idx = get_global_id(0);
-  if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
-    return;
-  }
-  GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
-}
-
-template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
-{
-  uint glo_idx = get_global_id(0);
-  if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) {
-    return;
-  }
-  GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart);
-}
-
-// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary
-GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current)
-{
-  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
-}
-
-GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift)
-{
-  return (row > 62 ? global_shift : 0);
-}
-
-GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
-{
-  if (pad < 0 || row < 0) { // Faster short-circuit
-    return true;
-  } else if (row < 63) {
-    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
-  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
-    return true;
-  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
-    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
-  } else {
-    return true;
-  }
-}
-
-// Filling the input data for the neural network where there is no boundary
-GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart)
 {
   uint glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
@@ -145,7 +67,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n
   clustererNN.peakPositions[glo_idx] = peak;
   clustererNN.centralCharges[glo_idx] = central_charge;
-  clustererNN.outputDataClass[glo_idx + batchStart] = -1;
+  clustererNN.outputDataClass[glo_idx + batchStart] = -1.f;

   int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow);
 #ifndef GPUCA_GPUCODE
@@ -193,25 +115,58 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n
   }
 }

-GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+template <>
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  uint glo_idx = get_global_id(0);
+  processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold);
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+{
+  auto& clustererNN = processors.tpcNNClusterer[sector];
+  uint glo_idx = get_global_id(0);
+  uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes;
+  float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty]
+  uint class_label = 0;
+  for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) {
+    if (pIdx == elem_iterator) {
+      current_max_prob = clustererNN.modelProbabilities[pIdx];
+    } else {
+      class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label);
+    }
+  }
+  // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins"
+  clustererNN.outputDataClass[glo_idx + batchStart] = class_label;
+  if (class_label > 1) {
+    clustererNN.clusterFlags[2 * glo_idx] = 1;
+    clustererNN.clusterFlags[2 * glo_idx + 1] = 1;
+  }
+}
+
+template <>
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
 {
+  uint glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
-  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;

   uint full_glo_idx = glo_idx + batchStart;
   int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes;

   // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size();

-  if (clustererNN.outputDataClass[full_glo_idx] == 1) {
+  if (clustererNN.outputDataClass[full_glo_idx] == 1 || (clustererNN.nnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.outputDataClass[full_glo_idx] >= 1)) {
     ClusterAccumulator pc;
     // Publishing logic is taken from default clusterizer
-    if (onlyMC) {
+    if (withMC) {
       ClusterAccumulator dummy_pc;
       CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
       GPUTPCCFClusterizer::buildCluster(
@@ -224,7 +179,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha
         &dummy_pc, labelAcc);
     }
-
     if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
       if (clusterer.mPclusterPosInRow) {
         clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -273,24 +227,25 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha
   }
 }

-GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart)
+template <>
+GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart)
 {
+  uint glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
   auto& clustererNN = processors.tpcNNClusterer[sector];
+  Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
   CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer));
   MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem);
-  tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow;
+  tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow;

   uint full_glo_idx = glo_idx + batchStart;
   int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes;
-  // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size();
-
   if (clustererNN.outputDataClass[full_glo_idx] > 0) {
     ClusterAccumulator pc;
-    if (onlyMC) {
+    if (withMC) {
       ClusterAccumulator dummy_pc;
       CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack()));
       GPUTPCCFClusterizer::buildCluster(
@@ -303,7 +258,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha
         &dummy_pc, labelAcc);
     }
-
     if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) {
       if (clusterer.mPclusterPosInRow) {
         clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -385,3 +339,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha
     return;
   }
 }
+
+// The following arithmetic is done because the network is trained with a split between IROC and OROC boundary
+GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current)
+{
+  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+}
+
+GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift)
+{
+  return (row > 62 ? global_shift : 0);
+}
+
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
+{
+  if (pad < 0 || row < 0) { // Faster short-circuit
+    return true;
+  } else if (row < 63) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
+  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
+    return true;
+  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
+  } else {
+    return true;
+  }
+}
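One observation on the multi-class label kernel above (not part of the patch): the loop keeps current_max_prob fixed at the first class probability and stores the flat index pIdx rather than the class index, so for more than two classes it does not compute a plain argmax. If an argmax over one cluster's probabilities is what is intended, a hedged standalone sketch could look like this (names are illustrative assumptions):

    // probs points to nClasses contiguous scores of a single cluster.
    int argmaxClass(const float* probs, int nClasses)
    {
      int label = 0;
      float best = probs[0];
      for (int i = 1; i < nClasses; i++) {
        if (probs[i] > best) { // track the running maximum, not only the first class
          best = probs[i];
          label = i;
        }
      }
      return label;
    }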