diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 540fe8ebf271c..2db91fc4f4320 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -9,21 +9,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. -# Pass ORT variables as a preprocessor definition -if(ORT_ROCM_BUILD) - add_compile_definitions(ORT_ROCM_BUILD=1) -endif() -if(ORT_CUDA_BUILD) - add_compile_definitions(ORT_CUDA_BUILD=1) -endif() -if(ORT_MIGRAPHX_BUILD) - add_compile_definitions(ORT_MIGRAPHX_BUILD=1) -endif() -if(ORT_TENSORRT_BUILD) - add_compile_definitions(ORT_TENSORRT_BUILD=1) -endif() - o2_add_library(ML SOURCES src/OrtInterface.cxx TARGETVARNAME targetName PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) + +# Pass ORT variables as a preprocessor definition +target_compile_definitions(${targetName} PRIVATE + $<$:ORT_ROCM_BUILD> + $<$:ORT_CUDA_BUILD> + $<$:ORT_MIGRAPHX_BUILD> + $<$:ORT_TENSORRT_BUILD>) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 76fd6734cf9db..9516ba5dad573 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType } // namespace o2 -#endif \ No newline at end of file +#endif diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..e37b6a69b6036 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,6 +26,13 @@ // O2 includes #include "Framework/Logger.h" +namespace Ort +{ +struct SessionOptions; +struct MemoryInfo; +struct Env; +} // namespace Ort + namespace o2 { @@ -36,14 +43,52 @@ class OrtModel { public: - // Constructor + // Constructors & destructors OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } - void init(std::unordered_map optionsMap) { reset(optionsMap); } - void reset(std::unordered_map); + OrtModel(std::unordered_map optionsMap) { init(optionsMap); } + void init(std::unordered_map optionsMap) + { + initOptions(optionsMap); + initEnvironment(); + } + virtual ~OrtModel() = default; + + // General purpose + void initOptions(std::unordered_map optionsMap); + void initEnvironment(); + void initSession(); + void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } + void resetSession(); - virtual ~OrtModel() = default; + // Getters + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + Ort::SessionOptions* getSessionOptions(); + Ort::MemoryInfo* getMemoryInfo(); + Ort::Env* getEnv(); + int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } + int32_t getInterOpNumThreads() const { return interOpNumThreads; } + + // Setters + void setDeviceId(int32_t id) { deviceId = id; } + void setIO(); + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setIntraOpNumThreads(int threads) + { + if (deviceType == "CPU") { + intraOpNumThreads = threads; + } + } + void setInterOpNumThreads(int threads) + { + if (deviceType == "CPU") { + interOpNumThreads = threads; + } + } + void setEnv(Ort::Env*); // Conversion template @@ -53,41 +98,36 @@ class OrtModel template // class I is 
the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + template std::vector inference(std::vector>&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - void inference(I*, size_t, O*); - - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); - - // Reset session - void resetSession(); + template + void inference(I*, int64_t, O*); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + template + void inference(I**, int64_t, O*); - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void release(bool = false); private: - // ORT variables -> need to be hidden as Pimpl + // ORT variables -> need to be hidden as pImpl struct OrtVariables; OrtVariables* pImplOrt; // Input & Output specifications of the loaded network std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; + std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); + std::string printShape(const std::vector>&, std::vector&); }; } // namespace ml diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 88f548bd4fe7b..24a2fbffb252c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -33,11 +33,12 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::SessionOptions sessionOptions; Ort::AllocatorWithDefaultOptions allocator; Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + std::unique_ptr ioBinding = nullptr; }; -void OrtModel::reset(std::unordered_map optionsMap) +// General purpose +void OrtModel::initOptions(std::unordered_map optionsMap) { - pImplOrt = new OrtVariables(); // Load from options map @@ -47,77 +48,60 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? 
optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + deviceType = (optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; -#if defined(ORT_ROCM_BUILD) - if (device == "ROCM") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) ROCM execution provider set"; - } -#endif -#if defined(ORT_MIGRAPHX_BUILD) - if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) MIGraphX execution provider set"; - } -#endif -#if defined(ORT_CUDA_BUILD) - if (device == "CUDA") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) CUDA execution provider set"; - dev_mem_str = "Cuda"; + envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); + + if (deviceType == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } -#endif - if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; - } + // OrtROCMProviderOptions rocm_options{}; + // (pImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); - if (intraOpNumThreads > 1 || interOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; - } - } - - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } - } else { - (pImplOrt->sessionOptions).DisableProfiling(); - } - mInitialized = true; + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + mInitialized = true; + } else { + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } +} +void OrtModel::initEnvironment() +{ pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + (envName.empty() ? 
"ORT" : envName.c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { if (severity == ORT_LOGGING_LEVEL_VERBOSE) { @@ -136,31 +120,48 @@ void OrtModel::reset(std::unordered_map optionsMap) }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); +} - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); +void OrtModel::initSession() +{ + if (allocateDeviceMemory) { + memoryOnDevice(deviceId); } + pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); + + setIO(); - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - } if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + LOG(info) << "(ORT) Model loaded successfully! 
(inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; + } +} + +void OrtModel::memoryOnDevice(int32_t deviceIndex) +{ +#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD)) + if (deviceIndex >= 0) { + (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); + (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + // Arena memory shrinkage comes at performance cost + /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + + std::string dev_mem_str = ""; + if (deviceType == "ROCM") { + dev_mem_str = "Hip"; + } + if (deviceType == "CUDA") { + dev_mem_str = "Cuda"; + } + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; + } } +#endif } void OrtModel::resetSession() @@ -168,6 +169,22 @@ void OrtModel::resetSession() pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } +// Getters +Ort::SessionOptions* OrtModel::getSessionOptions() +{ + return &pImplOrt->sessionOptions; +} + +Ort::MemoryInfo* OrtModel::getMemoryInfo() +{ + return &pImplOrt->memoryInfo; +} + +Ort::Env* OrtModel::getEnv() +{ + return (pImplOrt->env).get(); +} + template std::vector OrtModel::v2v(std::vector& input, bool clearInput) { @@ -183,20 +200,70 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -std::string OrtModel::printShape(const std::vector& v) +void OrtModel::setIO() { - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) { - ss << v[i] << "x"; + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } - ss << v[v.size() - 1]; - return ss.str(); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < 
(pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + inputShapesCopy = mInputShapes; + outputShapesCopy = mOutputShapes; + inputSizePerNode.resize(mInputShapes.size(), 1); + outputSizePerNode.resize(mOutputShapes.size(), 1); + mInputsTotal = 1; + for (size_t i = 0; i < mInputShapes.size(); ++i) { + if (mInputShapes[i].size() > 0) { + for (size_t j = 1; j < mInputShapes[i].size(); ++j) { + if (mInputShapes[i][j] > 0) { + mInputsTotal *= mInputShapes[i][j]; + inputSizePerNode[i] *= mInputShapes[i][j]; + } + } + } + } + mOutputsTotal = 1; + for (size_t i = 0; i < mOutputShapes.size(); ++i) { + if (mOutputShapes[i].size() > 0) { + for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { + if (mOutputShapes[i][j] > 0) { + mOutputsTotal *= mOutputShapes[i][j]; + outputSizePerNode[i] *= mOutputShapes[i][j]; + } + } + } + } +} + +void OrtModel::setEnv(Ort::Env* env) +{ + pImplOrt->env = std::shared_ptr(env); } +// Inference template std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape = mInputShapes[0]; + inputShape[0] = input.size(); + for (size_t i = 1; i < mInputShapes[0].size(); ++i) { + inputShape[0] /= mInputShapes[0][i]; + } std::vector inputTensor; if constexpr (std::is_same_v) { inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -212,51 +279,182 @@ std::vector OrtModel::inference(std::vector& input) } template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output) +void OrtModel::inference(I* input, int64_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + // std::vector providers = Ort::GetAvailableProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Available Execution Provider: " << provider; + // } + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), 
inputShape.size()); } + (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } else { + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } + (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); +template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float*, int64_t, float*); + +template +void OrtModel::inference(I** input, int64_t input_size, O* output) +{ + std::vector inputTensors(inputShapesCopy.size()); + + for (size_t i = 0; i < inputShapesCopy.size(); ++i) { + + inputShapesCopy[i][0] = input_size; // batch-size + outputShapesCopy[i][0] = input_size; // batch-size + + if constexpr (std::is_same_v) { + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); + } else { + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + input[i], + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); + } + } + + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(output), + outputSizePerNode[0] * input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } else { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + output, + outputSizePerNode[0] * input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } + + // === Run inference === + pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + inputTensors.data(), + inputNamesChar.size(), + outputNamesChar.data(), + &outputTensor, + outputNamesChar.size()); +} -template void OrtModel::inference(float*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); +template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, int64_t, float*); template -std::vector OrtModel::inference(std::vector>& input) +std::vector 
OrtModel::inference(std::vector>& inputs) { - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector input_tensors; + + for (size_t i = 0; i < inputs.size(); ++i) { + + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size + if constexpr (std::is_same_v) { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } else { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + + int32_t totalOutputSize = mOutputsTotal * inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); + + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; +} + +template std::vector OrtModel::inference(std::vector>&); +template std::vector OrtModel::inference(std::vector>&); + +// Release session +void OrtModel::release(bool profilingEnabled) +{ + // if (profilingEnabled) { + // pImplOrt->session->EndProfiling(); + // } + LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; +} + +// private +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) { + ss << v[i] << "x"; + } + ss << v[v.size() - 1]; + return ss.str(); +} + +std::string OrtModel::printShape(const std::vector>& v, std::vector& n) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size(); i++) { + ss << n[i] << " -> ("; + for (size_t j = 0; j < v[i].size() - 1; j++) { + ss << v[i][j] << "x"; + } + ss << v[i][v[i].size() - 1] << "); "; + } + return ss.str(); } } // namespace ml diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 0ec62e5f323b3..7722fc4e2884f 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,6 +25,7 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx + src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -82,6 +83,7 @@ o2_target_root_dictionary(TPCCalibration 
include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h + include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h new file mode 100644 index 0000000000000..196bba644714c --- /dev/null +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -0,0 +1,38 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.h +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h +#define AliceO2_TPC_NeuralNetworkClusterizer_h + +#include "CCDB/CcdbApi.h" + +namespace o2::tpc +{ + +class NeuralNetworkClusterizer +{ + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; +}; + +} // namespace o2::tpc +#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx new file mode 100644 index 0000000000000..bfbb7afc946f8 --- /dev/null +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -0,0 +1,48 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
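A hedged usage sketch of the new NeuralNetworkClusterizer class declared above (not part of the patch): the settings keys mirror the ones read in loadIndividualFromCCDB() below, the CCDB URL, path and beam/rate values reuse the defaults added to GPUSettingsList.h, the output file name is a placeholder, and the string-to-string map type is assumed from how the settings are used.

```cpp
#include "TPCCalibration/NeuralNetworkClusterizer.h"
#include <map>
#include <string>

// Fetch a classification network blob from CCDB into the local working directory.
void fetchClassificationNetwork()
{
  o2::tpc::NeuralNetworkClusterizer nnFetcher;
  nnFetcher.initCcdbApi("http://ccdb-test.cern.ch:8080"); // test CCDB instance (default nnCCDBURL)

  std::map<std::string, std::string> settings{
    {"nnCCDBPath", "Users/c/csonnabe/TPC/Clusterization"}, // CCDB folder holding the networks
    {"inputDType", "FP16"},
    {"outputDType", "FP32"},
    {"nnCCDBEvalType", "classification_1C"},               // see the comments in loadIndividualFromCCDB()
    {"nnCCDBWithMomentum", "0"},
    {"nnCCDBLayerType", "FC"},
    {"nnCCDBBeamType", "PbPb"},
    {"nnCCDBInteractionRate", "50"},
    {"outputFolder", "."},
    {"outputFile", "network_class.onnx"}};                 // placeholder local file name

  nnFetcher.loadIndividualFromCCDB(settings);              // retrieves the blob and logs success/failure
}
```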
+ +/// \file NeuralNetworkClusterizer.cxx +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#include +#include "TPCCalibration/NeuralNetworkClusterizer.h" + +using namespace o2::tpc; + +void NeuralNetworkClusterizer::initCcdbApi(std::string url) +{ + ccdbApi.init(url); +} + +void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) +{ + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 163b00c804d7f..1174fcd8a38d7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,6 +24,11 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { @@ -108,6 +113,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels #include +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 3655eaf66055e..f595fb051db54 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -114,6 +114,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES O2::GPUTracking O2::ITStrackingCUDA + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src @@ -121,6 +122,11 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + $<$:ORT_CUDA_BUILD> + $<$:ORT_TENSORRT_BUILD>) + install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 3bea91994ba86..741f160158b43 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -37,6 +37,10 @@ #undef GPUCA_KRNL #endif +#ifdef GPUCA_HAS_ONNX +#include +#endif + static constexpr size_t REQUIRE_MIN_MEMORY = 1024L * 1024 * 1024; static constexpr size_t REQUIRE_MEMORY_RESERVED = 512L * 1024 * 1024; static constexpr size_t REQUIRE_FREE_MEMORY_RESERVED_PER_SM = 40L * 1024 * 1024; 
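A hedged sketch (not part of the patch) of the call order around the SetONNXGPUStream() hook implemented in the next hunk: the backend appends its CUDA/ROCm execution provider with a user compute stream to the model's session options before the session is created. In the patch this sequence lives inside GPUChainTracking::RunTPCClusterizer(), where SetONNXGPUStream() is a chain member; the free helper and its template parameter are illustrative only, and access specifiers and error handling are omitted.

```cpp
#include "ML/OrtInterface.h" // o2::ml::OrtModel (also forward-declares Ort::SessionOptions)
#include <cstdint>

// Illustrative helper: attach the reconstruction stream of one clusterer lane to an
// OrtModel before its session is created, so ONNX Runtime reuses that stream instead
// of spawning its own.
template <typename Chain>
void initModelOnGpuStream(Chain& chain, o2::ml::OrtModel& model, int32_t lane)
{
  int32_t deviceId = -1;
  chain.SetONNXGPUStream(*model.getSessionOptions(), lane, &deviceId); // appends CUDA/ROCm EP with user_compute_stream
  model.setDeviceId(deviceId); // remembered for on-device Ort::MemoryInfo allocation
  model.initEnvironment();     // ORT environment with Fairlogger-backed logging
  model.initSession();         // creates the session and the Ort::IoBinding
}
```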
@@ -648,6 +652,28 @@ void GPUReconstructionCUDA::endGPUProfiling() { GPUChkErr(cudaProfilerStop()); } + +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) +{ +#ifdef ORT_CUDA_BUILD + cudaGetDevice(deviceId); + OrtCUDAProviderOptionsV2* cuda_options = nullptr; + CreateCUDAProviderOptions(&cuda_options); + + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; + // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; + // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); + + // this implicitly sets "has_user_compute_stream" + cuda_options.has_user_compute_stream = 1; + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); + + // Finally, don't forget to release the provider options + ReleaseCUDAProviderOptions(cuda_options); +#endif // ORT_CUDA_BUILD +} + #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) { @@ -655,6 +681,22 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) GPUChkErr(hipHostGetDevicePointer(&retVal, ptr, 0)); return retVal; } + +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) +{ +#ifdef ORT_ROCM_BUILD + // Create ROCm provider options + cudaGetDevice(deviceId); + // const auto& api = Ort::GetApi(); + // api.GetCurrentGpuDeviceId(deviceId); + OrtROCMProviderOptions rocm_options; + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit + rocm_options.user_compute_stream = mInternals->Streams[stream]; + session_options.AppendExecutionProvider_ROCM(rocm_options); +#endif // ORT_ROCM_BUILD +} #endif // __HIPCC__ namespace o2::gpu diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index b1a3a53a6a62f..6bac1fefb2346 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,6 +25,11 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif +namespace Ort +{ +struct SessionOptions; +} + namespace o2::gpu { struct GPUReconstructionCUDAInternals; @@ -74,6 +79,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 3a03a054d4a7e..d7adb222d547b 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -162,6 +162,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES O2::GPUTracking O2::ITStrackingHIP + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src @@ -169,6 +170,11 @@ if(ALIGPU_BUILD_TYPE 
STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + $<$:ORT_ROCM_BUILD> + $<$:ORT_MIGRAPHX_BUILD>) + install(FILES ${HDRS} DESTINATION include/GPU) # o2_add_test(GPUsortHIP NAME test_GPUsortHIP diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index eaeec508ff27a..e82799b9e59c3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -341,6 +341,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::DetectorsRaw O2::Steer O2::ML + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS} SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H}) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index b212abbcd2707..a84fce453a34a 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -493,6 +493,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 34fac6514851c..64e41e94b3335 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -247,7 +247,8 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. 
Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -268,6 +269,17 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 290ae32cafca8..59712c30a62dd 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,6 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } + inline void SetONNXGPUStream(Ort::SessionOptions& opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 4047dcae0a6b3..7db0ba66305e9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -611,49 +611,89 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } + if (doGPU && mIOPtrs.tpcZS) { + processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + } + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } + #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; + if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); - } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + int32_t deviceId = -1; + int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + int32_t maxThreads = mRec->getNKernelHostThreads(true); + // bool recreateMemoryAllocator = false; + mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { + nnApplications[lane].init(nn_settings); + if (nnApplications[lane].modelsUsed[0]) { + SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_class).setDeviceId(deviceId); + if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); + } + (nnApplications[lane].model_class).initEnvironment(); + // Registering this once seems to be enough, even with different environmnents / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation. + // But environment must be valid, so we init the model environment first and use it here afterwards. + // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator. + // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); + // recreateMemoryAllocator = true; + (nnApplications[lane].model_class).initSession(); } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + if (nnApplications[lane].modelsUsed[1]) { + SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); + } + // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv()); + (nnApplications[lane].model_reg_1).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].model_reg_1).initSession(); + } + if (nnApplications[lane].modelsUsed[2]) { + SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); + } + (nnApplications[lane].model_reg_2).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), 
(nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].model_reg_2).initSession(); + } + if (nn_settings.nnClusterizerVerbosity < 3) { + LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; + } + }); + mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; + int32_t lane = sector % numLanes; + clustererNN.deviceId = deviceId; + clustererNN.mISector = sector; + clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNN); + if (doGPU) { + clustererNNShadow.deviceId = deviceId; + clustererNNShadow.mISector = sector; + clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); + }); + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } + LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; + LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif - if (doGPU && mIOPtrs.tpcZS) { - processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); - } - if (doGPU) { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); - } - size_t nClsTotal = 0; ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); ClusterNative* tmpNativeClusters = nullptr; @@ -914,58 +954,122 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; + + int withMC = (doGPU && propagateMCLabels); - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - float time_clusterizer = 0, time_fill = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + float time_clusterizer = 0, time_fill = 0, time_networks = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + // auto stop0 = std::chrono::high_resolution_clock::now(); + // auto start1 = std::chrono::high_resolution_clock::now(); + + // NN evaluations + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); + } + } + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32); + } + } else if 
(clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32); + } + } + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); + } + } + } + } + + // auto stopNNs = std::chrono::high_resolution_clock::now(); + + // Publishing kernels if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } - - if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - time_fill += std::chrono::duration_cast(stop0 - 
start0).count() / 1e9; + // for(int i = 0; i < iSize; ++i) { + // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) { + // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; + // // std::string input = "["; + // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){ + // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", "; + // // } + // // input += "]"; + // // LOG(info) << "Input is: " << input; + // } + // } + + // auto stop1 = std::chrono::high_resolution_clock::now(); + + // time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clustererNN.nnClusterizerVerbosity < 3) { - int acceptedClusters = 0; - for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNN.outputDataClass[i]; - } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } + // if (clustererNNShadow.nnClusterizerVerbosity < 3) { + // int acceptedClusters = 0; + // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) { + // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters; + // } + // acceptedClusters += clustererNNShadow.outputDataClass[i]; + // } + // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // } #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif @@ -1066,6 +1170,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { + // if (GetProcessingSettings().nn.applyNNclusterizer) { + // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..fe3202fe7b439 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,29 +24,73 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + + if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_16, 
nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } } } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); + } return mem; } +// std::vector GPUTPCNNClusterizer::pointerSizes() { +// std::vector sizes(7, -1); +// if (nnClusterizerBatchedMode > 0) { +// if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { +// sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 +// } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { +// sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 +// } +// sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags +// if (nnClusterizerModelClassNumOutputNodes > 0) { +// sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities +// } +// if (!nnClusterizerUseCfRegression) { +// if (nnClusterizerModelReg1NumOutputNodes > 0) { +// sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 +// } +// if (nnClusterizerModelReg2NumOutputNodes > 0) { +// sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 +// } +// } +// } +// if (nnClusterizerTotalClusters > 0) { +// sizes[6] = nnClusterizerTotalClusters; // outputDataClass +// } +// return sizes; +// } + void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..da490b0f94d58 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,33 +42,38 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; int nnClusterizerTotalClusters = 1; int nnClusterizerVerbosity = 0; int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceOutputDType = 0; // 0: float16, 1: float32 int mISector = -1; + int deviceId = -1; // Memory allocation for neural network - uint class2_elements = 0; - float* inputData32 = nullptr; - OrtDataType::Float16_t* inputData16 = nullptr; - float* outputDataClass = nullptr; - float* modelProbabilities = nullptr; - float* outputDataReg1 = 
nullptr; - float* outputDataReg2 = nullptr; - ChargePos* peakPositions = nullptr; - bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int* outputDataClass = nullptr; + + // FP32 + float* inputData_32 = nullptr; + float* modelProbabilities_32 = nullptr; + float* outputDataReg1_32 = nullptr; + float* outputDataReg2_32 = nullptr; + + // FP16 + OrtDataType::Float16_t* inputData_16 = nullptr; + OrtDataType::Float16_t* modelProbabilities_16 = nullptr; + OrtDataType::Float16_t* outputDataReg1_16 = nullptr; + OrtDataType::Float16_t* outputDataReg2_16 = nullptr; + int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..db2f05711f537 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,54 +12,205 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUReconstruction.h" + +#ifdef GPUCA_HAS_ONNX +#include +#endif using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); + + if (settings.nnLoadFromCCDB) { + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + if (evalMode[0] == "c1") { + class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; + } else if (evalMode[0] == "c2") { + class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx"; + } + + if (evalMode[1] == "r2") { + reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx"; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, - {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"model-path", class_model_path}, + {"device-type", settings.nnInferenceDevice}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, + {"onnx-environment-name", "c1"}}; - model_class.init(OrtOptions); - 
clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + model_class.initOptions(OrtOptions); + modelsUsed[0] = true; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r1"; + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r1"; + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; - model_reg_2.init(OrtOptions); - clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + OrtOptions["onnx-environment-name"] = "r2"; + model_reg_2.initOptions(OrtOptions); + modelsUsed[2] = true; } } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) { - if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + clustererNN.nnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; + clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); + } else { + clustererNN.nnClassThreshold = settings.nnClassThreshold; + } + if (settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; + } + clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; + clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +// MockedOrtAllocator implementation to be able to use volatile assignment +struct MockedOrtAllocator : OrtAllocator { + MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr); + ~MockedOrtAllocator(); + + void* Alloc(size_t size); + void Free(void* p); + const OrtMemoryInfo* Info() const; + void* Reserve(size_t size); + size_t NumAllocations() const; + size_t NumReserveAllocations() const; + + void LeakCheck(); + + private: + MockedOrtAllocator(const MockedOrtAllocator&) = delete; + MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; + + std::atomic memory_inuse{0}; + std::atomic num_allocations{0}; + std::atomic num_reserve_allocations{0}; + OrtMemoryInfo* memory_info; + GPUReconstruction* rec; +}; + +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) +{ + OrtAllocator::version = ORT_API_VERSION; + OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; + OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; + OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; + OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Reserve(size); }; + rec = r; + memory_info = info; +} + +MockedOrtAllocator::~MockedOrtAllocator() +{ + // Ort::GetApi().ReleaseMemoryInfo(memory_info); +} + +void* MockedOrtAllocator::Alloc(size_t size) +{ + // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes"; + return rec->AllocateVolatileDeviceMemory(size); +} + +void* MockedOrtAllocator::Reserve(size_t size) +{ + // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes"; + return rec->AllocateVolatileDeviceMemory(size); +} + +void MockedOrtAllocator::Free(void* p) +{ + // LOG(info) << "(ORT) Freeing volatile memory " << p; + rec->ReturnVolatileDeviceMemory(); +} + +const OrtMemoryInfo* MockedOrtAllocator::Info() const +{ + return memory_info; +} + +size_t 
MockedOrtAllocator::NumAllocations() const +{ + return num_allocations.load(); +} + +size_t MockedOrtAllocator::NumReserveAllocations() const +{ + return num_reserve_allocations.load(); +} + +void MockedOrtAllocator::LeakCheck() +{ + if (memory_inuse.load()) + LOG(warning) << "memory leak!!!"; +} + +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) +{ + mockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); + if (recreate) { + Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo))); } + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get())); + memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); +} + +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() +{ + return mockedAlloc->Info(); +} + +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() +{ + return mockedAlloc.get(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..0379b83d0ae02 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,6 +22,15 @@ using namespace o2::ml; +class OrtMemoryInfo; +class OrtAllocator; +struct MockedOrtAllocator; +namespace Ort +{ +struct Env; +struct MemoryInfo; +} // namespace Ort + namespace o2::OrtDataType { struct Float16_t; @@ -30,6 +39,7 @@ struct Float16_t; namespace o2::gpu { +class GPUReconstruction; class GPUTPCNNClusterizer; struct GPUSettingsProcessingNNclusterizer; @@ -37,30 +47,23 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } + + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + // ONNX + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); + MockedOrtAllocator* getMockedAllocator(); + const OrtMemoryInfo* getMockedMemoryInfo(); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + int32_t deviceId = -1; std::vector reg_model_paths; - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } + std::shared_ptr mockedAlloc = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 379ea27443fea..2cf9ab2037007 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -35,7 +35,7 @@ using 
namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -45,109 +45,26 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current) -{ - return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; - Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - - clustererNN.peakPositions[glo_idx] = peak; - clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + #ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); #endif @@ -160,20 +77,20 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n if (!is_boundary) { ChargePos tmp_pos(row + r, pad + p, time + t); if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; } if (dtype == 0) { - clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); - } else { - clustererNN.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } write_idx++; @@ -182,66 +99,191 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } if (clustererNN.nnClusterizerAddIndexData) { if (dtype == 0) { - clustererNN.inputData16[write_idx] = 
(OrtDataType::Float16_t)(clusterer.mISector / 36.f); - clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + } else { + clustererNN.inputData_32[write_idx] = sector / 36.f; + clustererNN.inputData_32[write_idx + 1] = row / 152.f; + clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint base_idx = CAMath::Floor(glo_idx / clustererNN.nnClusterizerElementSize); + uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize; + + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()); + + if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { + uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + ChargePos tmp_pos = peak.delta(d); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; - clustererNN.inputData32[write_idx + 1] = row / 152.f; - clustererNN.inputData32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.inputData_32[top_idx - 3] = sector / 36.f; + clustererNN.inputData_32[top_idx - 2] = row / 152.f; + clustererNN.inputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) { + int time = static_cast(peak.time()); + int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + if (is_row_boundary) { + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } else 
{ + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1)); + int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset; + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); + + if (!is_boundary) { + float central_charge = static_cast(chargeMap[peak].unpack()); + int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; + ChargePos tmp_pos(row + r, pad + p, time + t); + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + if (dtype == 0) { + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } } } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (dtype == 0) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].modelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].nnClassThreshold); + } else if (dtype == 1) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + if (dtype == 0) { + current_max_prob = static_cast(clustererNN.modelProbabilities_16[pIdx]); + } else if (dtype == 1) { + current_max_prob = clustererNN.modelProbabilities_32[pIdx]; + } + } else { + if (dtype == 0) { + current_max_prob = CAMath::Max(current_max_prob, 
clustererNN.modelProbabilities_16[pIdx].ToFloat()); + } else if (dtype == 1) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_32[pIdx]); + } + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] == 1) { + if (clustererNN.outputDataClass[full_glo_idx] == 1 || (clustererNN.nnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.outputDataClass[full_glo_idx] >= 1)) { ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], - clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], - clustererNN.outputDataReg1[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg1_16[model_output_index + 4].ToFloat(), + 
static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -250,11 +292,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) { rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -264,7 +306,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -273,38 +315,41 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; - CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -312,16 +357,26 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } // Cluster 1 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], - clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], - clustererNN.outputDataReg2[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 8].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -330,11 +385,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } uint rowIndex = 0; - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) 
{ rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -344,18 +399,28 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], - clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], - clustererNN.outputDataReg2[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); - - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 9].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } + + rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -363,11 +428,11 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } - if (clusterer.mPclusterByRow != nullptr) { + if (clusterOut != nullptr) { rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( clusterer, myCluster, - clustererNN.peakPositions[glo_idx].row(), + peak.row(), clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut); @@ -377,7 +442,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha } else if (clusterer.mPclusterPosInRow) { rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; } - // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? 
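
For reference, the flat layout of the regression outputs consumed by the two publishing kernels above can be sketched on the host. The snippet below is an illustrative aside, not part of the patch: it uses plain float buffers and hypothetical helpers decodeReg1/decodeReg2, whereas the kernels read the same indices from outputDataReg1_16/_32 and outputDataReg2_16/_32 (with .ToFloat() in the FP16 case) and hand them to ClusterAccumulator::setFull(); the interpretation of the sigma entries is assumed from the setFull argument order and the time offset is simplified (the kernels additionally add fragment.start).

// Standalone sketch (not part of the patch): how the flat regression outputs are indexed.
// reg_1 emits 5 values per peak:  [dPad, dTime, sigmaPad, sigmaTime, qRatio]
// reg_2 emits 10 values per peak: [dPad1, dPad2, dTime1, dTime2, sigmaPad1, sigmaPad2,
//                                  sigmaTime1, sigmaTime2, qRatio1, qRatio2]
#include <cstdio>
#include <vector>

struct DecodedCluster {
  float pad, time, sigmaPad, sigmaTime, qtot;
};

// Hypothetical helper mirroring the index arithmetic of publishClass1Regression.
DecodedCluster decodeReg1(const std::vector<float>& out, int idx, float peakPad, float peakTime, float centralCharge)
{
  const int o = idx * 5; // nnClusterizerModelReg1NumOutputNodes == 5 assumed here
  return {peakPad + out[o + 0], peakTime + out[o + 1], out[o + 2], out[o + 3], centralCharge * out[o + 4]};
}

// Hypothetical helper mirroring publishClass2Regression; cluster = 0 or 1 selects the split cluster.
DecodedCluster decodeReg2(const std::vector<float>& out, int idx, int cluster, float peakPad, float peakTime, float centralCharge)
{
  const int o = idx * 10; // nnClusterizerModelReg2NumOutputNodes == 10 assumed here
  return {peakPad + out[o + 0 + cluster], peakTime + out[o + 2 + cluster],
          out[o + 4 + cluster], out[o + 6 + cluster], centralCharge * out[o + 8 + cluster]};
}

int main()
{
  std::vector<float> reg1 = {0.05f, -0.1f, 0.7f, 0.9f, 0.95f};
  std::vector<float> reg2 = {0.1f, -0.2f, 0.0f, 0.3f, 0.8f, 0.9f, 1.1f, 1.0f, 0.6f, 0.4f};
  DecodedCluster c0 = decodeReg1(reg1, 0, 40.f, 100.f, 50.f);
  DecodedCluster c1 = decodeReg2(reg2, 0, 0, 40.f, 100.f, 50.f);
  DecodedCluster c2 = decodeReg2(reg2, 0, 1, 40.f, 100.f, 50.f);
  std::printf("single cluster: pad %.2f time %.2f qtot %.1f\n", c0.pad, c0.time, c0.qtot);
  std::printf("split cluster 1: pad %.2f time %.2f qtot %.1f\n", c1.pad, c1.time, c1.qtot);
  std::printf("split cluster 2: pad %.2f time %.2f qtot %.1f\n", c2.pad, c2.time, c2.qtot);
  return 0;
}
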
  } else {
    if (clusterer.mPclusterPosInRow) {
      clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow;
@@ -385,3 +450,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha
    return;
  }
 }
+
+// The following arithmetic is done because the network is trained with a split at the IROC-OROC boundary
+GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current)
+{
+  return (int)((GPUTPCGeometry::NPads(row_current) - GPUTPCGeometry::NPads(row_ref)) / 2);
+}
+
+GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift)
+{
+  return (row > 62 ? global_shift : 0);
+}
+
+GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift)
+{
+  if (pad < 0 || row < 0) { // Faster short-circuit
+    return true;
+  } else if (row < 63) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row)));
+  } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network
+    return true;
+  } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) {
+    return (pad >= static_cast<int>(GPUTPCGeometry::NPads(row - global_shift)));
+  } else {
+    return true;
+  }
+}
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index a1d641fdb0b93..27cfba2487aed 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -39,6 +39,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 public:
  // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+ static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
 static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
@@ -59,10 +60,11 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
  enum K : int32_t {
    runCfClusterizer = 0,
    fillInputNN = 1,
-   determineClass1Labels = 2,
-   determineClass2Labels = 3,
-   publishClass1Regression = 4,
-   publishClass2Regression = 5,
+   fillInputNNSingleElement = 2,
+   determineClass1Labels = 3,
+   determineClass2Labels = 4,
+   publishClass1Regression = 5,
+   publishClass2Regression = 6,
  };

 template
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index ee3af2b87d925..23ab0f0da5bac 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -116,6 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUS
 if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone")
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart)
o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index 0038233f1c376..73f1f208e8889 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -83,6 +83,7 @@ class GPUO2Interface; struct TPCPadGainCalib; struct TPCZSLinkMapping; struct GPUSettingsO2; +struct GPUSettingsProcessingNNclusterizer; class GPUO2InterfaceQA; struct GPUTrackingInOutPointers; struct GPUTrackingInOutZS; @@ -225,6 +226,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; + std::unique_ptr mNNClusterizerSettings; + Config mSpecConfig; std::shared_ptr mGGR; bool mGRPGeomUpdated = false; diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 7ad03ec58ae80..8a755a703705f 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,6 +78,7 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" +#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -132,6 +133,50 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; + + if (mNNClusterizerSettings.nnLoadFromCCDB) { + LOG(info) << "Loading neural networks from CCDB"; + o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); + std::map ccdbSettings = { + {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings.nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); + + if (evalMode[0] == "c1") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c1"; + ccdbSettings["outputFile"] = "net_classification_c1.onnx"; + 
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    } else if (evalMode[0] == "c2") {
+      ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType;
+      ccdbSettings["nnCCDBEvalType"] = "classification_c2";
+      ccdbSettings["outputFile"] = "net_classification_c2.onnx";
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    }
+
+    ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType;
+    ccdbSettings["nnCCDBEvalType"] = "regression_c1";
+    ccdbSettings["outputFile"] = "net_regression_c1.onnx";
+    nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    if (evalMode[1] == "r2") {
+      ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType;
+      ccdbSettings["nnCCDBEvalType"] = "regression_c2";
+      ccdbSettings["outputFile"] = "net_regression_c2.onnx";
+      nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings);
+    }
+    LOG(info) << "Neural network loading done!";
+  }

   // Create configuration object and fill settings
   mConfig->configGRP.solenoidBzNominalGPU = 0;
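For reference, the CCDB fetching above is keyed off the `nnEvalMode` option, a colon-separated pair such as `c1:r1` or `c2:r2`: the first token selects which classification network is downloaded, the single-cluster regression network is always fetched, and a second token of `r2` additionally pulls the two-cluster regression network. The sketch below mirrors that selection logic in plain, self-contained C++ so it can be checked in isolation; the local `tokenize` helper and the size guard on the parsed option are illustrative stand-ins (the workflow code uses `o2::utils::Str::tokenize` and indexes `evalMode[1]` directly), while the returned file names match the ones requested from CCDB above.

```cpp
// Illustrative sketch of the nnEvalMode -> model-file selection used above.
// tokenize() is a stand-in for o2::utils::Str::tokenize; the size check is an
// extra safety guard that the workflow code does not perform.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> tokenize(const std::string& s, char delim)
{
  std::vector<std::string> tokens;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, delim)) {
    tokens.push_back(item);
  }
  return tokens;
}

// Returns the ONNX files that would be fetched for a given eval-mode string.
static std::vector<std::string> modelsForEvalMode(const std::string& evalModeStr)
{
  std::vector<std::string> files;
  const std::vector<std::string> evalMode = tokenize(evalModeStr, ':');
  if (evalMode.size() != 2) {
    return files; // malformed option: nothing is fetched
  }
  if (evalMode[0] == "c1") {
    files.push_back("net_classification_c1.onnx");
  } else if (evalMode[0] == "c2") {
    files.push_back("net_classification_c2.onnx");
  }
  files.push_back("net_regression_c1.onnx"); // single-cluster regression, always fetched
  if (evalMode[1] == "r2") {
    files.push_back("net_regression_c2.onnx"); // two-cluster regression on request
  }
  return files;
}

int main()
{
  for (const auto& mode : {"c1:r1", "c2:r2"}) {
    std::cout << mode << " ->";
    for (const auto& f : modelsForEvalMode(mode)) {
      std::cout << ' ' << f;
    }
    std::cout << '\n';
  }
  return 0;
}
```

For the two example modes this prints `c1:r1 -> net_classification_c1.onnx net_regression_c1.onnx` and `c2:r2 -> net_classification_c2.onnx net_regression_c1.onnx net_regression_c2.onnx`.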