diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index ea70e28c0421c..b4f40f3f5c694 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -70,23 +70,23 @@ class OrtModel Ort::SessionOptions* getSessionOptions(); Ort::MemoryInfo* getMemoryInfo(); Ort::Env* getEnv(); - int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } - int32_t getInterOpNumThreads() const { return interOpNumThreads; } + int32_t getIntraOpNumThreads() const { return mIntraOpNumThreads; } + int32_t getInterOpNumThreads() const { return mInterOpNumThreads; } // Setters - void setDeviceId(int32_t id) { deviceId = id; } + void setDeviceId(int32_t id) { mDeviceId = id; } void setIO(); - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { mIntraOpNumThreads = threads; } void setIntraOpNumThreads(int threads) { - if (deviceType == "CPU") { - intraOpNumThreads = threads; + if (mDeviceType == "CPU") { + mIntraOpNumThreads = threads; } } void setInterOpNumThreads(int threads) { - if (deviceType == "CPU") { - interOpNumThreads = threads; + if (mDeviceType == "CPU") { + mInterOpNumThreads = threads; } } void setEnv(Ort::Env*); @@ -113,19 +113,19 @@ class OrtModel private: // ORT variables -> need to be hidden as pImpl struct OrtVariables; - OrtVariables* pImplOrt; + OrtVariables* mPImplOrt; // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; + std::vector mInputNamesChar, mOutputNamesChar; std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes - std::vector inputSizePerNode, outputSizePerNode; // Output shapes - int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs + std::vector> mInputShapes, mOutputShapes, mInputShapesCopy, mOutputShapesCopy; // Input shapes + std::vector mInputSizePerNode, mOutputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; - std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string mModelPath, mEnvName = "", mDeviceType = "CPU", mThreadAffinity = ""; // device options should be cpu, rocm, migraphx, cuda + int32_t mIntraOpNumThreads = 1, mInterOpNumThreads = 1, mDeviceId = -1, mEnableProfiling = 0, mLoggingLevel = 0, mAllocateDeviceMemory = 0, mEnableOptimizations = 0; std::string printShape(const std::vector&); std::string printShape(const std::vector>&, std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index a8a20b11f9e64..df7f0a2deba82 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -41,7 +41,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c // General purpose void OrtModel::initOptions(std::unordered_map optionsMap) { - pImplOrt = new OrtVariables(); + mPImplOrt = new OrtVariables(); // Load from options map if (!optionsMap.contains("model-path")) { @@ -49,49 +49,49 @@ void OrtModel::initOptions(std::unordered_map optionsM } if (!optionsMap["model-path"].empty()) { - modelPath = optionsMap["model-path"]; - deviceType = 
(optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); - allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); - enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); - enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - envName = (optionsMap.contains("onnx-environment-name") ? optionsMap["onnx-environment-name"] : "onnx_model_inference"); - - if (deviceType == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); - if (intraOpNumThreads > 1 || interOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + mModelPath = optionsMap["model-path"]; + mDeviceType = (optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); + mDeviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); + mAllocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + mIntraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + mInterOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); + mLoggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + mEnableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + mEnableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + mEnvName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); + + if (mDeviceType == "CPU") { + (mPImplOrt->sessionOptions).SetIntraOpNumThreads(mIntraOpNumThreads); + (mPImplOrt->sessionOptions).SetInterOpNumThreads(mInterOpNumThreads); + if (mIntraOpNumThreads > 1 || mInterOpNumThreads > 1) { + (mPImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (mIntraOpNumThreads == 1) { + (mPImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + if (mLoggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << mIntraOpNumThreads << " (mIntraOpNumThreads) and " << mInterOpNumThreads << " (mInterOpNumThreads) threads"; } } // OrtROCMProviderOptions rocm_options{}; - // (pImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); + // (mPImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + (mPImplOrt->sessionOptions).DisableMemPattern(); + (mPImplOrt->sessionOptions).DisableCpuMemArena(); - if (enableProfiling) { + if (mEnableProfiling) { if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + (mPImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; - (pImplOrt->sessionOptions).DisableProfiling(); + (mPImplOrt->sessionOptions).DisableProfiling(); } } else { - (pImplOrt->sessionOptions).DisableProfiling(); + (mPImplOrt->sessionOptions).DisableProfiling(); } - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + (mPImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(mEnableOptimizations)); + (mPImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(mLoggingLevel)); mInitialized = true; } else { @@ -101,9 +101,9 @@ void OrtModel::initOptions(std::unordered_map optionsM void OrtModel::initEnvironment() { - pImplOrt->env = std::make_shared( - OrtLoggingLevel(loggingLevel), - (envName.empty() ? "ORT" : envName.c_str()), + mPImplOrt->env = std::make_shared( + OrtLoggingLevel(mLoggingLevel), + (mEnvName.empty() ? 
"ORT" : mEnvName.c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { if (severity == ORT_LOGGING_LEVEL_VERBOSE) { @@ -121,20 +121,20 @@ void OrtModel::initEnvironment() } }, (void*)3); - (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events + (mPImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events } void OrtModel::initSession() { - if (allocateDeviceMemory) { - memoryOnDevice(deviceId); + if (mAllocateDeviceMemory) { + memoryOnDevice(mDeviceId); } - pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); - pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); + mPImplOrt->session = std::make_shared(*mPImplOrt->env, mModelPath.c_str(), mPImplOrt->sessionOptions); + mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); setIO(); - if (loggingLevel < 2) { + if (mLoggingLevel < 2) { LOG(info) << "(ORT) Model loaded successfully! (inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; } } @@ -142,47 +142,47 @@ void OrtModel::initSession() void OrtModel::memoryOnDevice(int32_t deviceIndex) { if (deviceIndex >= 0) { - (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); - (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h - (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time - (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (mPImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); + (mPImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + (mPImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (mPImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; - // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See 
kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + // (mPImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 std::string dev_mem_str = ""; - if (deviceType == "ROCM") { + if (mDeviceType == "ROCM") { dev_mem_str = "Hip"; } - if (deviceType == "CUDA") { + if (mDeviceType == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - if (loggingLevel < 2) { - LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; + mPImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + if (mLoggingLevel < 2) { + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << mDeviceType << " with ID " << deviceIndex << " and mPImplOrt pointer " << mPImplOrt; } } } void OrtModel::resetSession() { - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + mPImplOrt->session = std::make_shared(*(mPImplOrt->env), mModelPath.c_str(), mPImplOrt->sessionOptions); } // Getters Ort::SessionOptions* OrtModel::getSessionOptions() { - return &pImplOrt->sessionOptions; + return &mPImplOrt->sessionOptions; } Ort::MemoryInfo* OrtModel::getMemoryInfo() { - return &pImplOrt->memoryInfo; + return &mPImplOrt->memoryInfo; } Ort::Env* OrtModel::getEnv() { - return (pImplOrt->env).get(); + return (mPImplOrt->env).get(); } template @@ -202,37 +202,37 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) void OrtModel::setIO() { - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + for (size_t i = 0; i < (mPImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((mPImplOrt->session)->GetInputNameAllocated(i, mPImplOrt->allocator).get()); } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + for (size_t i = 0; i < (mPImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((mPImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + for (size_t i = 0; i < (mPImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((mPImplOrt->session)->GetOutputNameAllocated(i, mPImplOrt->allocator).get()); } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + for (size_t i = 0; i < (mPImplOrt->session)->GetOutputCount(); ++i) { + 
mOutputShapes.emplace_back((mPImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + mInputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(mInputNamesChar), [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + mOutputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(mOutputNamesChar), [&](const std::string& str) { return str.c_str(); }); - inputShapesCopy = mInputShapes; - outputShapesCopy = mOutputShapes; - inputSizePerNode.resize(mInputShapes.size(), 1); - outputSizePerNode.resize(mOutputShapes.size(), 1); + mInputShapesCopy = mInputShapes; + mOutputShapesCopy = mOutputShapes; + mInputSizePerNode.resize(mInputShapes.size(), 1); + mOutputSizePerNode.resize(mOutputShapes.size(), 1); mInputsTotal = 1; for (size_t i = 0; i < mInputShapes.size(); ++i) { if (mInputShapes[i].size() > 0) { for (size_t j = 1; j < mInputShapes[i].size(); ++j) { if (mInputShapes[i][j] > 0) { mInputsTotal *= mInputShapes[i][j]; - inputSizePerNode[i] *= mInputShapes[i][j]; + mInputSizePerNode[i] *= mInputShapes[i][j]; } } } @@ -243,7 +243,7 @@ void OrtModel::setIO() for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { if (mOutputShapes[i][j] > 0) { mOutputsTotal *= mOutputShapes[i][j]; - outputSizePerNode[i] *= mOutputShapes[i][j]; + mOutputSizePerNode[i] *= mOutputShapes[i][j]; } } } @@ -252,7 +252,7 @@ void OrtModel::setIO() void OrtModel::setEnv(Ort::Env* env) { - pImplOrt->env = std::shared_ptr(env); + mPImplOrt->env = std::shared_ptr(env); } // Inference @@ -266,12 +266,12 @@ std::vector OrtModel::inference(std::vector& input) } std::vector inputTensor; if constexpr (std::is_same_v) { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(mPImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); } else { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(mPImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); } // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + auto outputTensors = (mPImplOrt->session)->Run(mPImplOrt->runOptions, mInputNamesChar.data(), inputTensor.data(), inputTensor.size(), mOutputNamesChar.data(), mOutputNamesChar.size()); O* outputValues = outputTensors[0].template GetTensorMutableData(); std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); @@ -292,22 +292,22 @@ void OrtModel::inference(I* input, int64_t input_size, O* output) std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = 
Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(mPImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(mPImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } - (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); + (mPImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(mPImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } else { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(mPImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } - (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); + (mPImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); - (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); + (mPImplOrt->session)->Run(mPImplOrt->runOptions, *mPImplOrt->ioBinding); } template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); @@ -318,56 +318,56 @@ template void OrtModel::inference(float*, int64_t, float*); template void OrtModel::inference(I** input, int64_t input_size, O* output) { - std::vector inputTensors(inputShapesCopy.size()); + std::vector inputTensors(mInputShapesCopy.size()); - for (size_t i = 0; i < inputShapesCopy.size(); ++i) { + for (size_t i = 0; i < mInputShapesCopy.size(); ++i) { - inputShapesCopy[i][0] = input_size; // batch-size - outputShapesCopy[i][0] = input_size; // batch-size + mInputShapesCopy[i][0] = input_size; // batch-size + mOutputShapesCopy[i][0] = input_size; // batch-size if constexpr (std::is_same_v) { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, reinterpret_cast(input[i]), - inputSizePerNode[i] * input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + mInputSizePerNode[i] * input_size, + mInputShapesCopy[i].data(), + mInputShapesCopy[i].size()); } else { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, input[i], - inputSizePerNode[i] * input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + mInputSizePerNode[i] * input_size, + mInputShapesCopy[i].data(), + mInputShapesCopy[i].size()); } } Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { outputTensor = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, reinterpret_cast(output), - outputSizePerNode[0] * input_size, // assumes that there is only one output node - outputShapesCopy[0].data(), - outputShapesCopy[0].size()); + mOutputSizePerNode[0] * input_size, // assumes 
that there is only one output node + mOutputShapesCopy[0].data(), + mOutputShapesCopy[0].size()); } else { outputTensor = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, output, - outputSizePerNode[0] * input_size, // assumes that there is only one output node - outputShapesCopy[0].data(), - outputShapesCopy[0].size()); + mOutputSizePerNode[0] * input_size, // assumes that there is only one output node + mOutputShapesCopy[0].data(), + mOutputShapesCopy[0].size()); } // === Run inference === - pImplOrt->session->Run( - pImplOrt->runOptions, - inputNamesChar.data(), + mPImplOrt->session->Run( + mPImplOrt->runOptions, + mInputNamesChar.data(), inputTensors.data(), - inputNamesChar.size(), - outputNamesChar.data(), + mInputNamesChar.size(), + mOutputNamesChar.data(), &outputTensor, - outputNamesChar.size()); + mOutputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, OrtDataType::Float16_t*); @@ -382,37 +382,37 @@ std::vector OrtModel::inference(std::vector>& inputs) for (size_t i = 0; i < inputs.size(); ++i) { - inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size + mInputShapesCopy[i][0] = inputs[i].size() / mInputSizePerNode[i]; // batch-size if constexpr (std::is_same_v) { input_tensors.emplace_back( Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, reinterpret_cast(inputs[i].data()), - inputSizePerNode[i] * inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); + mInputSizePerNode[i] * mInputShapesCopy[i][0], + mInputShapesCopy[i].data(), + mInputShapesCopy[i].size())); } else { input_tensors.emplace_back( Ort::Value::CreateTensor( - pImplOrt->memoryInfo, + mPImplOrt->memoryInfo, inputs[i].data(), - inputSizePerNode[i] * inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); + mInputSizePerNode[i] * mInputShapesCopy[i][0], + mInputShapesCopy[i].data(), + mInputShapesCopy[i].size())); } } - int32_t totalOutputSize = mOutputsTotal * inputShapesCopy[0][0]; + int32_t totalOutputSize = mOutputsTotal * mInputShapesCopy[0][0]; // === Run inference === - auto output_tensors = pImplOrt->session->Run( - pImplOrt->runOptions, - inputNamesChar.data(), + auto output_tensors = mPImplOrt->session->Run( + mPImplOrt->runOptions, + mInputNamesChar.data(), input_tensors.data(), input_tensors.size(), - outputNamesChar.data(), - outputNamesChar.size()); + mOutputNamesChar.data(), + mOutputNamesChar.size()); // === Extract output values === O* output_data = output_tensors[0].template GetTensorMutableData(); @@ -428,9 +428,9 @@ template std::vector OrtModel::inferencesession->EndProfiling(); + // mPImplOrt->session->EndProfiling(); // } - LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; + LOG(info) << "(ORT) Size of mPImplOrt: " << sizeof(*mPImplOrt) << " bytes"; } // private diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 37c12b2a3b3f4..630c2200e5900 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -645,41 +645,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // bool recreateMemoryAllocator = false; mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); - if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, 
&deviceId); - (nnApplications[lane].model_class).setDeviceId(deviceId); - if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { - nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); + if (nnApplications[lane].mModelsUsed[0]) { + SetONNXGPUStream(*(nnApplications[lane].mModelClass).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].mModelClass).setDeviceId(deviceId); + if (nnApplications[lane].mModelClass.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].mModelClass.setIntraOpNumThreads(maxThreads); } - (nnApplications[lane].model_class).initEnvironment(); + (nnApplications[lane].mModelClass).initEnvironment(); // Registering this once seems to be enough, even with different environmnents / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation. // But environment must be valid, so we init the model environment first and use it here afterwards. // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator. // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); // recreateMemoryAllocator = true; - (nnApplications[lane].model_class).initSession(); + (nnApplications[lane].mModelClass).initSession(); } - if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); - (nnApplications[lane].model_reg_1).setDeviceId(deviceId); - if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { - nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); + if (nnApplications[lane].mModelsUsed[1]) { + SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].mModelReg1).setDeviceId(deviceId); + if (nnApplications[lane].mModelReg1.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].mModelReg1.setIntraOpNumThreads(maxThreads); } - // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv()); - (nnApplications[lane].model_reg_1).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].model_reg_1).initSession(); + // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); + (nnApplications[lane].mModelReg1).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].mModelReg1).initSession(); } - if (nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); - (nnApplications[lane].model_reg_2).setDeviceId(deviceId); - if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { - nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); + if (nnApplications[lane].mModelsUsed[2]) { + 
SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].mModelReg2).setDeviceId(deviceId); + if (nnApplications[lane].mModelReg2.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].mModelReg2.setIntraOpNumThreads(maxThreads); } - (nnApplications[lane].model_reg_2).initEnvironment(); - // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].model_reg_2).initSession(); + (nnApplications[lane].mModelReg2).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); + (nnApplications[lane].mModelReg2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; @@ -689,14 +689,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; int32_t lane = sector % numLanes; - clustererNN.deviceId = deviceId; + clustererNN.mDeviceId = deviceId; clustererNN.mISector = sector; - clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + clustererNN.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); if (doGPU) { - clustererNNShadow.deviceId = deviceId; + clustererNNShadow.mDeviceId = deviceId; clustererNNShadow.mISector = sector; - clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; + clustererNNShadow.mNnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); @@ -975,62 +975,62 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int withMC = (doGPU && propagateMCLabels); - if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } // float time_clusterizer = 0, time_fill = 0, time_networks = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.mNnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); // auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize * 
clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data // auto stop0 = std::chrono::high_resolution_clock::now(); // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations - if (clustererNNShadow.nnInferenceInputDType == 0) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32); + if (clustererNNShadow.mNnInferenceInputDType == 0) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mModelProbabilities_32); } - } else if (clustererNNShadow.nnInferenceInputDType == 1) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); + } else if (clustererNNShadow.mNnInferenceInputDType == 1) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelClass).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mModelProbabilities_32); } } - if (!clustererNNShadow.nnClusterizerUseCfRegression) { - if (clustererNNShadow.nnInferenceInputDType == 0) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32); + if (!clustererNNShadow.mNnClusterizerUseCfRegression) { + if (clustererNNShadow.mNnInferenceInputDType == 0) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg1_32); } - } else if (clustererNNShadow.nnInferenceInputDType == 1) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32); + } else if 
(clustererNNShadow.mNnInferenceInputDType == 1) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg1).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg1_32); } } - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - if (clustererNNShadow.nnInferenceInputDType == 0) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32); + if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) { + if (clustererNNShadow.mNnInferenceInputDType == 0) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_16, iSize, clustererNNShadow.mOutputDataReg2_32); } - } else if (clustererNNShadow.nnInferenceInputDType == 1) { - if (clustererNNShadow.nnInferenceOutputDType == 0) { - (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16); - } else if (clustererNNShadow.nnInferenceOutputDType == 1) { - (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); + } else if (clustererNNShadow.mNnInferenceInputDType == 1) { + if (clustererNNShadow.mNnInferenceOutputDType == 0) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_16); + } else if (clustererNNShadow.mNnInferenceOutputDType == 1) { + (nnApplication.mModelReg2).inference(clustererNNShadow.mInputData_32, iSize, clustererNNShadow.mOutputDataReg2_32); } } } @@ -1039,24 +1039,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // auto stopNNs = std::chrono::high_resolution_clock::now(); // Publishing kernels - if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Assigning class labels } - if (!clustererNNShadow.nnClusterizerUseCfRegression) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - 
runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results + if (!clustererNNShadow.mNnClusterizerUseCfRegression) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results + if (nnApplication.mModelClass.getNumOutputNodes()[0][1] > 1 && nnApplication.mModelReg2.isInitialized()) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } // for(int i = 0; i < iSize; ++i) { - // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) { - // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; + // if(clustererNNShadow.mOutputDataClass[i + batchStart] > 1) { + // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.mModelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.mOutputDataClass[i + batchStart] << " thresh " << clustererNNShadow.mNnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; // // std::string input = "["; - // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){ - // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", "; + // // for(int j = 0; j < clustererNNShadow.mNnClusterizerElementSize; j++){ + // // input += std::to_string(clustererNNShadow.mInputData_16[i * clustererNNShadow.mNnClusterizerElementSize + j].ToFloat()) + ", "; // // } // // input += "]"; // // LOG(info) << "Input is: " << input; @@ -1069,19 +1069,19 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - if (clustererNNShadow.nnClusterizerUseCfRegression) { + if (clustererNNShadow.mNnClusterizerUseCfRegression) { // auto start1 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 // auto stop1 = std::chrono::high_resolution_clock::now(); // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } - // if (clustererNNShadow.nnClusterizerVerbosity < 3) { + // if (clustererNNShadow.mNnClusterizerVerbosity < 3) { // int acceptedClusters = 0; // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) { - // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << 
clusterer.mPmemory->counters.nClusters; + // if(clustererNNShadow.mOutputDataClass[i] > 1 || clustererNNShadow.mOutputDataClass[i] < 0) { + // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.mOutputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters; // } - // acceptedClusters += clustererNNShadow.outputDataClass[i]; + // acceptedClusters += clustererNNShadow.mOutputDataClass[i]; // } // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; // } @@ -1187,9 +1187,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { // if (GetProcessingSettings().nn.applyNNclusterizer) { // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; - // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); - // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); - // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.mModelClass.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.mModelReg1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.mModelReg2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); // } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 092af2ea393c5..da37c0771fe84 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -25,69 +25,69 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerBatchedMode > 0) { - if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize); + if (mNnClusterizerBatchedMode > 0) { + if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, mInputData_16, mNnClusterizerBatchedMode * mNnClusterizerElementSize); + } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, mInputData_32, mNnClusterizerBatchedMode * mNnClusterizerElementSize); } - computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, mClusterFlags, 2 * mNnClusterizerBatchedMode); - if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) { - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + if (mNnInferenceOutputDType == 0 && mNnClusterizerElementSize > 0) { + if (mNnClusterizerModelClassNumOutputNodes > 0) { + 
computePointerWithAlignment(mem, mModelProbabilities_16, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (!mNnClusterizerUseCfRegression) { + if (mNnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg1_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2_16, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + if (mNnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg2_16, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } - } else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) { - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } else if (mNnInferenceOutputDType == 1 && mNnClusterizerElementSize > 0) { + if (mNnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, mModelProbabilities_32, mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes); } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (!mNnClusterizerUseCfRegression) { + if (mNnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg1_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + if (mNnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, mOutputDataReg2_32, mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes); } } } } - if (nnClusterizerTotalClusters > 0) { - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); + if (mNnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, mOutputDataClass, mNnClusterizerTotalClusters); } return mem; } // std::vector GPUTPCNNClusterizer::pointerSizes() { // std::vector sizes(7, -1); -// if (nnClusterizerBatchedMode > 0) { -// if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { -// sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 -// } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { -// sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 +// if (mNnClusterizerBatchedMode > 0) { +// if (mNnInferenceInputDType == 0 && mNnClusterizerElementSize > 0) { +// sizes[0] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData16 +// } else if (mNnInferenceInputDType == 1 && mNnClusterizerElementSize > 0) { +// sizes[1] = mNnClusterizerBatchedMode * mNnClusterizerElementSize; // inputData32 // } -// sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags -// if (nnClusterizerModelClassNumOutputNodes > 0) { -// sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities +// sizes[2] = 2 * mNnClusterizerBatchedMode; // mClusterFlags +// if 
(mNnClusterizerModelClassNumOutputNodes > 0) { +// sizes[3] = mNnClusterizerBatchedMode * mNnClusterizerModelClassNumOutputNodes; // modelProbabilities // } -// if (!nnClusterizerUseCfRegression) { -// if (nnClusterizerModelReg1NumOutputNodes > 0) { -// sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 +// if (!mNnClusterizerUseCfRegression) { +// if (mNnClusterizerModelReg1NumOutputNodes > 0) { +// sizes[4] = mNnClusterizerBatchedMode * mNnClusterizerModelReg1NumOutputNodes; // outputDataReg1 // } -// if (nnClusterizerModelReg2NumOutputNodes > 0) { -// sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 +// if (mNnClusterizerModelReg2NumOutputNodes > 0) { +// sizes[5] = mNnClusterizerBatchedMode * mNnClusterizerModelReg2NumOutputNodes; // outputDataReg2 // } // } // } -// if (nnClusterizerTotalClusters > 0) { -// sizes[6] = nnClusterizerTotalClusters; // outputDataClass +// if (mNnClusterizerTotalClusters > 0) { +// sizes[6] = mNnClusterizerTotalClusters; // mOutputDataClass // } // return sizes; // } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 022642f9f142e..f7c2d13407b0e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -37,42 +37,42 @@ class GPUTPCNNClusterizer : public GPUProcessor // Neural network clusterization - int nnClusterizerSizeInputRow = 3; - int nnClusterizerSizeInputPad = 3; - int nnClusterizerSizeInputTime = 3; - int nnClusterizerElementSize = -1; - bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.01; - bool nnSigmoidTrafoClassThreshold = 1; - int nnClusterizerUseCfRegression = 0; - int nnClusterizerBatchedMode = 1; - int nnClusterizerTotalClusters = 1; - int nnClusterizerVerbosity = 0; - int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerModelClassNumOutputNodes = -1; - int nnClusterizerModelReg1NumOutputNodes = -1; - int nnClusterizerModelReg2NumOutputNodes = -1; - int nnInferenceInputDType = 0; // 0: float16, 1: float32 - int nnInferenceOutputDType = 0; // 0: float16, 1: float32 + int mNnClusterizerSizeInputRow = 3; + int mNnClusterizerSizeInputPad = 3; + int mNnClusterizerSizeInputTime = 3; + int mNnClusterizerElementSize = -1; + bool mNnClusterizerAddIndexData = true; + float mNnClassThreshold = 0.01; + bool mNnSigmoidTrafoClassThreshold = 1; + int mNnClusterizerUseCfRegression = 0; + int mNnClusterizerBatchedMode = 1; + int mNnClusterizerTotalClusters = 1; + int mNnClusterizerVerbosity = 0; + int mNnClusterizerBoundaryFillValue = -1; + int mNnClusterizerModelClassNumOutputNodes = -1; + int mNnClusterizerModelReg1NumOutputNodes = -1; + int mNnClusterizerModelReg2NumOutputNodes = -1; + int mNnInferenceInputDType = 0; // 0: float16, 1: float32 + int mNnInferenceOutputDType = 0; // 0: float16, 1: float32 int mISector = -1; - int deviceId = -1; + int mDeviceId = -1; // Memory allocation for neural network - bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr - int* outputDataClass = nullptr; + bool* mClusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Technically both flags are set in the same way -> ClusterAccumulator.cx=nullptr
+ int* mOutputDataClass = nullptr;
// FP32
- float* inputData_32 = nullptr;
- float* modelProbabilities_32 = nullptr;
- float* outputDataReg1_32 = nullptr;
- float* outputDataReg2_32 = nullptr;
+ float* mInputData_32 = nullptr;
+ float* mModelProbabilities_32 = nullptr;
+ float* mOutputDataReg1_32 = nullptr;
+ float* mOutputDataReg2_32 = nullptr;
// FP16
- OrtDataType::Float16_t* inputData_16 = nullptr;
- OrtDataType::Float16_t* modelProbabilities_16 = nullptr;
- OrtDataType::Float16_t* outputDataReg1_16 = nullptr;
- OrtDataType::Float16_t* outputDataReg2_16 = nullptr;
+ OrtDataType::Float16_t* mInputData_16 = nullptr;
+ OrtDataType::Float16_t* mModelProbabilities_16 = nullptr;
+ OrtDataType::Float16_t* mOutputDataReg1_16 = nullptr;
+ OrtDataType::Float16_t* mOutputDataReg2_16 = nullptr;
int16_t mMemoryId = -1;
}; // class GPUTPCNNClusterizer
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 31b71fd8f1ebe..ca2deec60601c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -45,7 +45,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set } }
- OrtOptions = {
+ mOrtOptions = {
{"model-path", class_model_path},
{"device-type", settings.nnInferenceDevice},
{"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)},
@@ -57,60 +57,60 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, {"onnx-environment-name", "c1"}};
- model_class.initOptions(OrtOptions);
- modelsUsed[0] = true;
+ mModelClass.initOptions(mOrtOptions);
+ mModelsUsed[0] = true;
reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':');
if (!settings.nnClusterizerUseCfRegression) {
if (reg_model_paths_local.size() == 1) {
- OrtOptions["model-path"] = reg_model_paths_local[0];
- OrtOptions["onnx-environment-name"] = "r1";
- model_reg_1.initOptions(OrtOptions);
- modelsUsed[1] = true;
+ mOrtOptions["model-path"] = reg_model_paths_local[0];
+ mOrtOptions["onnx-environment-name"] = "r1";
+ mModelReg1.initOptions(mOrtOptions);
+ mModelsUsed[1] = true;
} else {
- OrtOptions["model-path"] = reg_model_paths_local[0];
- OrtOptions["onnx-environment-name"] = "r1";
- model_reg_1.initOptions(OrtOptions);
- modelsUsed[1] = true;
- OrtOptions["model-path"] = reg_model_paths_local[1];
- OrtOptions["onnx-environment-name"] = "r2";
- model_reg_2.initOptions(OrtOptions);
- modelsUsed[2] = true;
+ mOrtOptions["model-path"] = reg_model_paths_local[0];
+ mOrtOptions["onnx-environment-name"] = "r1";
+ mModelReg1.initOptions(mOrtOptions);
+ mModelsUsed[1] = true;
+ mOrtOptions["model-path"] = reg_model_paths_local[1];
+ mOrtOptions["onnx-environment-name"] = "r2";
+ mModelReg2.initOptions(mOrtOptions);
+ mModelsUsed[2] = true;
} } }
void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) {
- clustererNN.nnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression;
- clustererNN.nnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow;
- clustererNN.nnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad;
- clustererNN.nnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime;
- clustererNN.nnClusterizerAddIndexData = 
settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; - clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); + clustererNN.mNnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; + clustererNN.mNnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; + clustererNN.mNnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; + clustererNN.mNnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.mNnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; + clustererNN.mNnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; + clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; + clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.mNnSigmoidTrafoClassThreshold) { + clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); } else { - clustererNN.nnClassThreshold = settings.nnClassThreshold; + clustererNN.mNnClassThreshold = settings.nnClassThreshold; } if (settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; + clustererNN.mNnClusterizerVerbosity = settings.nnInferenceVerbosity; } else { - clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; + clustererNN.mNnClusterizerVerbosity = settings.nnClusterizerVerbosity; } - clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; - clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; - clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + clustererNN.mNnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.mNnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; + clustererNN.mNnClusterizerModelClassNumOutputNodes = mModelClass.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { - clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + if (mModelClass.getNumOutputNodes()[0][1] == 1 || !mModelReg2.isInitialized()) { + clustererNN.mNnClusterizerModelReg1NumOutputNodes = mModelReg1.getNumOutputNodes()[0][1]; } else { - clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; - clustererNN.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + clustererNN.mNnClusterizerModelReg1NumOutputNodes = mModelReg1.getNumOutputNodes()[0][1]; + clustererNN.mNnClusterizerModelReg2NumOutputNodes = 
mModelReg2.getNumOutputNodes()[0][1]; } } } @@ -199,20 +199,20 @@ void MockedOrtAllocator::LeakCheck() void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) { - mockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); + mMockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); if (recreate) { Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo))); } - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get())); - memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mMockedAlloc.get())); + memInfo = (Ort::MemoryInfo*)mMockedAlloc->Info(); } const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() { - return mockedAlloc->Info(); + return mMockedAlloc->Info(); } MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() { - return mockedAlloc.get(); + return mMockedAlloc.get(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 0379b83d0ae02..e659753f21d7d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -57,13 +57,11 @@ class GPUTPCNNClusterizerHost MockedOrtAllocator* getMockedAllocator(); const OrtMemoryInfo* getMockedMemoryInfo(); - std::unordered_map OrtOptions; - o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters - std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 - int32_t deviceId = -1; - std::vector reg_model_paths; - - std::shared_ptr mockedAlloc = nullptr; + std::unordered_map mOrtOptions; + o2::ml::OrtModel mModelClass, mModelReg1, mModelReg2; // For splitting clusters + std::vector mModelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + int32_t mDeviceId = -1; + std::shared_ptr mMockedAlloc = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 413293502d3c6..47bc5e8da80ca 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -40,7 +40,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); @@ -56,56 +56,56 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); #endif - for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) { + for (int r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) { bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) 
< 0); int pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r); - for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) { - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); - for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { + for (int p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); + for (int t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) { if (!is_boundary) { CfChargePos tmp_pos(row + r, pad + p, time + t); - if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization - clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; } if (dtype == 0) { - clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.inputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[write_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } write_idx++; } } } - if (clustererNN.nnClusterizerAddIndexData) { + if (clustererNN.mNnClusterizerAddIndexData) { if (dtype == 0) { - clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData_32[write_idx] = sector / 36.f; - clustererNN.inputData_32[write_idx + 1] = row / 152.f; - clustererNN.inputData_32[write_idx + 2] = 
static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx] = sector / 36.f; + clustererNN.mInputData_32[write_idx + 1] = row / 152.f; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } } @@ -116,62 +116,62 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()); - if (clustererNN.nnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.nnClusterizerElementSize - 1)) { - uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; + if (clustererNN.mNnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.mNnClusterizerElementSize - 1)) { + uint top_idx = (base_idx + 1) * clustererNN.mNnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; CfChargePos tmp_pos = peak.delta(d); - clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); - clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx]; } if (dtype == 0) { - clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); - clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); - clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.mInputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.mInputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData_32[top_idx - 3] = sector / 36.f; - clustererNN.inputData_32[top_idx - 2] = row / 152.f; - clustererNN.inputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[top_idx - 3] = sector / 36.f; + clustererNN.mInputData_32[top_idx - 2] = row / 152.f; + clustererNN.mInputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); } - } else if ((int32_t)transient_index < (clustererNN.nnClusterizerElementSize - 3)) { + } else if ((int32_t)transient_index < (clustererNN.mNnClusterizerElementSize - 3)) { int time = static_cast(peak.time()); - int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; + int r = CAMath::Floor(transient_index / ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1))) - clustererNN.mNnClusterizerSizeInputRow; bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); if (is_row_boundary) { if (dtype == 0) { - clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx * 
clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } else { - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); - int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1)); - int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset; - bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); + int rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); + int p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset; + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow); if (!is_boundary) { float central_charge = static_cast(chargeMap[peak].unpack()); - int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; + int t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; CfChargePos tmp_pos(row + r, pad + p, time + t); if (dtype == 0) { - clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { if (dtype == 0) { - clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } } @@ -183,9 +183,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread processors.tpcNNClusterer[sector].nnClassThreshold); + processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].mModelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].mNnClassThreshold); } else if (dtype == 1) { - 
processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + processors.tpcNNClusterer[sector].mOutputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].mModelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].mNnClassThreshold); } } @@ -194,29 +194,29 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.modelProbabilities_16[pIdx]); + current_max_prob = static_cast(clustererNN.mModelProbabilities_16[pIdx]); } else if (dtype == 1) { - current_max_prob = clustererNN.modelProbabilities_32[pIdx]; + current_max_prob = clustererNN.mModelProbabilities_32[pIdx]; } } else { if (dtype == 0) { - current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_16[pIdx].ToFloat()); + current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_16[pIdx].ToFloat()); } else if (dtype == 1) { - current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_32[pIdx]); + current_max_prob = CAMath::Max(current_max_prob, clustererNN.mModelProbabilities_32[pIdx]); } } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.mNnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.mOutputDataClass[glo_idx + batchStart] = class_label; if (class_label > 1) { - clustererNN.clusterFlags[2 * glo_idx] = 1; - clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + clustererNN.mClusterFlags[2 * glo_idx] = 1; + clustererNN.mClusterFlags[2 * glo_idx + 1] = 1; } } @@ -235,11 +235,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { + if (clustererNN.mOutputDataClass[full_glo_idx] == 1 || (clustererNN.mNnClusterizerModelReg2NumOutputNodes == -1 && clustererNN.mOutputDataClass[full_glo_idx] >= 1)) { ClusterAccumulator pc; @@ -265,21 +265,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg1_16[model_output_index + 4].ToFloat(), + static_cast(peak.pad()) + clustererNN.mOutputDataReg1_16[model_output_index].ToFloat(), + clustererNN.mOutputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.mOutputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { - pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], - static_cast(peak.pad()) + 
clustererNN.outputDataReg1_32[model_output_index], - clustererNN.outputDataReg1_32[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], - clustererNN.outputDataReg1_32[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg1_32[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.mOutputDataReg1_32[model_output_index], + clustererNN.mOutputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg1_32[model_output_index + 1], + clustererNN.mOutputDataReg1_32[model_output_index + 3], + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -330,9 +330,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 0) { + if (clustererNN.mOutputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; @@ -358,21 +358,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 8].ToFloat(), + static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index].ToFloat(), + clustererNN.mOutputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.mOutputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { - pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], - clustererNN.outputDataReg2_32[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], - clustererNN.outputDataReg2_32[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index], + clustererNN.mOutputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 2], + clustererNN.mOutputDataReg2_32[model_output_index + 6], + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -403,21 +403,21 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 
7].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_16[model_output_index + 9].ToFloat(), + static_cast(peak.pad()) + clustererNN.mOutputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.mOutputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.mOutputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { - pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], - clustererNN.outputDataReg2_32[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], - clustererNN.outputDataReg2_32[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + pc.setFull(central_charge * clustererNN.mOutputDataReg2_32[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.mOutputDataReg2_32[model_output_index + 1], + clustererNN.mOutputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.mOutputDataReg2_32[model_output_index + 3], + clustererNN.mOutputDataReg2_32[model_output_index + 7], + clustererNN.mClusterFlags[2 * glo_idx], + clustererNN.mClusterFlags[2 * glo_idx + 1]); } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap);