diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 89631d59a3846..93549178848ca 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -41,6 +41,7 @@ class OrtModel OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } void init(std::unordered_map optionsMap) { reset(optionsMap); } void reset(std::unordered_map); + bool isInitialized() { return mInitialized; } virtual ~OrtModel() = default; @@ -55,6 +56,9 @@ class OrtModel template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + void inference(I*, size_t, O*); + // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type // std::vector inference(std::vector&); @@ -79,8 +83,9 @@ class OrtModel std::vector> mInputShapes, mOutputShapes; // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + bool mInitialized = false; + std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index eb124ff6f12c9..fc784dd14d2dc 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -44,17 +44,20 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } - modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); - allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 2); - enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); - enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; + + if (!optionsMap["model-path"].empty()) { + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? 
std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); + + std::string dev_mem_str = "Hip"; #if defined(ORT_ROCM_BUILD) #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { @@ -88,12 +91,15 @@ void OrtModel::reset(std::unordered_map optionsMap) if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } (pImplOrt->sessionOptions).DisableMemPattern(); @@ -109,6 +115,9 @@ void OrtModel::reset(std::unordered_map optionsMap) } else { (pImplOrt->sessionOptions).DisableProfiling(); } + + mInitialized = true; + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); @@ -154,16 +163,9 @@ void OrtModel::reset(std::unordered_map optionsMap) outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); - - // Print names - LOG(info) << "\tInput Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); } - - LOG(info) << "\tOutput Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; } } @@ -187,36 +189,6 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); - } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); @@ -227,74 +199,68 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> -std::vector OrtModel::inference(std::vector& input) +template +std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + O* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - 
std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} +template std::vector OrtModel::inference(std::vector&); -template <> -std::vector OrtModel::inference(std::vector& input) +template std::vector OrtModel::inference(std::vector&); + +template +void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + Ort::Value inputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + } else { + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + } + + std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; + size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here } -template <> -std::vector OrtModel::inference(std::vector>& input) +template void 
OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); + +template void OrtModel::inference(float*, size_t, float*); + +template +std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + if constexpr (std::is_same_v) { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } else { + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + } } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; outputTensors.clear(); return outputValuesVec; } diff --git a/GPU/GPUTracking/Base/GPUConstantMem.h b/GPU/GPUTracking/Base/GPUConstantMem.h index 4f83fa48a64e0..8f1cc90f5ae93 100644 --- a/GPU/GPUTracking/Base/GPUConstantMem.h +++ b/GPU/GPUTracking/Base/GPUConstantMem.h @@ -34,12 +34,15 @@ #include "GPUKernelDebugOutput.h" #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizer.h" +#endif + namespace o2::gpu { struct GPUConstantMem { GPUParam param; - GPUTPCTracker - tpcTrackers[GPUCA_NSECTORS]; + GPUTPCTracker tpcTrackers[GPUCA_NSECTORS]; GPUTPCConvert tpcConverter; GPUTPCCompression tpcCompressor; GPUTPCDecompression tpcDecompressor; @@ -55,6 +58,9 @@ struct GPUConstantMem { #ifdef GPUCA_KERNEL_DEBUGGER_OUTPUT GPUKernelDebugOutput debugOutput; #endif +#ifdef GPUCA_HAS_ONNX + GPUTPCNNClusterizer tpcNNClusterer[GPUCA_NSECTORS]; +#endif template GPUd() auto& getTRDTracker(); diff --git a/GPU/GPUTracking/Base/GPUMemoryResource.h b/GPU/GPUTracking/Base/GPUMemoryResource.h index 3bb2c363db2a9..06e350db0bfc7 100644 --- a/GPU/GPUTracking/Base/GPUMemoryResource.h +++ b/GPU/GPUTracking/Base/GPUMemoryResource.h @@ -28,6 +28,7 @@ struct GPUMemoryReuse { }; enum Group : uint16_t { ClustererScratch, + NNClusterer, ClustererZS, TrackerScratch, TrackerDataLinks, diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index 656fa37fb6a4c..df9a7380834ce 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -93,6 +93,9 @@ GPUReconstruction::GPUReconstruction(const GPUSettingsDeviceBackend& cfg) : mHos for (uint32_t i = 0; i < NSECTORS; i++) { processors()->tpcTrackers[i].SetSector(i); // TODO: Move to a better place processors()->tpcClusterer[i].mISector = i; +#ifdef GPUCA_HAS_ONNX + processors()->tpcNNClusterer[i].mISector = i; +#endif } #ifndef GPUCA_NO_ROOT mROOTDump = GPUROOTDumpCore::getAndCreate(); diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c97742ac1d47f..d5a90dbd65ea3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ 
b/GPU/GPUTracking/CMakeLists.txt @@ -159,8 +159,8 @@ set(HDRS_INSTALL ) set(SRCS_NO_CINT ${SRCS_NO_CINT} display/GPUDisplayInterface.cxx) -set(SRCS_NO_CINT - ${SRCS_NO_CINT} + +set(SRCS_NO_CINT ${SRCS_NO_CINT} Global/GPUChainITS.cxx ITS/GPUITSFitter.cxx ITS/GPUITSFitterKernels.cxx @@ -191,6 +191,10 @@ set(SRCS_NO_CINT Refit/GPUTrackingRefitKernel.cxx Merger/GPUTPCGMO2Output.cxx) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") + list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx) +endif() + set(SRCS_DATATYPES ${SRCS_DATATYPES} DataTypes/TPCPadGainCalib.cxx @@ -273,6 +277,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") PRIVATE_LINK_LIBRARIES O2::DataFormatsTPC SOURCES ${SRCS_DATATYPES}) target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + o2_target_root_dictionary(GPUDataTypes HEADERS ${HDRS_CINT_DATATYPES} ${HDRS_CINT_O2_ADDITIONAL} LINKDEF GPUTrackingLinkDef_O2_DataTypes.h) @@ -292,6 +297,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::TPCFastTransformation O2::DetectorsRaw O2::Steer + O2::ML PUBLIC_INCLUDE_DIRECTORIES . Definitions DataTypes @@ -317,7 +323,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h index 3ed6c25762405..55f2e76344bd5 100644 --- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h +++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h @@ -477,6 +477,9 @@ #ifndef GPUCA_LB_GPUTPCCFClusterizer #define GPUCA_LB_GPUTPCCFClusterizer 512 #endif + #ifndef GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels 512 + #endif #ifndef GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU #define GPUCA_LB_GPUTrackingRefitKernel_mode0asGPU 256 #endif @@ -495,6 +498,16 @@ #define GPUCA_LB_GPUTPCCFNoiseSuppression_noiseSuppression GPUCA_LB_GPUTPCCFNoiseSuppression #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression + +#ifdef GPUCA_HAS_ONNX +#define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels +#define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels +#endif + #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_THREAD_COUNT_SCAN #define GPUCA_LB_GPUTPCCFStreamCompaction_scanTop GPUCA_THREAD_COUNT_SCAN diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 9b6be7743e485..63fcf51004eae 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -222,6 +222,35 @@ AddOption(tpcTriggerHandling, bool, true, "", 0, "Enable TPC trigger 
handling") AddHelp("help", 'h') EndConfig() +BeginSubConfig(GPUSettingsProcessingNNclusterizer, nn, configStandalone.proc, "NN", 0, "Processing settings for neural network clusterizer", proc_nn) +AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural network clusterizer should be used.") +AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") +AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") +AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") +AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") +AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") +AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") +AddOption(nnInferenceOrtProfiling, int, 0, "", 0, "Enables profiling of model execution in ONNX Runtime") +AddOption(nnInferenceOrtProfilingPath, std::string, ".", "", 0, "If nnInferenceOrtProfiling is set, the path to store the profiling data") +AddOption(nnInferenceVerbosity, int, 1, "", 0, "0: No messages; 1: Warnings; 2: Warnings + major debugs; >3: All debugs") +AddOption(nnClusterizerAddIndexData, int, 1, "", 0, "If normalized index data (sector, row, pad) should be appended to the input") +AddOption(nnClusterizerSizeInputRow, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerSizeInputPad, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerSizeInputTime, int, 3, "", 0, "Size of the input to the NN (currently calculated as (length-1)/2)") +AddOption(nnClusterizerUseCfRegression, int, 0, "", 0, "(bool, default = false) If true, use the regression from the native clusterizer and not the NN") +AddOption(nnClusterizerApplyCfDeconvolution, int, 0, "", 0, "Applies the CFDeconvolution kernel before the digits to the network are filled") +AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) If >1, the NN is evaluated on batched input of size specified in this variable") +AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") +AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") +AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") +AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") +AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") +AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The 
regression network path") +AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as activation function in the last layer or not).") +AddHelp("help", 'h') +EndConfig() + BeginSubConfig(GPUSettingsProcessing, proc, configStandalone, "PROC", 0, "Processing settings", proc) AddOption(deviceNum, int32_t, -1, "gpuDevice", 0, "Set GPU device to use (-1: automatic, -2: for round-robin usage in timeslice-pipeline)") AddOption(gpuDeviceOnly, bool, false, "", 0, "Use only GPU as device (i.e. no CPU for OpenCL)") @@ -299,6 +328,7 @@ AddOption(printSettings, bool, false, "", 0, "Print all settings when initializi AddVariable(eventDisplay, o2::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) +AddSubConfig(GPUSettingsProcessingNNclusterizer, nn) AddHelp("help", 'h') EndConfig() #endif // __OPENCL__ diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h index ab60827655a43..35ebbabe41672 100644 --- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h +++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h @@ -30,6 +30,7 @@ #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessing + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingParam + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingRTC + ; +#pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsProcessingNNclusterizer + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplay + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayLight + ; #pragma link C++ class o2::gpu::GPUConfigurableParamGPUSettingsDisplayHeavy + ; diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx index 6dcb6f1d7e514..37ad164d20a60 100644 --- a/GPU/GPUTracking/Global/GPUChainTracking.cxx +++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx @@ -103,6 +103,9 @@ void GPUChainTracking::RegisterPermanentMemoryAndProcessors() if (GetRecoSteps() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUProcessor(&processors()->tpcClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUProcessor(&processors()->tpcNNClusterer[i], GetRecoStepsGPU() & RecoStep::TPCClusterFinding); +#endif } } if (GetRecoSteps() & RecoStep::Refit) { @@ -148,6 +151,9 @@ void GPUChainTracking::RegisterGPUProcessors() if (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) { for (uint32_t i = 0; i < NSECTORS; i++) { mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcClusterer[i], &processors()->tpcClusterer[i]); +#ifdef GPUCA_HAS_ONNX + mRec->RegisterGPUDeviceProcessor(&processorsShadow()->tpcNNClusterer[i], &processors()->tpcNNClusterer[i]); +#endif } } if (GetRecoStepsGPU() & RecoStep::Refit) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 62a4a524d67df..63d56da37595b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -39,6 +39,11 @@ #include #endif +#ifdef GPUCA_HAS_ONNX +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCNNClusterizerHost.h" +#endif + using namespace o2::gpu; using namespace 
o2::tpc; using namespace o2::tpc::constants; @@ -149,7 +154,8 @@ std::pair GPUChainTracking::TPCClusterizerDecodeZSCount(uint uint32_t endpointAdcSamples[GPUTrackingInOutZS::NENDPOINTS]; memset(endpointAdcSamples, 0, sizeof(endpointAdcSamples)); bool doGPU = mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding; - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) : 0; + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->sector[iSector].count[0] && mIOPtrs.tpcZS->sector[iSector].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->sector[iSector].zsPtr[0][0]) + : 0; for (uint16_t j = 0; j < GPUTrackingInOutZS::NENDPOINTS; j++) { #ifndef GPUCA_NO_VC @@ -606,6 +612,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } +#ifdef GPUCA_HAS_ONNX + if (GetProcessingSettings().nn.applyNNclusterizer) { + uint32_t maxClusters = -1; + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + } + for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + AllocateRegisteredMemory(clustererNN.mMemoryId); + } + } +#endif + if (doGPU && mIOPtrs.tpcZS) { processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); @@ -854,6 +895,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) uint32_t iSector = iSectorBase + lane; GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSector]; GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSector] : clusterer; + if (doGPU) { SynchronizeStream(lane); } @@ -871,17 +913,77 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) return; } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + if (GetProcessingSettings().nn.applyNNclusterizer) { +#ifdef GPUCA_HAS_ONNX + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + + if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } + + float time_clusterizer = 0, time_fill = 0; + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + + auto start0 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + + auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, 
clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + } + + if (!clustererNN.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + } + } + auto stop1 = std::chrono::high_resolution_clock::now(); + + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; + } + auto start1 = std::chrono::high_resolution_clock::now(); + if (clustererNN.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + } + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + if (clustererNN.nnClusterizerVerbosity < 3) { + int acceptedClusters = 0; + for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + acceptedClusters += clustererNN.outputDataClass[i]; + } + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + } +#else + GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); +#endif + } else { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); + } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); if (doGPU && propagateMCLabels) { TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); if (doGPU) { SynchronizeStream(lane); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSector}}, 1); // Computes MC labels } + if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSector, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } diff --git a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h index b4a4752b0f932..cdd489e0ef938 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ChargePos.h +++ b/GPU/GPUTracking/TPCClusterFinder/ChargePos.h @@ -45,6 +45,7 @@ struct ChargePos { GPUdi() tpccf::Row row() const { return gpad / TPC_PADS_PER_ROW_PADDED; } GPUdi() tpccf::Pad pad() const { return gpad % TPC_PADS_PER_ROW_PADDED - GPUCF_PADDING_PAD; } GPUdi() tpccf::TPCFragmentTime time() const { return timePadded - GPUCF_PADDING_TIME; } + GPUdi() tpccf::TPCFragmentTime globalTime() const { return timePadded; } private: // Maps the position of a pad given as row and index in that row to a unique diff --git a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h index f0c6ac47f3c8a..90d977372b201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h +++ b/GPU/GPUTracking/TPCClusterFinder/ClusterAccumulator.h @@ -41,6 +41,17 @@ class ClusterAccumulator GPUd() tpccf::Charge updateInner(PackedCharge, tpccf::Delta2); GPUd() tpccf::Charge updateOuter(PackedCharge, tpccf::Delta2); + GPUd() void setFull(float qtot, float padMean, float padSigma, float timeMean, float timeSigma, uint8_t splitInPad, uint8_t splitInTime) + { + mQtot = qtot; + mPadMean = padMean; + mPadSigma = padSigma; + mTimeMean = timeMean; + mTimeSigma = timeSigma; + mSplitInPad = splitInPad; + mSplitInTime = splitInTime; + } + GPUd() void finalize(const ChargePos&, const tpccf::Charge, tpccf::TPCTime); GPUd() bool toNative(const ChargePos&, const tpccf::Charge, tpc::ClusterNative&, const GPUParam&, const Array2D&); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx new file mode 100644 index 0000000000000..6a9b6f546ae07 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -0,0 +1,55 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizer.cxx +/// \author Christian Sonnabend + +#include "GPUReconstruction.h" +#include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUTPCNNClusterizer.h" + +using namespace o2::gpu; + +void GPUTPCNNClusterizer::InitializeProcessor() {} + +void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} + +void* GPUTPCNNClusterizer::setIOPointers(void* mem) +{ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); + } + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + return mem; +} + +void GPUTPCNNClusterizer::RegisterMemoryAllocation() +{ + AllocateAndInitializeLate(); + int32_t memType = GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK; + mMemoryId = mRec->RegisterMemoryAllocation(this, &GPUTPCNNClusterizer::setIOPointers, memType, "TPCNNClusterer", GPUMemoryReuse{GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::NNClusterer, (uint16_t)(mISector % mRec->GetProcessingSettings().nTPCClustererLanes)}); +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h new file mode 100644 index 0000000000000..ea6340dfd48bc --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -0,0 +1,77 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizer.h +/// \author Christian Sonnabend + +#ifndef O2_GPUTPCNNCLUSTERIZER_H +#define O2_GPUTPCNNCLUSTERIZER_H + +#include "ChargePos.h" +#include "GPUProcessor.h" + +namespace o2::OrtDataType +{ +struct Float16_t; +} + +namespace o2::gpu +{ + +class GPUTPCNNClusterizer : public GPUProcessor +{ + public: + GPUTPCNNClusterizer() = default; + void* setIOPointers(void*); + void RegisterMemoryAllocation(); + void InitializeProcessor(); + void SetMaxData(const GPUTrackingInOutPointers&); + + // Neural network clusterization + + int nnClusterizerSizeInputRow = 3; + int nnClusterizerSizeInputPad = 3; + int nnClusterizerSizeInputTime = 3; + int nnClusterizerElementSize = -1; + bool nnClusterizerAddIndexData = true; + float nnClassThreshold = 0.16; + bool nnSigmoidTrafoClassThreshold = 1; + int nnClusterizerUseCfRegression = 0; + int nnClusterizerBatchedMode = 1; + int nnClusterizerTotalClusters = 1; + int nnClusterizerVerbosity = 0; + int nnClusterizerBoundaryFillValue = -1; + int nnClusterizerDumpDigits = 0; + int nnClusterizerApplyCfDeconvolution = 0; + int nnClusterizerModelClassNumOutputNodes = -1; + int nnClusterizerModelReg1NumOutputNodes = -1; + int nnClusterizerModelReg2NumOutputNodes = -1; + int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int mISector = -1; + + // Memory allocation for neural network + uint class2_elements = 0; + float* inputData32 = nullptr; + OrtDataType::Float16_t* inputData16 = nullptr; + float* outputDataClass = nullptr; + float* modelProbabilities = nullptr; + float* outputDataReg1 = nullptr; + float* outputDataReg2 = nullptr; + + ChargePos* peakPositions = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Technically both flags are set in the same way -> ClusterAccumulator.cxx + float* centralCharges = nullptr; + int16_t mMemoryId = -1; +}; // class GPUTPCNNClusterizer + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx new file mode 100644 index 0000000000000..5002c63524020 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -0,0 +1,65 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerHost.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerHost.h" +#include "GPUTPCNNClusterizer.h" +#include "GPUSettings.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +using namespace o2::gpu; + +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + OrtOptions = { + {"model-path", settings.nnClassificationPath}, + {"device", settings.nnInferenceDevice}, + {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, + {"dtype", settings.nnInferenceDtype}, + {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, + {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, + {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, + {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, + {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + + model_class.init(OrtOptions); + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + + reg_model_paths = splitString(settings.nnRegressionPath, ":"); + + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + OrtOptions["model-path"] = reg_model_paths[0]; + model_reg_1.init(OrtOptions); + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + OrtOptions["model-path"] = reg_model_paths[1]; + model_reg_2.init(OrtOptions); + clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + } + } +} + +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +{ + if (dtype == 0) { + model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + } else { + model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h new file mode 100644 index 0000000000000..7efa0edecb893 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -0,0 +1,68 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file GPUTPCNNClusterizerHost.h +/// \author Christian Sonnabend + +#ifndef O2_GPUTPCNNCLUSTERIZERHOST_H +#define O2_GPUTPCNNCLUSTERIZERHOST_H + +#include +#include +#include +#include "ML/OrtInterface.h" + +using namespace o2::ml; + +namespace o2::OrtDataType +{ +struct Float16_t; +} + +namespace o2::gpu +{ + +class GPUTPCNNClusterizer; +struct GPUSettingsProcessingNNclusterizer; + +class GPUTPCNNClusterizerHost +{ + public: + GPUTPCNNClusterizerHost() = default; + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + + std::unordered_map OrtOptions; + o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector reg_model_paths; + + private: + // Avoid including CommonUtils/StringUtils.h + std::vector splitString(const std::string& input, const std::string& delimiter) + { + std::vector tokens; + std::size_t pos = 0; + std::size_t found; + + while ((found = input.find(delimiter, pos)) != std::string::npos) { + tokens.push_back(input.substr(pos, found - pos)); + pos = found + delimiter.length(); + } + tokens.push_back(input.substr(pos)); + + return tokens; + } +}; // class GPUTPCNNClusterizerHost + +} // namespace o2::gpu + +#endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx new file mode 100644 index 0000000000000..25cd2497fbf62 --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -0,0 +1,386 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizerKernels.cxx +/// \author Christian Sonnabend + +#include "GPUTPCNNClusterizerKernels.h" +#include "GPUTPCCFClusterizer.h" + +using namespace o2::gpu; +using namespace o2::gpu::tpccf; + +#include "CfConsts.h" +#include "CfUtils.h" +#include "ClusterAccumulator.h" +#include "ML/3rdparty/GPUORTFloat16.h" + +#if !defined(GPUCA_GPUCODE) +#include "GPUHostDataTypes.h" +#include "MCLabelAccumulator.h" +#endif + +#ifdef GPUCA_GPUCODE +#include "GPUTPCCFClusterizer.inc" +#endif + +// Defining individual thread functions for data filling, determining the class label and running the CF clusterizer +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + if (clustererNN.outputDataClass[glo_idx] == 0) { // default clusterizer should not be called in batched mode due to mess-up with thread indices + return; + } + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; + GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { + return; + } + GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); +} + +// The following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} + +// Filling the input data for the neural network where there is no boundary +GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + + uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors + float central_charge = static_cast(chargeMap[peak].unpack()); + + clustererNN.peakPositions[glo_idx] = peak; + clustererNN.centralCharges[glo_idx] = central_charge; + clustererNN.outputDataClass[glo_idx + batchStart] = -1; + + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); +#ifndef GPUCA_GPUCODE + GPUCA_UNROLL(U(), U()); +#endif + for (int r = -clustererNN.nnClusterizerSizeInputRow; r <= clustererNN.nnClusterizerSizeInputRow; r++) { + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + int pad_offset = is_row_boundary ? 
0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r, clusterer.Param().tpcGeometry); + for (int p = -clustererNN.nnClusterizerSizeInputPad + pad_offset; p <= clustererNN.nnClusterizerSizeInputPad + pad_offset; p++) { + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow, clusterer.Param().tpcGeometry); + for (int t = -clustererNN.nnClusterizerSizeInputTime; t <= clustererNN.nnClusterizerSizeInputTime; t++) { + if (!is_boundary) { + ChargePos tmp_pos(row + r, pad + p, time + t); + if (r == 0 && !clustererNN.clusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering chosen for short-circuit optimization + clustererNN.clusterFlags[2 * glo_idx] = CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge); + } else { + clustererNN.inputData32[write_idx] = static_cast<float>(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + // Fill boundary positions with the fill value so that no input entries are left uninitialized + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData32[write_idx] = static_cast<float>(clustererNN.nnClusterizerBoundaryFillValue); + } + } + write_idx++; + } + } + } + if (clustererNN.nnClusterizerAddIndexData) { + if (dtype == 0) { + clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clustererNN.inputData16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast<float>(pad) / clusterer.Param().tpcGeometry.NPads(row)); + } else { + clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData32[write_idx + 1] = row / 152.f; + clustererNN.inputData32[write_idx + 2] = static_cast<float>(pad) / clusterer.Param().tpcGeometry.NPads(row); + } + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg1NumOutputNodes << " -- " << clusterer.peakPositions.size() << " -- " << clusterer.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] == 1) { + + ClusterAccumulator pc; + + // Publishing logic is taken from default clusterizer + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + clustererNN.outputDataReg1[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + clustererNN.outputDataReg1[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} + +GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); + MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); + tpc::ClusterNative* clusterOut = (onlyMC) ? 
nullptr : clusterer.mPclusterByRow; + uint full_glo_idx = glo_idx + batchStart; + int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; + + // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); + + if (clustererNN.outputDataClass[full_glo_idx] > 0) { + + ClusterAccumulator pc; + + if (onlyMC) { + ClusterAccumulator dummy_pc; + CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + GPUTPCCFClusterizer::buildCluster( + clusterer.Param().rec, + chargeMap, + clustererNN.peakPositions[glo_idx], + smem.posBcast, + smem.buf, + smem.innerAboveThreshold, + &dummy_pc, + labelAcc); + } + + if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + // Cluster 1 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 8], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + clustererNN.outputDataReg2[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + clustererNN.outputDataReg2[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + tpc::ClusterNative myCluster; + bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + uint rowIndex = 0; + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + + // Cluster 2 + pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], + static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + clustererNN.outputDataReg2[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + clustererNN.outputDataReg2[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + + rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + if (rejectCluster) { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } + + if (clusterer.mPclusterByRow != nullptr) { + rowIndex = 
GPUTPCCFClusterizer::sortIntoBuckets( + clusterer, + myCluster, + clustererNN.peakPositions[glo_idx].row(), + clusterer.mNMaxClusterPerRow, + clusterer.mPclusterInRow, + clusterOut); + if (clusterer.mPclusterPosInRow != nullptr) { + clusterer.mPclusterPosInRow[full_glo_idx] = rowIndex; + } + } else if (clusterer.mPclusterPosInRow) { + rowIndex = clusterer.mPclusterPosInRow[full_glo_idx]; + } + // CPU_ONLY(labelAcc->commit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? + } else { + if (clusterer.mPclusterPosInRow) { + clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; + } + return; + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h new file mode 100644 index 0000000000000..c7bd18115d61f --- /dev/null +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -0,0 +1,77 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUTPCNNClusterizerKernels.h +/// \author Christian Sonnabend + +#ifndef O2_GPU_NN_CLUSTERIZER_H +#define O2_GPU_NN_CLUSTERIZER_H + +#include "clusterFinderDefs.h" +#include "GPUGeneralKernels.h" +#include "GPUConstantMem.h" +#include "GPUTPCClusterFinder.h" +#include "Array2D.h" +#include "PackedCharge.h" +#include "GPUTPCNNClusterizer.h" + +namespace o2::tpc +{ +struct ClusterNative; +} // namespace o2::tpc + +namespace o2::gpu +{ + +class ClusterAccumulator; +class MCLabelAccumulator; + +class GPUTPCNNClusterizerKernels : public GPUKernelTemplate +{ + public: + static constexpr size_t SCRATCH_PAD_WORK_GROUP_SIZE = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels); + struct GPUSharedMemory { + // Regular cluster finder + ChargePos posBcast[SCRATCH_PAD_WORK_GROUP_SIZE]; + PackedCharge buf[SCRATCH_PAD_WORK_GROUP_SIZE * SCRATCH_PAD_BUILD_N]; + uint8_t innerAboveThreshold[SCRATCH_PAD_WORK_GROUP_SIZE]; + }; + + GPUhdi() constexpr static GPUDataTypes::RecoStep GetRecoStep() + { + return GPUDataTypes::RecoStep::TPCClusterFinding; + } + + enum K : int32_t { + runCfClusterizer = 0, + fillInputNN = 1, + determineClass1Labels = 2, + determineClass2Labels = 3, + publishClass1Regression = 4, + publishClass2Regression = 5, + }; + + template <int32_t iKernel = defaultKernel, typename... Args> + GPUd() static void Thread(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, processorType&, uint8_t = 0, int8_t = 0, int8_t = 0, uint = 0, Args...); + + private: + static GPUd() void fillInputData(int32_t, int32_t, int32_t, int32_t, processorType&, uint8_t, int8_t, uint); + static GPUd() void publishClustersReg1(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + static GPUd() void publishClustersReg2(uint, GPUSharedMemory&, processorType&, uint8_t, int8_t, int8_t, uint); + + static GPUd() int padOffset(int, int, const GPUTPCGeometry&); + static GPUd() int rowOffset(int, int); + static GPUd() bool isBoundary(int, int, int, const GPUTPCGeometry&); +}; + +} // namespace o2::gpu 
+ +#endif diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 4b7aab75519fa..ad348a84264f0 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -24,6 +24,9 @@ o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizati o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx) +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizerKernels.cxx) +endif() o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) @@ -111,7 +114,15 @@ o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUS o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) -o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") +if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +endif() +o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage) o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems)
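For orientation, the arithmetic behind the flattened NN input buffer written by fillInputData can be summarized with the following stand-alone sketch (not part of the patch; the helper names elementSize/windowOffset and the half-sizes R = P = T = 3 are illustrative assumptions only). Each peak occupies one block of nnClusterizerElementSize consecutive entries, starting at write_idx = glo_idx * nnClusterizerElementSize and filled in row -> pad -> time order, optionally followed by three index features (sector / 36, row / 152, pad / NPads(row)):

// Host-side illustration of the per-peak input layout used by fillInputData.
// Assumptions: R/P/T stand in for nnClusterizerSizeInputRow/Pad/Time and
// nnClusterizerElementSize equals the window volume plus the optional index features.
// The real kernel additionally normalizes charges by the central charge, applies
// padOffset for the IROC/OROC width difference, and writes a fill value at boundaries.
#include <cstdio>

// Number of input entries written per peak: a (2R+1) x (2P+1) x (2T+1) charge window,
// plus (optionally) the three index features.
constexpr int elementSize(int R, int P, int T, bool addIndexData)
{
  return (2 * R + 1) * (2 * P + 1) * (2 * T + 1) + (addIndexData ? 3 : 0);
}

// Flattened offset of neighbour (r, p, t) inside one peak's block, reproducing the
// sequential write_idx++ of the row -> pad -> time loop nest (for pad_offset == 0).
constexpr int windowOffset(int r, int p, int t, int R, int P, int T)
{
  return ((r + R) * (2 * P + 1) + (p + P)) * (2 * T + 1) + (t + T);
}

int main()
{
  const int R = 3, P = 3, T = 3; // hypothetical half-sizes
  std::printf("entries per peak: %d\n", elementSize(R, P, T, true));                  // 7*7*7 + 3 = 346
  std::printf("offset of the central charge: %d\n", windowOffset(0, 0, 0, R, P, T));  // 171
  return 0;
}

Note that write_idx is built from the batch-local glo_idx, so the NN input buffer is reused per batch, whereas outputDataClass is indexed with glo_idx + batchStart, i.e. per full sector.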