From 4fdfcfd156a2f70ce1a8aaaf882f45747b841d78 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 22 Apr 2025 11:06:27 +0200 Subject: [PATCH 1/3] ML: Fix compiler warnings --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 10 +++++----- Common/ML/include/ML/OrtInterface.h | 1 + Common/ML/src/OrtInterface.cxx | 4 ++-- .../Global/GPUChainTrackingClusterizer.cxx | 4 ++-- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 16 +++++++++------- .../GPUTPCNNClusterizerKernels.cxx | 6 +++--- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 9516ba5dad573..3bf2f465b2a35 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -535,9 +535,9 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept result = kPositiveQNaNBits; } else { auto get_msb_half = [](float fl) { - uint16_t result; + uint16_t res; #ifdef GPUCA_GPUCODE - o2::gpu::CAMath::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + o2::gpu::CAMath::memcpy(&res, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); #else #ifdef __cpp_if_constexpr if constexpr (detail::endian::native == detail::endian::little) @@ -545,12 +545,12 @@ GPUdi() uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept if (detail::endian::native == detail::endian::little) #endif { - std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + std::memcpy(&res, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); } else { - std::memcpy(&result, &fl, sizeof(uint16_t)); + std::memcpy(&res, &fl, sizeof(uint16_t)); } #endif - return result; + return res; }; uint16_t upper_bits = get_msb_half(v); diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index e37b6a69b6036..791f6813c2d24 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -22,6 +22,7 @@ #include #include #include +#include // O2 includes #include "Framework/Logger.h" diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 24a2fbffb252c..a8a20b11f9e64 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -19,6 +19,8 @@ // ONNX includes #include +#include + namespace o2 { @@ -139,7 +141,6 @@ void OrtModel::initSession() void OrtModel::memoryOnDevice(int32_t deviceIndex) { -#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD)) if (deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -161,7 +162,6 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; } } -#endif } void OrtModel::resetSession() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 981d565852d28..37c12b2a3b3f4 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -980,12 +980,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - float time_clusterizer = 0, time_fill = 0, time_networks = 0; + // float time_clusterizer = 0, time_fill = 0, time_networks = 0; for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); - auto start0 = std::chrono::high_resolution_clock::now(); + // auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data // auto stop0 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index db2f05711f537..31b71fd8f1ebe 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -29,7 +29,7 @@ using namespace o2::gpu; void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; - std::vector reg_model_paths; + std::vector reg_model_paths_local; std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); if (settings.nnLoadFromCCDB) { @@ -60,20 +60,20 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set model_class.initOptions(OrtOptions); modelsUsed[0] = true; - reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); + reg_model_paths_local = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { - if (reg_model_paths.size() == 1) { - OrtOptions["model-path"] = reg_model_paths[0]; + if (reg_model_paths_local.size() == 1) { + OrtOptions["model-path"] = reg_model_paths_local[0]; OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; } else { - OrtOptions["model-path"] = reg_model_paths[0]; + OrtOptions["model-path"] = reg_model_paths_local[0]; OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; - OrtOptions["model-path"] = reg_model_paths[1]; + OrtOptions["model-path"] = reg_model_paths_local[1]; OrtOptions["onnx-environment-name"] = "r2"; model_reg_2.initOptions(OrtOptions); modelsUsed[2] = true; @@ -154,6 +154,7 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info MockedOrtAllocator::~MockedOrtAllocator() { // Ort::GetApi().ReleaseMemoryInfo(memory_info); + (void)0; // Suppress warning for empty destructor } void* MockedOrtAllocator::Alloc(size_t size) @@ -191,8 +192,9 @@ size_t MockedOrtAllocator::NumReserveAllocations() const void MockedOrtAllocator::LeakCheck() { - if (memory_inuse.load()) + if (memory_inuse.load()) { LOG(warning) << "memory leak!!!"; + } } void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 512bc1d3bb09b..413293502d3c6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -124,7 +124,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.row()), pad = static_cast(peak.pad()); - if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { + if (clustererNN.nnClusterizerAddIndexData && (int32_t)transient_index == (clustererNN.nnClusterizerElementSize - 1)) { uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; @@ -141,7 +141,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); } - } else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) { + } else if ((int32_t)transient_index < (clustererNN.nnClusterizerElementSize - 3)) { int time = static_cast(peak.time()); int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); @@ -197,7 +197,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.modelProbabilities_16[pIdx]); From b4b7d60a4cafe5c8cb7075ab2012a48f00c5fb4b Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 22 Apr 2025 11:06:40 +0200 Subject: [PATCH 2/3] GPU Standalone: support build with ONNXRuntime --- Common/ML/CMakeLists.txt | 2 +- Common/ML/include/ML/OrtInterface.h | 2 +- GPU/GPUTracking/Base/cuda/CMakeLists.txt | 12 +- GPU/GPUTracking/Base/hip/CMakeLists.txt | 11 +- GPU/GPUTracking/CMakeLists.txt | 10 +- GPU/GPUTracking/Standalone/CMakeLists.txt | 23 +- GPU/GPUTracking/Standalone/cmake/config.cmake | 1 + GPU/GPUTracking/Standalone/cmake/prepare.sh | 2 +- GPU/GPUTracking/cmake/kernel_helpers.cmake | 2 - GPU/GPUTracking/kernels.cmake | 214 +++++++++--------- 10 files changed, 151 insertions(+), 128 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index a5b336bf7e820..0ed52e1a23e20 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -12,7 +12,7 @@ o2_add_library(ML SOURCES src/OrtInterface.cxx TARGETVARNAME targetName - PRIVATE_LINK_LIBRARIES O2::Framework onnxruntime::onnxruntime) + PRIVATE_LINK_LIBRARIES O2::GPUCommon onnxruntime::onnxruntime) # Pass ORT variables as a preprocessor definition target_compile_definitions(${targetName} PRIVATE diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 791f6813c2d24..ea70e28c0421c 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -25,7 +25,7 @@ #include // O2 includes -#include "Framework/Logger.h" +#include "GPUCommonLogger.h" namespace Ort { diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 8dd430d00a5c0..36162bcaa2f13 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -121,12 +121,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${CMAKE_SOURCE_DIR}/DataFormats/Reconstruction/src ${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) - - target_compile_definitions(${targetName} PRIVATE - GPUCA_HAS_ONNX=1 - $<$:ORT_CUDA_BUILD> - $<$:ORT_TENSORRT_BUILD>) - install(FILES ${HDRS} DESTINATION include/GPU) endif() @@ -141,6 +135,12 @@ endif() target_compile_definitions(${targetName} PRIVATE $) +if (onnxruntime_FOUND) + target_compile_definitions(${targetName} PRIVATE + $<$:ORT_CUDA_BUILD> + $<$:ORT_TENSORRT_BUILD>) +endif() + # Setting target architecture and adding GPU libraries target_link_libraries(${targetName} PRIVATE cuda cudart nvrtc) set_target_cuda_arch(${targetName}) diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 315a6c2fa3080..9398ffdd5b9f1 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -170,11 +170,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) - target_compile_definitions(${targetName} PRIVATE - GPUCA_HAS_ONNX=1 - $<$:ORT_ROCM_BUILD> - $<$:ORT_MIGRAPHX_BUILD>) - install(FILES ${HDRS} DESTINATION include/GPU) # o2_add_test(GPUsortHIP NAME test_GPUsortHIP @@ -195,6 +190,12 @@ endif() target_compile_definitions(${targetName} PRIVATE $) +if (onnxruntime_FOUND) + target_compile_definitions(${targetName} PRIVATE + $<$:ORT_ROCM_BUILD> + $<$:ORT_MIGRAPHX_BUILD>) +endif() + add_library(${MODULE}_CXX OBJECT ${SRCS_CXX}) # Adding a C++ library for the .cxx code of the HIP library, such that it does not link to HIP libraries, and CMake HIP Language doesn't add HIP compile flags. target_compile_definitions(${MODULE}_CXX PRIVATE $) target_include_directories(${MODULE}_CXX PRIVATE $) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c0648b3274108..4c1de17025627 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -200,7 +200,7 @@ set(SRCS_NO_CINT ${SRCS_NO_CINT} Refit/GPUTrackingRefitKernel.cxx Merger/GPUTPCGMO2Output.cxx) -if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") +if(onnxruntime_FOUND) list(APPEND SRCS_NO_CINT TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx TPCClusterFinder/GPUTPCNNClusterizer.cxx TPCClusterFinder/GPUTPCNNClusterizerHost.cxx) endif() @@ -343,7 +343,6 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::DetectorsRaw O2::Steer O2::ML - PRIVATE_LINK_LIBRARIES onnxruntime::onnxruntime PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS} SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H}) @@ -351,7 +350,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} @@ -421,6 +420,11 @@ target_link_libraries(${targetName} PRIVATE TBB::tbb) target_compile_options(${targetName} PRIVATE -Wno-instantiation-after-specialization) +if (onnxruntime_FOUND) + target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1) + target_link_libraries(${targetName} PRIVATE onnxruntime::onnxruntime) +endif() + # Add CMake recipes for GPU Tracking librararies if(CUDA_ENABLED OR OPENCL_ENABLED OR HIP_ENABLED) if(CMAKE_SYSTEM_NAME MATCHES Darwin) diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index fbc256d5d7f91..a17c58ad1ba03 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -121,11 +121,25 @@ else() endif() # Detect GPU Backends -find_package(O2GPU) +find_package(O2GPU REQUIRED) + +if(GPUCA_CONFIG_ONNX) + find_package(onnxruntime REQUIRED) + if(CUDA_ENABLED AND NOT DEFINED ORT_CUDA_BUILD) + set(ORT_CUDA_BUILD ON) + elseif(HIP_ENABLED AND NOT DEFINED ORT_ROCM_BUILD) + set(ORT_ROCM_BUILD ON) + endif() +else() + set(onnxruntime_FOUND OFF) +endif() # Create main targets add_subdirectory(../../ GPU) -add_library(standalone_support SHARED ${O2_DIR}/Common/Field/src/MagFieldFast.cxx +add_library(standalone_support SHARED + ${O2_DIR}/Common/Field/src/MagFieldFast.cxx + ${O2_DIR}/Common/ML/src/OrtInterface.cxx + ${O2_DIR}/Common/Utils/src/StringUtils.cxx ${O2_DIR}/DataFormats/Detectors/TPC/src/CompressedClusters.cxx ${O2_DIR}/DataFormats/Reconstruction/src/TrackParametrization.cxx ${O2_DIR}/DataFormats/Reconstruction/src/TrackParametrizationWithError.cxx @@ -150,6 +164,7 @@ target_include_directories(standalone_support PUBLIC ${O2_DIR}/Common/Constants/include ${O2_DIR}/Common/MathUtils/include ${O2_DIR}/Common/Utils/include + ${O2_DIR}/Common/ML/include ${O2_DIR}/DataFormats/common/include ${O2_DIR}/DataFormats/Detectors/Common/include ${O2_DIR}/DataFormats/Detectors/ITSMFT/common/include @@ -210,6 +225,10 @@ if(GPUCA_CONFIG_ROOT) ROOT::Tree) endif() +if(GPUCA_CONFIG_ONNX) + target_link_libraries(standalone_support PRIVATE onnxruntime::onnxruntime) +endif() + if (GPUCA_BUILD_DEBUG_SANITIZE AND CMAKE_CXX_COMPILER MATCHES "clang\\+\\+") execute_process(COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libclang_rt.asan-x86_64.so OUTPUT_VARIABLE CLANG_ASAN_SO_PATH OUTPUT_STRIP_TRAILING_WHITESPACE) get_filename_component(CLANG_ASAN_SO_PATH "${CLANG_ASAN_SO_PATH}" DIRECTORY) diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index af7c96bb96fbb..1de0cfa27d7ee 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -18,6 +18,7 @@ set(ENABLE_OPENCL AUTO) set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) +set(GPUCA_CONFIG_ONNX 0) set(GPUCA_BUILD_EVENT_DISPLAY 1) set(GPUCA_BUILD_EVENT_DISPLAY_FREETYPE 1) set(GPUCA_BUILD_EVENT_DISPLAY_VULKAN 1) diff --git a/GPU/GPUTracking/Standalone/cmake/prepare.sh b/GPU/GPUTracking/Standalone/cmake/prepare.sh index 17474b5fc6956..121245e23dc65 100755 --- a/GPU/GPUTracking/Standalone/cmake/prepare.sh +++ b/GPU/GPUTracking/Standalone/cmake/prepare.sh @@ -11,6 +11,6 @@ else fi eval "`alienv shell-helper`" # alienv load O2/latest -for i in Vc boost fmt CMake ms_gsl Clang ninja TBB ROOT; do +for i in Vc boost fmt CMake ms_gsl Clang ninja TBB ROOT ONNXRuntime; do source sw/$ALIARCH/$i/latest/etc/profile.d/init.sh done diff --git a/GPU/GPUTracking/cmake/kernel_helpers.cmake b/GPU/GPUTracking/cmake/kernel_helpers.cmake index 35f2915d9486a..e36cb4e2f3149 100644 --- a/GPU/GPUTracking/cmake/kernel_helpers.cmake +++ b/GPU/GPUTracking/cmake/kernel_helpers.cmake @@ -174,7 +174,6 @@ function(o2_gpu_kernel_add_parameter) list(LENGTH ARGV n) math(EXPR n "${n} - 1") foreach(i RANGE 0 ${n}) - message(STATUS "Adding ${ARGV${i}}") set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_PARAMS "${ARGV${i}}") endforeach() endfunction() @@ -182,7 +181,6 @@ function(o2_gpu_kernel_add_string_parameter) list(LENGTH ARGV n) math(EXPR n "${n} - 1") foreach(i RANGE 0 ${n}) - message(STATUS "Adding ${ARGV${i}}") set_property(TARGET O2_GPU_KERNELS APPEND PROPERTY O2_GPU_KERNEL_STRING_PARAMS "${ARGV${i}}") endforeach() endfunction() diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 202ea47d1f3bf..937a92fef33df 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -24,117 +24,117 @@ o2_gpu_kernel_file_list(O2PROPAGATOR TrackParametrization.cxx TrackParametrizati o2_gpu_kernel_file_list(TPCCOMPRESSION GPUTPCCompressionTrackModel.cxx) o2_gpu_kernel_file_list(TPCDECOMPRESSION GPUTPCCompressionTrackModel.cxx ERRORS) o2_gpu_kernel_file_list(TPCCLUSTERFINDER ERRORS ClusterAccumulator.cxx) -if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizerKernels.cxx) -endif() o2_gpu_kernel_file_list(TRDTRACKER GPUTRDTrack.cxx GPUTRDTracker.cxx GPUTRDTrackletWord.cxx GeometryBase.cxx) o2_gpu_kernel_file_list(GLOBALREFIT TPCMERGER O2PROPAGATOR MATLUT GPUTrackingRefit.cxx) +if(onnxruntime_FOUND) +o2_gpu_kernel_file_list(TPCNNCLUSTERFINDER ERRORS ClusterAccumulator.cxx GPUTPCNNClusterizerKernels.cxx) +endif() -o2_gpu_add_kernel("GPUTPCNeighboursFinder" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCNeighboursCleaner" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCStartHitsFinder" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCStartHitsSorter" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCTrackletConstructor" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUTPCTrackletSelector" "= TPCTRACKER" LB) -o2_gpu_add_kernel("GPUMemClean16" "GPUGeneralKernels" NO void* ptr "uint64_t" size) -o2_gpu_add_kernel("GPUitoa" "GPUGeneralKernels" NO int32_t* ptr "uint64_t" size) -o2_gpu_add_kernel("GPUTPCExtrapolationTrackingCopyNumbers" "GPUTPCExtrapolationTracking TPCTRACKER" NO int32_t n) -o2_gpu_add_kernel("GPUTPCExtrapolationTracking" "= TPCTRACKER TPCTRACKLETCONS" LB) -o2_gpu_add_kernel("GPUTPCCreateTrackingData" "= TPCTRACKER TPCSECTORDATA" LB) -o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, hitData" "= TPCTRACKER") -o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, startHits" "= TPCTRACKER") -o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, sectorTracks" "= TPCTRACKER") -o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, clearIds" "= TPCMERGER" NO int8_t parameter) -o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, sectorTracks" "= TPCMERGER" NO int8_t parameter) -o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, extrapolatedTracks1" "= TPCMERGER" NO int8_t parameter) -o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, extrapolatedTracks2" "= TPCMERGER" NO int8_t parameter) -o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, borderTracks" "= TPCMERGER" NO int8_t parameter) -o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fill" "= TPCOCCUPANCY" LB GPUTPCClusterOccupancyMapBin* map) -o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" "= TPCOCCUPANCY" LB GPUTPCClusterOccupancyMapBin* map "uint32_t*" output) -o2_gpu_add_kernel("GPUTPCGMMergerTrackFit" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT TPCDEDX" LB int32_t mode) -o2_gpu_add_kernel("GPUTPCGMMergerFollowLoopers" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT" LB) -o2_gpu_add_kernel("GPUTPCGMMergerUnpackResetIds" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector) -o2_gpu_add_kernel("GPUTPCGMMergerSectorRefit" "GPUTPCGMMergerGPU TPCMERGER MATLUT" LB int32_t iSector) -o2_gpu_add_kernel("GPUTPCGMMergerUnpackGlobal" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector) -o2_gpu_add_kernel("GPUTPCGMMergerUnpackSaveNumber" "GPUTPCGMMergerGPU TPCMERGER" NO int32_t id) -o2_gpu_add_kernel("GPUTPCGMMergerResolve, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerResolve, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerResolve, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerResolve, step3" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerResolve, step4" "GPUTPCGMMergerGPU TPCMERGER" LB int8_t useOrigTrackParam int8_t mergeAll) -o2_gpu_add_kernel("GPUTPCGMMergerClearLinks" "GPUTPCGMMergerGPU TPCMERGER" LB int8_t output) -o2_gpu_add_kernel("GPUTPCGMMergerMergeWithinPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerMergeSectorsPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t border0 int32_t border1 int8_t useOrigTrackParam) -o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step0" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector int8_t withinSector int8_t mergeMode) -o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step1" "GPUTPCGMMergerGPU TPCMERGER" NO int32_t iSector int8_t withinSector int8_t mergeMode) -o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step2" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector int8_t withinSector int8_t mergeMode) -o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, variant" "GPUTPCGMMergerGPU TPCMERGER" NO gputpcgmmergertypes::GPUTPCGMBorderRange* range int32_t N int32_t cmpMax) -o2_gpu_add_kernel("GPUTPCGMMergerMergeCE" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerLinkExtrapolatedTracks" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerCollect" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerSortTracks" "GPUTPCGMMergerGPU TPCMERGER") -o2_gpu_add_kernel("GPUTPCGMMergerSortTracksQPt" "GPUTPCGMMergerGPU TPCMERGER") -o2_gpu_add_kernel("GPUTPCGMMergerSortTracksPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMO2Output, prepare" "= TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMO2Output, sort" "= TPCMERGER") -o2_gpu_add_kernel("GPUTPCGMO2Output, output" "= TPCMERGER" LB) -o2_gpu_add_kernel("GPUTPCGMO2Output, mc" "= TPCMERGER") -o2_gpu_add_kernel("GPUTRDTrackerKernels, gpuVersion" "= TRDTRACKER MATLUT TPCMERGER" LB GPUTRDTrackerGPU* externalInstance) -o2_gpu_add_kernel("GPUTRDTrackerKernels, o2Version" "= TRDTRACKER MATLUT O2PROPAGATOR" LB GPUTRDTracker* externalInstance) -o2_gpu_add_kernel("GPUITSFitterKernels" "= TPCMERGER MATLUT" LB) -o2_gpu_add_kernel("GPUTPCConvertKernel" "=" LB) -o2_gpu_add_kernel("GPUTPCCompressionKernels, step0attached" "= TPCCOMPRESSION" LB) -o2_gpu_add_kernel("GPUTPCCompressionKernels, step1unattached" "= ERRORS" LB) -o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, unbuffered" "GPUTPCCompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" "GPUTPCCompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" "GPUTPCCompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" "GPUTPCCompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" "GPUTPCCompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" "= TPCDECOMPRESSION" LB int32_t trackStart int32_t trackEnd) -o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB int32_t sectorStart int32_t nSectors) -o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTPCDecompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, countFilteredClusters" "GPUTPCDecompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, storeFilteredClusters" "GPUTPCDecompressionKernels" LB) -o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPCCLUSTERFINDER" LB int8_t setPositions) -o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) -if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNeighboursFinder" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCNeighboursCleaner" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCStartHitsFinder" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCStartHitsSorter" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCTrackletConstructor" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUTPCTrackletSelector" "= TPCTRACKER" LB) +o2_gpu_add_kernel("GPUMemClean16" "GPUGeneralKernels" NO void* ptr uint64_t size) +o2_gpu_add_kernel("GPUitoa" "GPUGeneralKernels" NO int32_t* ptr uint64_t size) +o2_gpu_add_kernel("GPUTPCExtrapolationTrackingCopyNumbers" "GPUTPCExtrapolationTracking TPCTRACKER" NO int32_t n) +o2_gpu_add_kernel("GPUTPCExtrapolationTracking" "= TPCTRACKER TPCTRACKLETCONS" LB) +o2_gpu_add_kernel("GPUTPCCreateTrackingData" "= TPCTRACKER TPCSECTORDATA" LB) +o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, hitData" "= TPCTRACKER") +o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, startHits" "= TPCTRACKER") +o2_gpu_add_kernel("GPUTPCSectorDebugSortKernels, sectorTracks" "= TPCTRACKER") +o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, clearIds" "= TPCMERGER" NO int8_t parameter) +o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, sectorTracks" "= TPCMERGER" NO int8_t parameter) +o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, extrapolatedTracks1" "= TPCMERGER" NO int8_t parameter) +o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, extrapolatedTracks2" "= TPCMERGER" NO int8_t parameter) +o2_gpu_add_kernel("GPUTPCGlobalDebugSortKernels, borderTracks" "= TPCMERGER" NO int8_t parameter) +o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fill" "= TPCOCCUPANCY" LB GPUTPCClusterOccupancyMapBin* map) +o2_gpu_add_kernel("GPUTPCCreateOccupancyMap, fold" "= TPCOCCUPANCY" LB GPUTPCClusterOccupancyMapBin* map uint32_t* output) +o2_gpu_add_kernel("GPUTPCGMMergerTrackFit" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT TPCDEDX" LB int32_t mode) +o2_gpu_add_kernel("GPUTPCGMMergerFollowLoopers" "GPUTPCGMMergerGPU TPCMERGER TPCTRACKER MATLUT" LB) +o2_gpu_add_kernel("GPUTPCGMMergerUnpackResetIds" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector) +o2_gpu_add_kernel("GPUTPCGMMergerSectorRefit" "GPUTPCGMMergerGPU TPCMERGER MATLUT" LB int32_t iSector) +o2_gpu_add_kernel("GPUTPCGMMergerUnpackGlobal" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector) +o2_gpu_add_kernel("GPUTPCGMMergerUnpackSaveNumber" "GPUTPCGMMergerGPU TPCMERGER" NO int32_t id) +o2_gpu_add_kernel("GPUTPCGMMergerResolve, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerResolve, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerResolve, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerResolve, step3" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerResolve, step4" "GPUTPCGMMergerGPU TPCMERGER" LB int8_t useOrigTrackParam int8_t mergeAll) +o2_gpu_add_kernel("GPUTPCGMMergerClearLinks" "GPUTPCGMMergerGPU TPCMERGER" LB int8_t output) +o2_gpu_add_kernel("GPUTPCGMMergerMergeWithinPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerMergeSectorsPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t border0 int32_t border1 int8_t useOrigTrackParam) +o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step0" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector int8_t withinSector int8_t mergeMode) +o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step1" "GPUTPCGMMergerGPU TPCMERGER" NO int32_t iSector int8_t withinSector int8_t mergeMode) +o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, step2" "GPUTPCGMMergerGPU TPCMERGER" LB int32_t iSector int8_t withinSector int8_t mergeMode) +o2_gpu_add_kernel("GPUTPCGMMergerMergeBorders, variant" "GPUTPCGMMergerGPU TPCMERGER" NO gputpcgmmergertypes::GPUTPCGMBorderRange* range int32_t N int32_t cmpMax) +o2_gpu_add_kernel("GPUTPCGMMergerMergeCE" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerLinkExtrapolatedTracks" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerCollect" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerSortTracks" "GPUTPCGMMergerGPU TPCMERGER") +o2_gpu_add_kernel("GPUTPCGMMergerSortTracksQPt" "GPUTPCGMMergerGPU TPCMERGER") +o2_gpu_add_kernel("GPUTPCGMMergerSortTracksPrepare" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerPrepareClusters, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerFinalize, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step0" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step1" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMMergerMergeLoopers, step2" "GPUTPCGMMergerGPU TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMO2Output, prepare" "= TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMO2Output, sort" "= TPCMERGER") +o2_gpu_add_kernel("GPUTPCGMO2Output, output" "= TPCMERGER" LB) +o2_gpu_add_kernel("GPUTPCGMO2Output, mc" "= TPCMERGER") +o2_gpu_add_kernel("GPUTRDTrackerKernels, gpuVersion" "= TRDTRACKER MATLUT TPCMERGER" LB GPUTRDTrackerGPU* externalInstance) +o2_gpu_add_kernel("GPUTRDTrackerKernels, o2Version" "= TRDTRACKER MATLUT O2PROPAGATOR" LB GPUTRDTracker* externalInstance) +o2_gpu_add_kernel("GPUITSFitterKernels" "= TPCMERGER MATLUT" LB) +o2_gpu_add_kernel("GPUTPCConvertKernel" "=" LB) +o2_gpu_add_kernel("GPUTPCCompressionKernels, step0attached" "= TPCCOMPRESSION" LB) +o2_gpu_add_kernel("GPUTPCCompressionKernels, step1unattached" "= ERRORS" LB) +o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, unbuffered" "GPUTPCCompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered32" "GPUTPCCompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered64" "GPUTPCCompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, buffered128" "GPUTPCCompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCCompressionGatherKernels, multiBlock" "GPUTPCCompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step0attached" "= TPCDECOMPRESSION" LB int32_t trackStart int32_t trackEnd) +o2_gpu_add_kernel("GPUTPCDecompressionKernels, step1unattached" "= TPCDECOMPRESSION" LB int32_t sectorStart int32_t nSectors) +o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, sortPerSectorRow" "GPUTPCDecompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, countFilteredClusters" "GPUTPCDecompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCDecompressionUtilKernels, storeFilteredClusters" "GPUTPCDecompressionKernels" LB) +o2_gpu_add_kernel("GPUTPCCFCheckPadBaseline" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillIndexMap" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, fillFromDigits" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPCCLUSTERFINDER" LB int8_t setPositions) +o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) +o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") +o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanTop" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanDown" "= TPCCLUSTERFINDER" LB int32_t iBuf uint32_t offset int32_t nElems) +o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage CfChargePos* in CfChargePos* out) +o2_gpu_add_kernel("GPUTPCCFDecodeZS" "= TPCCLUSTERFINDER" LB int32_t firstHBF) +o2_gpu_add_kernel("GPUTPCCFDecodeZSLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) +o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) +o2_gpu_add_kernel("GPUTPCCFGather" "=" LB o2::tpc::ClusterNative* dest) +o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT " LB) +o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB) +if(onnxruntime_FOUND) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) endif() -o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") -o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanStart" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanUp" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanTop" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t nElems) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, scanDown" "= TPCCLUSTERFINDER" LB int32_t iBuf "uint32_t" offset int32_t nElems) -o2_gpu_add_kernel("GPUTPCCFStreamCompaction, compactDigits" "= TPCCLUSTERFINDER" LB int32_t iBuf int32_t stage CfChargePos* in CfChargePos* out) -o2_gpu_add_kernel("GPUTPCCFDecodeZS" "= TPCCLUSTERFINDER" LB int32_t firstHBF) -o2_gpu_add_kernel("GPUTPCCFDecodeZSLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) -o2_gpu_add_kernel("GPUTPCCFDecodeZSDenseLink" "GPUTPCCFDecodeZS" LB int32_t firstHBF) -o2_gpu_add_kernel("GPUTPCCFGather" "=" LB o2::tpc::ClusterNative* dest) -o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT " LB) -o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB) o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP NEIGHBOURS_FINDER_UNROLL_GLOBAL From e25f1513ece0d38319abf89cf49770e3fc808394 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Tue, 22 Apr 2025 18:34:36 +0200 Subject: [PATCH 3/3] CUDA ORT: Must use api struct to call functions --- .../Base/cuda/GPUReconstructionCUDA.cu | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index d5b01bfa34833..d4f9faaf203c9 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -621,24 +621,34 @@ void GPUReconstructionCUDA::loadKernelModules(bool perKernel) } } +#define ORTCHK(command) \ + { \ + OrtStatus* status = command; \ + if (status != nullptr) { \ + const char* msg = api->GetErrorMessage(status); \ + GPUFatal("ONNXRuntime Error: %s", msg); \ + } \ + } + void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { GPUChkErr(cudaGetDevice(deviceId)); #if !defined(__HIPCC__) && defined(ORT_CUDA_BUILD) + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); OrtCUDAProviderOptionsV2* cuda_options = nullptr; - CreateCUDAProviderOptions(&cuda_options); + ORTCHK(api->CreateCUDAProviderOptions(&cuda_options)); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); // this implicitly sets "has_user_compute_stream" - cuda_options.has_user_compute_stream = 1; - UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); + cuda_options->has_user_compute_stream = 1; + ORTCHK(api->UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream])); session_options.AppendExecutionProvider_CUDA_V2(cuda_options); // Finally, don't forget to release the provider options - ReleaseCUDAProviderOptions(cuda_options); + api->ReleaseCUDAProviderOptions(cuda_options); #elif defined(ORT_ROCM_BUILD) // const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId);