From 84eac06b0bb1562d826b4ace3a8435c3385b91a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:36:05 +0100 Subject: [PATCH 01/40] Initial set of bug.fixes and cosmetic changes --- .../Global/GPUChainTrackingClusterizer.cxx | 15 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 4 +- .../GPUTPCNNClusterizerHost.h | 17 -- .../GPUTPCNNClusterizerKernels.cxx | 160 ++++++++---------- 6 files changed, 101 insertions(+), 136 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 63d56da37595b..546f62b6c35d6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -614,7 +614,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; + uint32_t maxClusters = 0; for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } @@ -918,6 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -930,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + 
runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -956,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..df0f895cd5976 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * 
nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..01d1873f3b351 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; @@ -58,7 +58,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int mISector = -1; // Memory allocation for neural network - uint class2_elements = 0; float* inputData32 = nullptr; OrtDataType::Float16_t* inputData16 = nullptr; float* outputDataClass = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..321fad3d039db 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,6 +12,8 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" @@ -37,7 +39,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl model_class.init(OrtOptions); clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..430d78d0bb2fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -44,23 +44,6 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t 
pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 25cd2497fbf62..c536303147ae6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,7 +34,7 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -44,91 +44,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in 
[-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) -{ - return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. 
Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(geo.NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -144,7 +66,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.peakPositions[glo_idx] = peak; clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; + clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE @@ -192,14 +114,43 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; @@ -210,7 +161,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -223,7 +174,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -272,24 +222,25 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -302,7 +253,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -384,3 +334,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } } + +// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} From 219164923257101b6084bd97700314ea4f109d30 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 15 Mar 2025 20:37:59 +0000 Subject: [PATCH 02/40] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index df0f895cd5976..655e2bf5a933c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,7 +24,7 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerBatchedMode > 0) { if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { From b742c50537a7aa73812e62079306830884fce271 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:42:35 +0100 Subject: [PATCH 03/40] Adjusting eval sizes. 
Makes code neater and avoids some calculations --- Common/ML/src/OrtInterface.cxx | 13 ++++++------- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..ae809a2ba5c1a 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -226,19 +226,18 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 321fad3d039db..b32d042ebd1fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -57,11 +57,11 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData32, size, output); } } From 0c1cfb742e987ab50d87c0f5023a63841531335d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 20 Mar 2025 13:13:25 +0100 Subject: [PATCH 04/40] Adding separate 
functions. Now the host process only needs one instance and one initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +++--- .../GPUTPCNNClusterizerHost.cxx | 24 +++++++++++++++---- .../GPUTPCNNClusterizerHost.h | 3 +++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b928ed7c177eb..916f2634fb2f6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; + nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -640,7 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } } @@ -916,7 +918,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b32d042ebd1fa..a1f78ca787282 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -21,7 +21,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +{ + init(settings); +} + +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, @@ -37,21 +42,30 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; 
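// A minimal standalone sketch of how an options map like OrtOptions above drives a single
// o2::ml::OrtModel: init() consumes the key/value pairs and inference() runs one batched
// evaluation. The header path below is an assumption and only option keys visible in this file
// are used; nRows counts input rows (the batch size), matching the adjusted inference() of this
// series. Illustration only, not code taken from this patch.
#include "ML/OrtInterface.h" // assumed header exposing o2::ml::OrtModel
#include <string>
#include <unordered_map>
#include <vector>

std::vector<float> runClassifierOnce(const std::string& modelPath, std::vector<float>& input, size_t nRows)
{
  std::unordered_map<std::string, std::string> opts{
    {"model-path", modelPath},
    {"device", "CPU"},
    {"device-id", "0"},
    {"allocate-device-memory", "0"},
    {"logging-level", "1"}};
  o2::ml::OrtModel net;
  net.init(opts);
  // One output row per input row; the per-row output width is taken from the model itself.
  std::vector<float> output(nRows * net.getNumOutputNodes()[0][1]);
  net.inference(input.data(), nRows, output.data());
  return output;
}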
model_class.init(OrtOptions); - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = splitString(settings.nnRegressionPath, ":"); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; OrtOptions["model-path"] = reg_model_paths[1]; model_reg_2.init(OrtOptions); + } + } +} + +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 430d78d0bb2fb..1f31567dc42f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -39,6 +39,9 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; From 83c004fa0f84ab8fec7a7458e210ad63ab7a489f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 22 Mar 2025 16:55:50 +0100 Subject: [PATCH 05/40] First version of CCDB implementation --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 8 ++++++ .../Global/GPUChainTrackingClusterizer.cxx | 21 +++++++++++++++ .../GPUTPCNNClusterizerHost.cxx | 26 +++++++++++++++++++ .../GPUTPCNNClusterizerHost.h | 1 + 4 files changed, 56 insertions(+) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 40a7fc71cbb4d..7611e810768fe 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -249,6 +249,14 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. 
This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 916f2634fb2f6..c7816bb9ec17c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -615,6 +615,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { + if(nn_settings.nnLoadFromCCDB) { + std::unordered_map ccdbSettings = { + {"nnCCDBPath", nn_settings.nnCCDBPath}, + {"inputDType", nn_settings.inputDType}, + {"outputDType", nn_settings.outputDType}, + {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, + {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, + {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} + }; + + std::unordered_map networkRetrieval = ccdbSettings; + + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index a1f78ca787282..20190994b97ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -26,6 +26,32 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } +void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { + o2::ccdb::CcdbApi ccdbApi; + ccdbApi.init(settings["nnCCDBURL"]); + + metadata[settings["inputDType"]] = settings["inputDType"]; + metadata[settings["outputDType"]] = settings["outputDType"]; + metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, 
regression_2C + metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model + metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} + void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 1f31567dc42f1..a3f3ecd72ffca 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,6 +41,7 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void loadFromCCDB(std::unordered_map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From d767ed1b636f97c0fe8447e8e3ebfc854a4aa214 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 00:16:18 +0100 Subject: [PATCH 06/40] Working CCDB API calls (tested with test-ccdb) --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 11 +++++--- .../Global/GPUChainTrackingClusterizer.cxx | 13 ++++++---- .../GPUTPCNNClusterizerHost.cxx | 25 ++++++++++--------- .../GPUTPCNNClusterizerHost.h | 8 ++++-- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 7611e810768fe..5b4d08f5ffe67 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -229,6 +229,8 @@ AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify i AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 
0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -251,12 +253,13 @@ AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regress AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") -AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") -AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c7816bb9ec17c..fb6bffe51a160 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -616,26 +616,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { - std::unordered_map ccdbSettings = { + std::map ccdbSettings = { + {"nnCCDBURL", nn_settings.nnCCDBURL}, {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.inputDType}, - {"outputDType", nn_settings.outputDType}, + {"inputDType", nn_settings.nnInferenceInputDType}, + {"outputDType", nn_settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; - std::unordered_map networkRetrieval = ccdbSettings; + std::map networkRetrieval = ccdbSettings; + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; networkRetrieval["nnCCDBEvalType"] = "classification_c1"; networkRetrieval["outputFile"] = "net_classification_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; networkRetrieval["outputFile"] = "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); } + uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 20190994b97ba..b4ee558b1e201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,6 +16,7 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" +#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" @@ -26,27 +27,27 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); - metadata[settings["inputDType"]] = settings["inputDType"]; - metadata[settings["outputDType"]] = settings["outputDType"]; - metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model - metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + 
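// Each key added to this metadata map acts as a selection filter: CcdbApi::retrieveBlob() below is
// expected to match these entries against the metadata stored with the objects under nnCCDBPath and
// to download the ONNX blob of the matching network into settings["outputFile"].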
metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } if (settings["nnCCDBBeamType"] != "") { - metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; } - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; } else { LOG(error) << "Failed to retrieve network from CCDB"; } @@ -69,7 +70,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set model_class.init(OrtOptions); - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a3f3ecd72ffca..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,17 +37,21 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::unordered_map); + void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; + + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From ad4b22be3c457fbcb6e0cca852bf8800d7b9929e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 10:38:51 +0100 Subject: [PATCH 07/40] Improve fetching, but have to pass settings by value, not const ref --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 31 +++++++++++++++---- .../GPUTPCNNClusterizerHost.cxx | 4 +-- .../GPUTPCNNClusterizerHost.h | 4 +-- 4 files changed, 30 
insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 5b4d08f5ffe67..a8a4ae566f485 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -255,6 +255,7 @@ AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fb6bffe51a160..dcd5cc2197e3c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,6 +40,7 @@ #endif #ifdef GPUCA_HAS_ONNX +#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -612,7 +613,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { @@ -626,17 +627,35 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); std::map networkRetrieval = ccdbSettings; - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } + nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // 
Setting the proper path from the where the models will be initialized locally networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; + } } uint32_t maxClusters = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b4ee558b1e201..da32d4938ebed 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) { init(settings); } @@ -53,7 +53,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) +void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..b6d5e48304e0d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); + GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); - void init(const GPUSettingsProcessingNNclusterizer&); + void init(GPUSettingsProcessingNNclusterizer); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 81c646be0d2e27af8707a0b67a651cccc9de5b64 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 11:04:57 +0100 Subject: [PATCH 08/40] Using const ref and moving CCDB calls to host initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 47 +--------------- .../GPUTPCNNClusterizerHost.cxx | 53 +++++++++++++++++-- .../GPUTPCNNClusterizerHost.h | 4 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index dcd5cc2197e3c..98a0ec16495c5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,7 +40,6 @@ #endif #ifdef GPUCA_HAS_ONNX -#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -613,51 +612,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + const 
GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { - if(nn_settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", nn_settings.nnCCDBURL}, - {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.nnInferenceInputDType}, - {"outputDType", nn_settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} - }; - - std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } - nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; - } - } - uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { @@ -988,7 +945,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index da32d4938ebed..533ac0c7481ff 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } @@ -53,10 +53,55 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + + if(settings.nnLoadFromCCDB) { + std::map ccdbSettings = { + {"nnCCDBURL", settings.nnCCDBURL}, + {"nnCCDBPath", settings.nnCCDBPath}, + {"inputDType", settings.nnInferenceInputDType}, + {"outputDType", settings.nnInferenceOutputDType}, + {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} + }; + + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); + std::map networkRetrieval = ccdbSettings; + + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + loadFromCCDB(networkRetrieval); + } + class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from where the models will be initialized locally + + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path += ":" + networkRetrieval["outputFile"]; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, + {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, @@ -70,7 +115,7 @@ void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) model_class.init(OrtOptions); - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1
|| reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index b6d5e48304e0d..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); - void init(GPUSettingsProcessingNNclusterizer); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 566ddb7b0b6133cde807ef5526a2efa66be1a785 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Mar 2025 09:51:18 +0100 Subject: [PATCH 09/40] Simplifications and renaming --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - .../Global/GPUChainTrackingClusterizer.cxx | 20 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 3 +-- .../GPUTPCNNClusterizerKernels.cxx | 20 +++++++++++-------- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..cbd8501f9898f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,7 +84,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index ae809a2ba5c1a..5e9f1a8b0a5b6 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? 
std::stoi(optionsMap["intra-op-num-threads"]) : 0); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index a8a4ae566f485..83f6e320b8f5b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -228,7 +228,6 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 98a0ec16495c5..1638e134a4d6a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -641,7 +641,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -931,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } 
else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -957,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 655e2bf5a933c..cc3f29434615f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -25,9 +25,9 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 01d1873f3b351..0b9e3a6572684 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -54,7 +54,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; // Memory allocation for neural network diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 533ac0c7481ff..3463740cf7918 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -105,7 +105,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, @@ -134,7 +133,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 763119444bf7c..73051bd8477fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -125,20 +125,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcNNClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; + 
current_max_prob = clustererNN.modelProbabilities[pIdx]; } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } } template <> @@ -157,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { ClusterAccumulator pc; From a9c33b5b7775123283b0de118b99ae2945b0c669 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Mar 2025 08:52:20 +0000 Subject: [PATCH 10/40] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 12 ++++++------ .../TPCClusterFinder/GPUTPCNNClusterizerHost.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 3463740cf7918..35db3f2107e7d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,15 +27,16 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) +{ o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); metadata["inputDType"] = settings["inputDType"]; metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } @@ -58,7 +59,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { std::map ccdbSettings = { {"nnCCDBURL", settings.nnCCDBURL}, {"nnCCDBPath", settings.nnCCDBPath}, @@ -66,8 +67,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"outputDType", settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, 
{"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} - }; + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; std::string nnFetchFolder = ""; std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..210d5f94dd503 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,9 +49,9 @@ class GPUTPCNNClusterizerHost o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - private: - std::map metadata; - std::map headers; + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From 9037ea6d7b46a44e73ba5da3f741852a8189b797 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 15:16:21 +0100 Subject: [PATCH 11/40] First version of GPU stream implementation. Still needs testing. --- Common/ML/include/ML/OrtInterface.h | 4 +- Common/ML/src/OrtInterface.cxx | 13 +++--- .../Base/GPUReconstructionProcessing.h | 3 ++ GPU/GPUTracking/Base/cuda/CMakeLists.txt | 1 + .../Base/cuda/GPUReconstructionCUDA.cu | 42 +++++++++++++++++++ .../Base/cuda/GPUReconstructionCUDA.h | 1 + GPU/GPUTracking/Base/hip/CMakeLists.txt | 1 + 7 files changed, 57 insertions(+), 8 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..3d2de192a1fd6 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,8 +84,8 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 1, interOpNumThreads = 1, streamId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..7f550e8e9b32c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,8 +48,7 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + streamId = (optionsMap.contains("stream-id") ? std::stoi(optionsMap["stream-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? 
std::stoi(optionsMap["inter-op-num-threads"]) : 0); @@ -61,7 +60,8 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_ROCM_BUILD) #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); + o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); LOG(info) << "(ORT) ROCM execution provider set"; } #endif @@ -69,7 +69,7 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_MIGRAPHX_BUILD) #if ORT_MIGRAPHX_BUILD == 1 if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif @@ -77,7 +77,8 @@ void OrtModel::reset(std::unordered_map optionsMap) #if defined(ORT_CUDA_BUILD) #if ORT_CUDA_BUILD == 1 if (device == "CUDA") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); + o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } @@ -85,7 +86,7 @@ void OrtModel::reset(std::unordered_map optionsMap) #endif if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index b0466efceac24..662258ba13d97 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -22,6 +22,8 @@ #include #include +struct OrtSessionOptions; + namespace o2::gpu { @@ -88,6 +90,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; + virtual int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { return 0; } struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index de54f09fdc2e1..613a73bc49b27 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -115,6 +115,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingCUDA + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f87d5c8189cdc..f1f3c2ecba12f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -16,6 +16,7 @@ #include "GPUReconstructionCUDAIncludesHost.h" #include +#include "ML/OrtInterface.h" #include "GPUReconstructionCUDA.h" #include 
"GPUReconstructionCUDAInternals.h" @@ -35,6 +36,10 @@ #undef GPUCA_KRNL #endif +#ifdef GPUCA_HAS_ONNX +#include +#endif + static constexpr size_t REQUIRE_MIN_MEMORY = 1024L * 1024 * 1024; static constexpr size_t REQUIRE_MEMORY_RESERVED = 512L * 1024 * 1024; static constexpr size_t REQUIRE_FREE_MEMORY_RESERVED_PER_SM = 40L * 1024 * 1024; @@ -656,6 +661,28 @@ void GPUReconstructionCUDA::endGPUProfiling() { GPUChkErr(cudaProfilerStop()); } + +#ifdef GPUCA_HAS_ONNX +int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +{ + OrtCUDAProviderOptionsV2* cuda_options = nullptr; + CreateCUDAProviderOptions(&cuda_options); + + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; + // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; + // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); + + // this implicitly sets "has_user_compute_stream" + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); + Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); + + // Finally, don't forget to release the provider options + ReleaseCUDAProviderOptions(cuda_options); + + return 0; +} +#endif // GPUCA_HAS_ONNX + #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) { @@ -663,6 +690,21 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) GPUChkErr(hipHostGetDevicePointer(&retVal, ptr, 0)); return retVal; } + +#ifdef GPUCA_HAS_ONNX +int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +{ + // Create ROCm provider options + const auto& api = Ort::GetApi(); + OrtROCMProviderOptions rocm_options{}; + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.user_compute_stream = &mInternals->Streams[stream]; + + // Append the ROCm execution provider with the custom HIP stream + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(session_options, &rocm_options)); + return 0; +} +#endif // GPUCA_HAS_ONNX #endif // __HIPCC__ namespace o2::gpu diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index a98b14a873ca0..34674c549a9c7 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -79,6 +79,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 43259decef956..d4ebd29306ccc 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -153,6 +153,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${MODULE} SOURCES ${SRCS} PUBLIC_LINK_LIBRARIES ${TMP_BASELIB} O2::ITStrackingHIP + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PRIVATE_INCLUDE_DIRECTORIES ${CMAKE_SOURCE_DIR}/Detectors/Base/src ${CMAKE_SOURCE_DIR}/Detectors/TRD/base/src From 64c19d5a5700b726639236d40b619a49d21fd0c4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 15:41:11 +0100 Subject: [PATCH 12/40] Fixes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 3 +-- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 
GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 4 ++-- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index f1f3c2ecba12f..d8c4dc7914718 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -16,7 +16,6 @@ #include "GPUReconstructionCUDAIncludesHost.h" #include -#include "ML/OrtInterface.h" #include "GPUReconstructionCUDA.h" #include "GPUReconstructionCUDAInternals.h" @@ -692,7 +691,7 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { // Create ROCm provider options const auto& api = Ort::GetApi(); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 4047dcae0a6b3..f6a3d64c3e120 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -640,7 +640,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, iSector); AllocateRegisteredMemory(clustererNN.mMemoryId); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..0f53e12d2e063 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -19,12 +19,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer, int32_t streamId) { OrtOptions = { {"model-path", settings.nnClassificationPath}, {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(settings.nnInferenceDeviceId)}, + {"stream-id", std::to_string(streamId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..51f1f76679c7b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,7 +37,7 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&, int32_t = 0); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From 
8a5bb69c12ea5629d930e5c953345b7372d024d3 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 27 Mar 2025 14:44:20 +0000 Subject: [PATCH 13/40] Please consider the following formatting changes --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index d8c4dc7914718..915f3bb4707de 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -696,7 +696,7 @@ int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_option // Create ROCm provider options const auto& api = Ort::GetApi(); OrtROCMProviderOptions rocm_options{}; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.user_compute_stream = &mInternals->Streams[stream]; // Append the ROCm execution provider with the custom HIP stream From 46fb1e126da5c6bb13b4f725114c2b2c0e048649 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 27 Mar 2025 21:09:27 +0100 Subject: [PATCH 14/40] Adding the lane variable. This PR will in any case conflict with #14069 --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index f6a3d64c3e120..bf83f97b28775 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -916,7 +916,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 70320c3afce42dca26ccd10f165b246e82b6341f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 29 Mar 2025 13:39:27 +0100 Subject: [PATCH 15/40] Compiles on EPNs. Need to add shadow processors next. But for this, I will merge https://github.com/AliceO2Group/AliceO2/pull/14069 to have the changes in GPUChainTrackingClusterizer. 
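The per-lane wiring this patch aims at, pieced together from the hunks below and the clusterizer changes later in the series, looks roughly like the following sketch (illustrative only, not part of the patch; GPUTPCNNClusterizerHost, OrtModel::updateSessionOptions() and GPUChain::SetONNXGPUStream are the names introduced in this series):

    // Per-lane setup in GPUChainTrackingClusterizer (sketch, assuming the accessors added below):
    GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); // OrtOptions carries {"stream-id", lane}
    // Hand the lane's CUDA/HIP stream to each ONNX Runtime session, so that inference work
    // can be enqueued on the same stream as the clusterizer kernels:
    SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane);
    SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane);
    SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane);

SetONNXGPUStream is an empty virtual on the CPU backend and, on the CUDA/ROCm backends, attaches the stream through the execution-provider options (see the GPUReconstructionCUDA.cu changes below).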
--- Common/ML/include/ML/OrtInterface.h | 7 ++ Common/ML/src/OrtInterface.cxx | 70 +++++++++++-------- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 7 ++ .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 20 +++--- .../Base/cuda/GPUReconstructionCUDA.h | 6 +- GPU/GPUTracking/CMakeLists.txt | 14 ++++ GPU/GPUTracking/Global/GPUChain.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 3 + 9 files changed, 90 insertions(+), 40 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 3d2de192a1fd6..33e6821108112 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,6 +26,11 @@ // O2 includes #include "Framework/Logger.h" +namespace Ort { + struct SessionOptions; + struct MemoryInfo; +} + namespace o2 { @@ -42,6 +47,8 @@ class OrtModel void init(std::unordered_map optionsMap) { reset(optionsMap); } void reset(std::unordered_map); bool isInitialized() { return mInitialized; } + Ort::SessionOptions* updateSessionOptions(); + Ort::MemoryInfo* updateMemoryInfo(); virtual ~OrtModel() = default; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 7f550e8e9b32c..1f750abf8226e 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,6 +35,16 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; +Ort::SessionOptions* OrtModel::updateSessionOptions() +{ + return &(pImplOrt->sessionOptions); +} + +Ort::MemoryInfo* OrtModel::updateMemoryInfo() +{ + return &(pImplOrt->memoryInfo); +} + void OrtModel::reset(std::unordered_map optionsMap) { @@ -56,39 +66,41 @@ void OrtModel::reset(std::unordered_map optionsMap) enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); - std::string dev_mem_str = "Hip"; -#if defined(ORT_ROCM_BUILD) -#if ORT_ROCM_BUILD == 1 - if (device == "ROCM") { - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); - o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); - LOG(info) << "(ORT) ROCM execution provider set"; - } -#endif -#endif -#if defined(ORT_MIGRAPHX_BUILD) -#if ORT_MIGRAPHX_BUILD == 1 - if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); - LOG(info) << "(ORT) MIGraphX execution provider set"; - } -#endif -#endif -#if defined(ORT_CUDA_BUILD) -#if ORT_CUDA_BUILD == 1 - if (device == "CUDA") { - // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); - o2::gpu::SetONNXGPUStream(pImplOrt->sessionOptions, streamId); - LOG(info) << "(ORT) CUDA execution provider set"; - dev_mem_str = "Cuda"; - } -#endif -#endif - +// #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +// if (device == "ROCM") { +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, streamId)); +// SetONNXGPUStream(pImplOrt->sessionOptions, streamId); +// LOG(info) << "(ORT) ROCM execution provider set"; +// } +// #endif +// #if defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1 +// if (device == "MIGRAPHX") { +// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, streamId)); +// LOG(info) << "(ORT) MIGraphX execution provider set"; +// } +// #endif +// #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +// if (device == "CUDA") { +// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, streamId)); +// SetONNXGPUStream(pImplOrt->sessionOptions, streamId); +// LOG(info) << "(ORT) CUDA execution provider set"; +// dev_mem_str = "Cuda"; +// } +// #endif + +#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { + std::string dev_mem_str = ""; + if (device == "ROCM") { + dev_mem_str = "Hip"; + } + if (device == "CUDA") { + dev_mem_str = "Cuda"; + } pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, streamId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } +#endif if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index fd999ec2304e1..3bb6fff25be17 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,6 +24,10 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" +namespace Ort { + struct SessionOptions; +} + namespace o2::gpu { @@ -111,6 +115,9 @@ class GPUReconstructionCPU : public GPUReconstructionKernels GetThreadContext() override; - virtual int32_t SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) { return 0; } + virtual void SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 915f3bb4707de..3e0f739418125 100644 --- 
a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -661,11 +661,12 @@ void GPUReconstructionCUDA::endGPUProfiling() GPUChkErr(cudaProfilerStop()); } -#ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) { OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); + OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", "cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; @@ -673,12 +674,10 @@ int32_t GPUReconstructionCUDA::SetONNXGPUStream(OrtSessionOptions* session_optio // this implicitly sets "has_user_compute_stream" UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); - Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(session_options, cuda_options)); + Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(raw_options, cuda_options)); // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); - - return 0; } #endif // GPUCA_HAS_ONNX @@ -690,8 +689,8 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) return retVal; } -#ifdef GPUCA_HAS_ONNX -int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) +#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) { // Create ROCm provider options const auto& api = Ort::GetApi(); @@ -699,10 +698,13 @@ int32_t GPUReconstructionHIP::SetONNXGPUStream(OrtSessionOptions* session_option rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.user_compute_stream = &mInternals->Streams[stream]; + // Get the raw OrtSessionOptions pointer from the Ort::SessionOptions wrapper + OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); + // Append the ROCm execution provider with the custom HIP stream - Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(session_options, &rocm_options)); - return 0; + Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(raw_options, &rocm_options)); } + #endif // GPUCA_HAS_ONNX #endif // __HIPCC__ diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 34674c549a9c7..b72b8264c4098 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,6 +25,10 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif +namespace Ort { + struct SessionOptions; +} + namespace o2::gpu { struct GPUReconstructionCUDAInternals; @@ -79,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt 
index ad7dd9c210cd1..0859502e59ef2 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -14,6 +14,20 @@ set(MODULE GPUTracking) # set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0") # to uncomment if needed, tired of typing this... # set(GPUCA_BUILD_DEBUG 1) +# Pass ORT variables as a preprocessor definition +if(DEFINED ENV{ORT_ROCM_BUILD}) + add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) +endif() +if(DEFINED ENV{ORT_CUDA_BUILD}) + add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) +endif() +if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) + add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) +endif() +if(DEFINED ENV{ORT_TENSORRT_BUILD}) + add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) +endif() + if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 290ae32cafca8..66b50b781172a 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,6 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } + inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream) { mRec->SetONNXGPUStream(opt, stream); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bf83f97b28775..e5e36189fac50 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -917,6 +917,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN, lane); + SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane); + SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane); + SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 9d9267f6d9afc191c15022eb58f0afd9ce6b997f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 29 Mar 2025 19:20:21 +0100 Subject: [PATCH 16/40] Adding shadow instance. 
Not sure if this correctly allocates GPU memory using AllocateRegisteredMemory --- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 6 +- .../Base/cuda/GPUReconstructionCUDA.h | 2 +- GPU/GPUTracking/Global/GPUChain.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 97 +++++++++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 29 ++++++ .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 6 +- .../GPUTPCNNClusterizerHost.h | 2 +- 10 files changed, 100 insertions(+), 49 deletions(-) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 3bb6fff25be17..6f2610c3c93c7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -116,7 +116,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels GetThreadContext() override; - virtual void SetONNXGPUStream(OrtSessionOptions* session_options, int32_t stream) {} + virtual void SetONNXGPUStream(OrtSessionOptions*, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 3e0f739418125..26ef569fe1b7c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -662,8 +662,9 @@ void GPUReconstructionCUDA::endGPUProfiling() } #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) { + cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); @@ -690,9 +691,10 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream) +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) { // Create ROCm provider options + cudaGetDevice(deviceId); const auto& api = Ort::GetApi(); OrtROCMProviderOptions rocm_options{}; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index b72b8264c4098..8194385444ade 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -83,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 66b50b781172a..4130990a7d1e2 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,7 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } - inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream) { mRec->SetONNXGPUStream(opt, 
stream); } + inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index a27bac308cdc6..3d5cb79711957 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -622,28 +622,45 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; + clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { + clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); } if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initClusterizer(nn_settings, clustererNN); - AllocateRegisteredMemory(clustererNN.mMemoryId); + clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initClusterizer(nn_settings, clustererNNShadow); + // if (doGPU) { + // std::vector pointerSizes = clustererNNShadow.pointerSizes(); + // // FIXME: These are for sure not needed. The arrays are empty at this point, only the space needs to be reserved. Is this already handeled by computePointerWithAlignment? + // // Once a GPU is available, everything should be done on the GPU for now. + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData32, clustererNN.inputData32, pointerSizes[0], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData16, clustererNN.inputData16, pointerSizes[1], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataClass, clustererNN.outputDataClass, pointerSizes[2], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.modelProbabilities, clustererNN.modelProbabilities, pointerSizes[3], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg1, clustererNN.outputDataReg1, pointerSizes[4], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg2, clustererNN.outputDataReg2, pointerSizes[5], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.peakPositions, clustererNN.peakPositions, pointerSizes[6], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.clusterFlags, clustererNN.clusterFlags, pointerSizes[7], lane, true); + // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.centralCharges, clustererNN.centralCharges, pointerSizes[8], lane, true); + // } else { + // AllocateRegisteredMemory(clustererNNShadow.mMemoryId); + // } + AllocateRegisteredMemory(clustererNNShadow.mMemoryId); } } #endif @@ -917,41 +934,43 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[iSector] : clustererNN; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); - SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane); - SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane); - SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane); + GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); // FIXME: This needs to be the deviceID. If that is the lane, then this line is correct + int32_t deviceId = -1; + SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane, &deviceId); + SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane, &deviceId); + SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane, &deviceId); int withMC = (doGPU && propagateMCLabels); - if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { + if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } float time_clusterizer = 0, time_fill = 0; - for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNN.nnClusterizerBatchedMode); batch++) { - uint batchStart = batch * clustererNN.nnClusterizerBatchedMode; - size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); + for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { + uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; + size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } - if 
(!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -960,15 +979,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } auto start1 = std::chrono::high_resolution_clock::now(); - if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + if (clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - if (clustererNN.nnClusterizerVerbosity < 3) { + if (clustererNNShadow.nnClusterizerVerbosity < 3) { int acceptedClusters = 0; for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNN.outputDataClass[i]; + acceptedClusters += clustererNNShadow.outputDataClass[i]; } LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index cc3f29434615f..f4e442a6d7fb4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -51,6 +51,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } +std::vector GPUTPCNNClusterizer::pointerSizes() { + std::vector sizes(9, -1); + if (nnClusterizerBatchedMode > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { + sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { + sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 + } + sizes[2] = nnClusterizerBatchedMode; // peakPositions + sizes[3] = 2 * nnClusterizerBatchedMode; // clusterFlags + sizes[4] = nnClusterizerBatchedMode; // centralCharges + if (nnClusterizerModelClassNumOutputNodes > 0) { + sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + sizes[6] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + sizes[7] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 + } + } + } + if (nnClusterizerTotalClusters > 0) { + sizes[8] = nnClusterizerTotalClusters; // outputDataClass + } + return sizes; +} + void GPUTPCNNClusterizer::RegisterMemoryAllocation() { AllocateAndInitializeLate(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9e3a6572684..0457534b3f903 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -34,6 +34,7 @@ class GPUTPCNNClusterizer : public GPUProcessor void RegisterMemoryAllocation(); void InitializeProcessor(); void SetMaxData(const GPUTrackingInOutPointers&); + std::vector pointerSizes(); // Neural network clusterization diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5b9413ffbea32..00a1bc09e536a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -142,11 +142,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype, int32_t deviceId) { if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output); + model.inference(clustererNN.inputData16, size, output, deviceId); } else { - model.inference(clustererNN.inputData32, size, output); + model.inference(clustererNN.inputData32, size, output, deviceId); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 788186e13966d..ee0a5ea19d1dd 100644 --- 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -43,7 +43,7 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); + void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters From 007a4a16a984c54f365f0ec416bfdc4607971be4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 14:40:25 +0200 Subject: [PATCH 17/40] This runs, but will eventually fill up the VRAM. Need to include a mem clean --- Common/ML/CMakeLists.txt | 16 +-- Common/ML/include/ML/OrtInterface.h | 19 ++- Common/ML/src/OrtInterface.cxx | 124 +++++++----------- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 2 +- .../Base/GPUReconstructionProcessing.h | 6 +- .../Base/cuda/GPUReconstructionCUDA.cu | 28 ++-- .../Base/cuda/GPUReconstructionCUDA.h | 2 +- GPU/GPUTracking/CMakeLists.txt | 21 ++- GPU/GPUTracking/Global/GPUChain.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 117 +++++++++++------ .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 16 +-- .../GPUTPCNNClusterizerHost.cxx | 35 +++-- .../GPUTPCNNClusterizerHost.h | 6 +- .../GPUTPCNNClusterizerKernels.cxx | 71 +++++----- 14 files changed, 242 insertions(+), 223 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 74be306c8b6a5..5bfa05b716123 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -10,18 +10,10 @@ # or submit itself to any jurisdiction. 
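Note on the VRAM growth mentioned in the subject of this patch: one possible mitigation, sketched below and not part of the patch itself, is to bound the CUDA execution provider arena through the same provider-options mechanism already used to pass the user compute stream. The helper name capOrtGpuMemory and the byte limit are illustrative only; the option keys mirror the "gpu_mem_limit" and "arena_extend_strategy" strings quoted in the commented-out block of GPUReconstructionCUDA.cu in this series, and releasing the provider options right after the append follows the pattern used there.

#include <onnxruntime_cxx_api.h>
#include <string>

// Sketch (assumption, not part of this patch): cap the ORT CUDA execution
// provider arena so repeated inference batches cannot grow VRAM without limit.
void capOrtGpuMemory(Ort::SessionOptions& sessionOptions, size_t limitBytes)
{
  OrtCUDAProviderOptionsV2* cudaOptions = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cudaOptions));
  const std::string limit = std::to_string(limitBytes);
  const char* keys[] = {"gpu_mem_limit", "arena_extend_strategy"};
  const char* values[] = {limit.c_str(), "kSameAsRequested"};
  Ort::ThrowOnError(Ort::GetApi().UpdateCUDAProviderOptions(cudaOptions, keys, values, 2));
  sessionOptions.AppendExecutionProvider_CUDA_V2(*cudaOptions); // session options copy the configuration
  Ort::GetApi().ReleaseCUDAProviderOptions(cudaOptions);        // safe to release after the append
}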
# Pass ORT variables as a preprocessor definition -if(DEFINED ENV{ORT_ROCM_BUILD}) - add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) -endif() -if(DEFINED ENV{ORT_CUDA_BUILD}) - add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) -endif() -if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) - add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) -endif() -if(DEFINED ENV{ORT_TENSORRT_BUILD}) - add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) -endif() +add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) +add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) +add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) +add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) o2_add_library(ML SOURCES src/OrtInterface.cxx diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 5034899debb60..44c89b748f52c 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -43,12 +43,19 @@ class OrtModel public: // Constructor OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } - void init(std::unordered_map optionsMap) { reset(optionsMap); } - void reset(std::unordered_map); + OrtModel(std::unordered_map optionsMap) { + initOptions(optionsMap); + initEnvironment(); + } + void init(std::unordered_map optionsMap) { + initOptions(optionsMap); + initEnvironment(); + } + void initOptions(std::unordered_map optionsMap); + void initEnvironment(); bool isInitialized() { return mInitialized; } - Ort::SessionOptions* updateSessionOptions(); - Ort::MemoryInfo* updateMemoryInfo(); + Ort::SessionOptions& updateSessionOptions(); + void setIO(); virtual ~OrtModel() = default; @@ -91,7 +98,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, envName = "", device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 5d85e3194d07d..3100eb6dd2243 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,19 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -Ort::SessionOptions* OrtModel::updateSessionOptions() +Ort::SessionOptions& OrtModel::updateSessionOptions() { - return &(pImplOrt->sessionOptions); + return pImplOrt->sessionOptions; } -Ort::MemoryInfo* OrtModel::updateMemoryInfo() +void OrtModel::initOptions(std::unordered_map optionsMap) { - return &(pImplOrt->memoryInfo); -} - -void OrtModel::reset(std::unordered_map optionsMap) -{ - pImplOrt = new OrtVariables(); // Load from options map @@ -58,71 +52,57 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? 
std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - -// #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -// if (device == "ROCM") { -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); -// SetONNXGPUStream(pImplOrt->sessionOptions, deviceId); -// LOG(info) << "(ORT) ROCM execution provider set"; -// } -// #endif -// #if defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1 -// if (device == "MIGRAPHX") { -// Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); -// LOG(info) << "(ORT) MIGraphX execution provider set"; -// } -// #endif -// #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -// if (device == "CUDA") { -// // Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); -// SetONNXGPUStream(pImplOrt->sessionOptions, deviceId); -// LOG(info) << "(ORT) CUDA execution provider set"; -// dev_mem_str = "Cuda"; -// } -// #endif - - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); - if (intraOpNumThreads > 1 || interOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - if (loggingLevel < 2) { - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); + + if (device == "CPU") { + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); + (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); + if (intraOpNumThreads > 1 || interOpNumThreads > 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if (intraOpNumThreads == 1) { + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + if (loggingLevel < 2) { + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " (intraOpNumThreads) and " << interOpNumThreads << " (interOpNumThreads) threads"; + } } - } - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); + // OrtROCMProviderOptions rocm_options{}; + // (pImplOrt->sessionOptions).AppendExecutionProvider_ROCM(rocm_options); - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); + + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + (pImplOrt->sessionOptions).DisableProfiling(); + } } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; (pImplOrt->sessionOptions).DisableProfiling(); } + + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); } else { - (pImplOrt->sessionOptions).DisableProfiling(); + LOG(fatal) << "(ORT) Model path cannot be empty!"; } +} +void OrtModel::initEnvironment() +{ mInitialized = true; - - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), - (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + (envName.empty() ? 
"ORT" : envName.c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { if (severity == ORT_LOGGING_LEVEL_VERBOSE) { @@ -143,6 +123,10 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + setIO(); +} + +void OrtModel::setIO() { for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } @@ -162,7 +146,6 @@ void OrtModel::reset(std::unordered_map optionsMap) outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); - } if (loggingLevel < 2) { LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; } @@ -203,9 +186,6 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -213,8 +193,8 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; } #endif std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; @@ -241,11 +221,12 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceIndex) { + // std::vector providers = Ort::GetAvailableProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Available Execution Provider: " << provider; + // } #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -253,8 +234,8 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; } #endif std::vector 
inputShape{input_size, (int64_t)mInputShapes[0][1]}; @@ -268,7 +249,7 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*, int32_t); @@ -280,9 +261,6 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (allocateDeviceMemory) { - if (deviceIndex >= 0) { - deviceId = deviceIndex; - } std::string dev_mem_str = ""; if (device == "ROCM") { dev_mem_str = "Hip"; @@ -290,8 +268,8 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d if (device == "CUDA") { dev_mem_str = "Cuda"; } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID " << deviceIndex; } #endif std::vector inputTensor; diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 6f2610c3c93c7..f41893e32b175 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -116,7 +116,7 @@ class GPUReconstructionCPU : public GPUReconstructionKernels #include -struct OrtSessionOptions; +namespace Ort { +struct SessionOptions; +} namespace o2::gpu { @@ -90,7 +92,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - virtual void SetONNXGPUStream(OrtSessionOptions*, int32_t, int32_t*) {} + virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 26ef569fe1b7c..959072222125e 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -662,12 +662,11 @@ void GPUReconstructionCUDA::endGPUProfiling() } #if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 -void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) +void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); - OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy", 
"cudnn_conv_algo_search", "do_copy_in_default_stream", "cudnn_conv_use_max_workspace", "cudnn_conv1d_pad_to_nc1d"}; // std::vector values{"0", "2147483648", "kSameAsRequested", "DEFAULT", "1", "1", "1"}; @@ -675,7 +674,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions* session_option // this implicitly sets "has_user_compute_stream" UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); - Ort::ThrowOnError(SessionOptionsAppendExecutionProvider_CUDA_V2(raw_options, cuda_options)); + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); @@ -691,20 +690,23 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) } #if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 -void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions* session_options, int32_t stream, int32_t* deviceId) +void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { // Create ROCm provider options cudaGetDevice(deviceId); const auto& api = Ort::GetApi(); - OrtROCMProviderOptions rocm_options{}; - rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.user_compute_stream = &mInternals->Streams[stream]; - - // Get the raw OrtSessionOptions pointer from the Ort::SessionOptions wrapper - OrtSessionOptions* raw_options = session_options->operator OrtSessionOptions*(); - - // Append the ROCm execution provider with the custom HIP stream - Ort::ThrowOnError(api.SessionOptionsAppendExecutionProvider_ROCM(raw_options, &rocm_options)); + // api.GetCurrentGpuDeviceId(deviceId); + OrtROCMProviderOptions rocm_options; + LOG(info) << "Creating ROCm provider options"; + // rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + // LOG(info) << "Setting user compute stream"; + // rocm_options.user_compute_stream = &(mInternals->Streams[stream]); + // LOG(info) << "Stream is set with streamId " << stream << " and reference " << &(mInternals->Streams[stream]); + session_options.AppendExecutionProvider_ROCM(rocm_options); + LOG(info) << "Appending ROCm provider options"; + // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); + // api.ReleaseROCMProviderOptions(rocm_options); + LOG(info) << "Releasing ROCm provider options"; } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 8194385444ade..cb4540015ff76 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -83,7 +83,7 @@ class GPUReconstructionCUDA : public GPUReconstructionKernels* trackerTraits, std::unique_ptr* vertexerTraits, std::unique_ptr* timeFrame) override; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 0859502e59ef2..186d8ce4b0551 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -15,18 +15,15 @@ set(MODULE GPUTracking) # set(GPUCA_BUILD_DEBUG 1) # Pass ORT variables as a preprocessor definition -if(DEFINED ENV{ORT_ROCM_BUILD}) - add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) -endif() -if(DEFINED ENV{ORT_CUDA_BUILD}) - add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) -endif() -if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) - add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) 
-endif() -if(DEFINED ENV{ORT_TENSORRT_BUILD}) - add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) -endif() +add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) +add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) +add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) +add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) + +message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") +message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") +message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") +message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") diff --git a/GPU/GPUTracking/Global/GPUChain.h b/GPU/GPUTracking/Global/GPUChain.h index 4130990a7d1e2..59712c30a62dd 100644 --- a/GPU/GPUTracking/Global/GPUChain.h +++ b/GPU/GPUTracking/Global/GPUChain.h @@ -83,7 +83,7 @@ class GPUChain inline GPUParam& param() { return mRec->param(); } inline const GPUConstantMem* processors() const { return mRec->processors(); } inline void SynchronizeStream(int32_t stream) { mRec->SynchronizeStream(stream); } - inline void SetONNXGPUStream(Ort::SessionOptions* opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } + inline void SetONNXGPUStream(Ort::SessionOptions& opt, int32_t stream, int32_t* deviceId) { mRec->SetONNXGPUStream(opt, stream, deviceId); } inline void SynchronizeEvents(deviceEvent* evList, int32_t nEvents = 1) { mRec->SynchronizeEvents(evList, nEvents); } inline void SynchronizeEventAndRelease(deviceEvent& ev, bool doGPU = true) { diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 3d5cb79711957..e3088d6143f9b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -623,44 +623,56 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; - clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); - clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; - clustererNNShadow.nnClusterizerTotalClusters = maxClusters; - clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { - clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + + if (doGPU){ + clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); + clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { + clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initModels(); + nnApplication.initClusterizer(nn_settings, clustererNNShadow); } else { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + // not sure if this part is needed at all + clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + } + if (nn_settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; + } + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; + nnApplication.initModels(); + nnApplication.initClusterizer(nn_settings, clustererNN); } - clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initClusterizer(nn_settings, clustererNNShadow); - // if (doGPU) { - // std::vector pointerSizes = clustererNNShadow.pointerSizes(); - // // FIXME: These are for sure not needed. The arrays are empty at this point, only the space needs to be reserved. Is this already handeled by computePointerWithAlignment? - // // Once a GPU is available, everything should be done on the GPU for now. - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData32, clustererNN.inputData32, pointerSizes[0], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.inputData16, clustererNN.inputData16, pointerSizes[1], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataClass, clustererNN.outputDataClass, pointerSizes[2], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.modelProbabilities, clustererNN.modelProbabilities, pointerSizes[3], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg1, clustererNN.outputDataReg1, pointerSizes[4], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.outputDataReg2, clustererNN.outputDataReg2, pointerSizes[5], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.peakPositions, clustererNN.peakPositions, pointerSizes[6], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.clusterFlags, clustererNN.clusterFlags, pointerSizes[7], lane, true); - // GPUMemCpy(RecoStep::TPCClusterFinding, clustererNNShadow.centralCharges, clustererNN.centralCharges, pointerSizes[8], lane, true); - // } else { - // AllocateRegisteredMemory(clustererNNShadow.mMemoryId); - // } - AllocateRegisteredMemory(clustererNNShadow.mMemoryId); + AllocateRegisteredMemory(clustererNN.mMemoryId); } } #endif @@ -936,12 +948,30 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, lane); // FIXME: This needs to be the deviceID. 
If that is the lane, then this line is correct + int32_t deviceId = -1; - SetONNXGPUStream(nnApplication.model_class.updateSessionOptions(), lane, &deviceId); - SetONNXGPUStream(nnApplication.model_reg_1.updateSessionOptions(), lane, &deviceId); - SetONNXGPUStream(nnApplication.model_reg_2.updateSessionOptions(), lane, &deviceId); + GPUTPCNNClusterizerHost nnApplication(nn_settings); + LOG(info) << "Allocating ONNX stream for lane " << lane << " and sector " << iSector; + if (nnApplication.modelsUsed[0]) { + SetONNXGPUStream((nnApplication.model_class).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_class).initEnvironment(); + } + if (nnApplication.modelsUsed[1]) { + SetONNXGPUStream((nnApplication.model_reg_1).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_reg_1).initEnvironment(); + } + if (nnApplication.modelsUsed[2]) { + SetONNXGPUStream((nnApplication.model_reg_2).updateSessionOptions(), lane, &deviceId); + (nnApplication.model_reg_2).initEnvironment(); + } + int withMC = (doGPU && propagateMCLabels); + if (doGPU){ + // SetupGPUProcessor(&clustererNN, true); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer[lane] - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); + LOG(info) << "Successfully allocated for stream " << lane << " and sector " << iSector << " with memory size " << sizeof(clustererNN) << " and shadow size " << sizeof(clustererNNShadow); + } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -958,7 +988,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); + LOG(info) << "ONNX stream set. 
Device ID is " << deviceId << " for stream " << lane; + nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { @@ -966,10 +997,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index f4e442a6d7fb4..208e8c6428cb5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -30,9 +30,7 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); if (nnClusterizerModelClassNumOutputNodes > 0) { computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } @@ -52,30 +50,28 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) } std::vector GPUTPCNNClusterizer::pointerSizes() { - std::vector sizes(9, -1); + std::vector sizes(7, -1); if (nnClusterizerBatchedMode > 0) { if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 } - sizes[2] = nnClusterizerBatchedMode; // peakPositions - sizes[3] = 2 * nnClusterizerBatchedMode; // clusterFlags - sizes[4] = nnClusterizerBatchedMode; // centralCharges + sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags if (nnClusterizerModelClassNumOutputNodes > 0) { - sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // 
modelProbabilities + sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities } if (!nnClusterizerUseCfRegression) { if (nnClusterizerModelReg1NumOutputNodes > 0) { - sizes[6] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 + sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 } if (nnClusterizerModelReg2NumOutputNodes > 0) { - sizes[7] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 + sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 } } } if (nnClusterizerTotalClusters > 0) { - sizes[8] = nnClusterizerTotalClusters; // outputDataClass + sizes[6] = nnClusterizerTotalClusters; // outputDataClass } return sizes; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 00a1bc09e536a..bd17d27edb3c4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,11 +22,6 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, int32_t deviceId) -{ - init(settings, deviceId); -} - void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; @@ -54,7 +49,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings, int32_t deviceId) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; @@ -103,7 +98,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set OrtOptions = { {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, - {"device-id", std::to_string(deviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, @@ -112,23 +106,40 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; - model_class.init(OrtOptions); + LOG(info) << "Model path: " << class_model_path; + model_class.initOptions(OrtOptions); + modelsUsed[0] = true; reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; - model_reg_1.init(OrtOptions); + model_reg_1.initOptions(OrtOptions); + modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; - model_reg_2.init(OrtOptions); + model_reg_2.initOptions(OrtOptions); + modelsUsed[2] = true; } } } +void GPUTPCNNClusterizerHost::initModels() { + if (!model_class.isInitialized() && modelsUsed[0]) { + model_class.initEnvironment(); + } + if (!model_reg_1.isInitialized() && modelsUsed[1]) { 
+ model_reg_1.initEnvironment(); + } + if (!model_reg_2.isInitialized() && modelsUsed[2]) { + model_reg_2.initEnvironment(); + } +} + void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index ee0a5ea19d1dd..a383cbfd2bc7f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,16 +37,18 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, int32_t = 0); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } - void init(const GPUSettingsProcessingNNclusterizer&, int32_t = 0); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void initModels(); void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 std::vector reg_model_paths; private: diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 73051bd8477fd..ef75e1c1af19e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -56,20 +56,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); - Array2D isPeakMap(clusterer.mPpeakMap); - uint write_idx = glo_idx * clustererNN.nnClusterizerElementSize; // Potential optimization: Either choose nnClusterizerBatchedMode as a power of 2 or calculate from threadId and blockId + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; int row = static_cast(peak.row()), pad = static_cast(peak.pad()), time = static_cast(peak.time()); // Explicit casting to avoid conversion errors float central_charge = static_cast(chargeMap[peak].unpack()); - - clustererNN.peakPositions[glo_idx] = peak; - clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; - int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + #ifndef GPUCA_GPUCODE GPUCA_UNROLL(U(), U()); #endif @@ -153,6 +148,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; @@ -168,34 +166,34 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } return; } - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg1[model_output_index + 4], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg1[model_output_index], + pc.setFull(central_charge * clustererNN.outputDataReg1[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1[model_output_index], clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg1[model_output_index + 1], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1[model_output_index + 1], clustererNN.outputDataReg1[model_output_index + 3], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -204,11 +202,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -235,6 +233,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + ChargePos peak = clusterer.mPfilteredPeakPositions[glo_idx + batchStart]; + float central_charge = static_cast(chargeMap[peak].unpack()); + CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); tpc::ClusterNative* clusterOut = (withMC) ? 
nullptr : clusterer.mPclusterByRow; @@ -247,18 +248,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcollect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); + CPU_ONLY(labelAcc->collect(peak, central_charge)); GPUTPCCFClusterizer::buildCluster( clusterer.Param().rec, chargeMap, - clustererNN.peakPositions[glo_idx], + peak, smem.posBcast, smem.buf, smem.innerAboveThreshold, &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { + if ((clusterer.mPmemory->fragment).isOverlap(peak.time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; } @@ -266,16 +267,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index], + pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index], clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 2], clustererNN.outputDataReg2[model_output_index + 6], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); tpc::ClusterNative myCluster; - bool rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -284,11 +285,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); + CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(clustererNN.centralCharges[glo_idx] * clustererNN.outputDataReg2[model_output_index + 9], - static_cast(clustererNN.peakPositions[glo_idx].pad()) + clustererNN.outputDataReg2[model_output_index + 1], + pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index + 1], clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(clustererNN.peakPositions[glo_idx].time()) + clustererNN.outputDataReg2[model_output_index + 3], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 3], clustererNN.outputDataReg2[model_output_index + 7], clustererNN.clusterFlags[2 * glo_idx], clustererNN.clusterFlags[2 * glo_idx + 1]); - rejectCluster = !pc.toNative(clustererNN.peakPositions[glo_idx], clustererNN.centralCharges[glo_idx], myCluster, clusterer.Param(), chargeMap); + rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -317,11 +318,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(clustererNN.peakPositions[glo_idx].row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? 
How to handle MC labels for split clusters? + // CPU_ONLY(labelAcc->commit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // -> Is this needed? How to handle MC labels for split clusters? } else { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; From 4ef35fc1c25611d79c88ab40cd9b1ffacd4829f5 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 15:10:33 +0200 Subject: [PATCH 18/40] Found the stream allocation issue. Now starting optimizations --- Common/ML/include/ML/OrtInterface.h | 1 + Common/ML/src/OrtInterface.cxx | 5 +++++ GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 9 ++------- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 1 - 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 44c89b748f52c..47e98683c3800 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -55,6 +55,7 @@ class OrtModel void initEnvironment(); bool isInitialized() { return mInitialized; } Ort::SessionOptions& updateSessionOptions(); + Ort::MemoryInfo& updateMemoryInfo(); void setIO(); virtual ~OrtModel() = default; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 3100eb6dd2243..149f86d98eb0e 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -40,6 +40,11 @@ Ort::SessionOptions& OrtModel::updateSessionOptions() return pImplOrt->sessionOptions; } +Ort::MemoryInfo& OrtModel::updateMemoryInfo() +{ + return pImplOrt->memoryInfo; +} + void OrtModel::initOptions(std::unordered_map optionsMap) { pImplOrt = new OrtVariables(); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 959072222125e..844e754ee2f6c 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -697,16 +697,11 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; - LOG(info) << "Creating ROCm provider options"; - // rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - // LOG(info) << "Setting user compute stream"; - // rocm_options.user_compute_stream = &(mInternals->Streams[stream]); - // LOG(info) << "Stream is set with streamId " << stream << " and reference " << &(mInternals->Streams[stream]); + rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); - LOG(info) << "Appending ROCm provider options"; // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); // api.ReleaseROCMProviderOptions(rocm_options); - LOG(info) << "Releasing ROCm provider options"; } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e3088d6143f9b..2905601bd8f28 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -988,7 +988,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - LOG(info) 
<< "ONNX stream set. Device ID is " << deviceId << " for stream " << lane; nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels From 4faaa4a69e0f6cf53b65c09f973490f0089e0fa5 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 1 Apr 2025 20:20:41 +0200 Subject: [PATCH 19/40] Improve readability and adapt for some comments --- Common/ML/include/ML/OrtInterface.h | 57 +++--- Common/ML/src/OrtInterface.cxx | 168 ++++++++---------- .../Base/GPUReconstructionProcessing.h | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 4 +- GPU/GPUTracking/CMakeLists.txt | 25 +-- .../Global/GPUChainTrackingClusterizer.cxx | 122 +++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 47 ++--- .../GPUTPCNNClusterizerHost.h | 4 +- 9 files changed, 197 insertions(+), 233 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 47e98683c3800..56be450fb2ff1 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -41,24 +41,34 @@ class OrtModel { public: - // Constructor + // Constructors & destructors OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { - initOptions(optionsMap); - initEnvironment(); - } + OrtModel(std::unordered_map optionsMap) { init(optionsMap); } void init(std::unordered_map optionsMap) { initOptions(optionsMap); initEnvironment(); } + virtual ~OrtModel() = default; + + // General purpose void initOptions(std::unordered_map optionsMap); void initEnvironment(); + void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } - Ort::SessionOptions& updateSessionOptions(); - Ort::MemoryInfo& updateMemoryInfo(); - void setIO(); + void resetSession(); - virtual ~OrtModel() = default; + // Getters + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + Ort::SessionOptions& getSessionOptions(); + Ort::MemoryInfo& getMemoryInfo(); + + // Setters + void setDeviceId(int32_t id) { deviceId = id; } + void setIO(); + void setActiveThreads(int threads) { intraOpNumThreads = threads; } // Conversion template @@ -66,29 +76,16 @@ class OrtModel // Inferencing template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&, int32_t = -1); - - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&, int32_t = -1); - - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - void inference(I*, size_t, O*, int32_t = -1); - - // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); - - // Reset session - void resetSession(); + std::vector inference(std::vector&); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + template + std::vector inference(std::vector>&); - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + template + void inference(I*, size_t, O*); private: - // ORT variables -> need to be hidden as Pimpl + // ORT variables -> need to be hidden as pImpl struct OrtVariables; OrtVariables* pImplOrt; @@ -99,8 +96,8 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, envName = "", device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string modelPath, envName = "", deviceType = "CPU", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); }; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 149f86d98eb0e..49ca969c811df 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -35,16 +35,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -Ort::SessionOptions& OrtModel::updateSessionOptions() -{ - return pImplOrt->sessionOptions; -} - -Ort::MemoryInfo& OrtModel::updateMemoryInfo() -{ - return pImplOrt->memoryInfo; -} - +// General purpose void OrtModel::initOptions(std::unordered_map optionsMap) { pImplOrt = new OrtVariables(); @@ -56,7 +47,8 @@ void OrtModel::initOptions(std::unordered_map optionsM if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); + deviceType = (optionsMap.contains("device-type") ? optionsMap["device-type"] : "CPU"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : -1); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); interOpNumThreads = (optionsMap.contains("inter-op-num-threads") ? std::stoi(optionsMap["inter-op-num-threads"]) : 0); @@ -65,7 +57,7 @@ void OrtModel::initOptions(std::unordered_map optionsM enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); envName = (optionsMap.contains("onnx-environment-name") ? 
optionsMap["onnx-environment-name"] : "onnx_model_inference"); - if (device == "CPU") { + if (deviceType == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); (pImplOrt->sessionOptions).SetInterOpNumThreads(interOpNumThreads); if (intraOpNumThreads > 1 || interOpNumThreads > 1) { @@ -97,6 +89,8 @@ void OrtModel::initOptions(std::unordered_map optionsM (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + mInitialized = true; } else { LOG(fatal) << "(ORT) Model path cannot be empty!"; } @@ -104,7 +98,9 @@ void OrtModel::initOptions(std::unordered_map optionsM void OrtModel::initEnvironment() { - mInitialized = true; + if(allocateDeviceMemory) { + memoryOnDevice(deviceId); + } pImplOrt->env = std::make_shared( OrtLoggingLevel(loggingLevel), (envName.empty() ? "ORT" : envName.c_str()), @@ -128,32 +124,30 @@ void OrtModel::initEnvironment() (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + } + setIO(); } -void OrtModel::setIO() { - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! 
(input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; +void OrtModel::memoryOnDevice(int32_t deviceIndex) +{ +#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) + if (deviceIndex >= 0) { + std::string dev_mem_str = ""; + if (deviceType == "ROCM") { + dev_mem_str = "Hip"; + } + if (deviceType == "CUDA") { + dev_mem_str = "Cuda"; + } + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); + if (loggingLevel < 2) { + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex; + } } +#endif } void OrtModel::resetSession() @@ -161,6 +155,17 @@ void OrtModel::resetSession() pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } +// Getters +Ort::SessionOptions& OrtModel::getSessionOptions() +{ + return pImplOrt->sessionOptions; +} + +Ort::MemoryInfo& OrtModel::getMemoryInfo() +{ + return pImplOrt->memoryInfo; +} + template std::vector OrtModel::v2v(std::vector& input, bool clearInput) { @@ -176,32 +181,32 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -std::string OrtModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) { - ss << v[i] << "x"; +void OrtModel::setIO() { + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } - ss << v[v.size() - 1]; - return ss.str(); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + } + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); } +// Inference template -std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) +std::vector OrtModel::inference(std::vector& input) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; - } -#endif std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; 
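// [editorial sketch - not part of the patch] The if-constexpr branch that follows picks the
// ONNX Runtime tensor element type from the template parameter I: OrtDataType::Float16_t
// buffers are handed to ORT as fp16 tensors, plain float buffers as fp32. A hypothetical
// caller-side use of this templated API (names illustrative only) would be:
//   std::vector<OrtDataType::Float16_t> in(nCandidates * nInputFeatures);
//   std::vector<float> probs = model.inference<OrtDataType::Float16_t, float>(in);
// Note that input.size() must be a multiple of mInputShapes[0][1], since the batch size in
// inputShape above is derived as input.size() / mInputShapes[0][1].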
std::vector inputTensor; if constexpr (std::is_same_v) { @@ -217,32 +222,19 @@ std::vector OrtModel::inference(std::vector& input, int32_t deviceIndex) return outputValuesVec; } -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&, int32_t); +template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceIndex) +void OrtModel::inference(I* input, size_t input_size, O* output) { // std::vector providers = Ort::GetAvailableProviders(); // for (const auto& provider : providers) { // LOG(info) << "Available Execution Provider: " << provider; // } -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID "<< deviceIndex; - } -#endif std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { @@ -257,26 +249,13 @@ void OrtModel::inference(I* input, size_t input_size, O* output, int32_t deviceI (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*, int32_t); +template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); -template void OrtModel::inference(float*, size_t, float*, int32_t); +template void OrtModel::inference(float*, size_t, float*); template -std::vector OrtModel::inference(std::vector>& input, int32_t deviceIndex) +std::vector OrtModel::inference(std::vector>& input) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) - if (allocateDeviceMemory) { - std::string dev_mem_str = ""; - if (device == "ROCM") { - dev_mem_str = "Hip"; - } - if (device == "CUDA") { - dev_mem_str = "Cuda"; - } - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory for device " << device << " with ID " << deviceIndex; - } -#endif std::vector inputTensor; for (auto i : input) { std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; @@ -294,6 +273,17 @@ std::vector OrtModel::inference(std::vector>& input, int32_t d return outputValuesVec; } +// private +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) { + ss << v[i] << "x"; + } + ss << v[v.size() - 1]; + return ss.str(); +} + } // namespace ml } // namespace o2 diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 
353c4bd76abb9..0e826b8794983 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -92,7 +92,7 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} + // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 844e754ee2f6c..4c3dc12d04568 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -673,7 +673,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); // this implicitly sets "has_user_compute_stream" - UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", &mInternals->Streams[stream]); + UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); session_options.AppendExecutionProvider_CUDA_V2(cuda_options); // Finally, don't forget to release the provider options @@ -694,7 +694,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options { // Create ROCm provider options cudaGetDevice(deviceId); - const auto& api = Ort::GetApi(); + // const auto& api = Ort::GetApi(); // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 186d8ce4b0551..e0f5e3bc37c8f 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -14,17 +14,6 @@ set(MODULE GPUTracking) # set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -O0") # to uncomment if needed, tired of typing this... 
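# [editorial sketch - not part of the patch] Further down in this hunk, the globally scoped
# add_compile_definitions(ORT_*_BUILD=...) calls are removed and the ORT backend flags are
# instead attached only to the O2 GPUTracking target, conceptually along the lines of
# (the generator expression shown here is illustrative, not the patch's exact text):
#   target_compile_definitions(${targetName} PRIVATE ORT_ROCM_BUILD=$<BOOL:${ORT_ROCM_BUILD}>)
# which keeps the ONNX-runtime build flags out of unrelated targets in the same directory scope.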
# set(GPUCA_BUILD_DEBUG 1) -# Pass ORT variables as a preprocessor definition -add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) -add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) -add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) -add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) - -message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") -message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") -message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") -message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_NO_FAST_MATH}) set(CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} ${GPUCA_CXX_NO_FAST_MATH_FLAGS}") if(GPUCA_DETERMINISTIC_MODE GREATER_EQUAL ${GPUCA_DETERMINISTIC_MODE_MAP_OPTO2}) @@ -345,7 +334,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + message("Compile definitions for ONNX runtime:") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + + target_compile_definitions(${targetName} PRIVATE + GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 2905601bd8f28..67ed38ee04aa8 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -613,64 +613,49 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only + GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; + if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; - nnApplication.init(nn_settings); - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); + for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } - for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; + for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + nnApplications[lane].init(nn_settings); + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + + int32_t deviceId = -1; + if (clustererNNShadow.nnClusterizerVerbosity < 3) { + LOG(info) << "Allocating ONNX stream for lane " << lane << " and lane " << lane; + } + if (nnApplications[lane].modelsUsed[0]) { + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_class).setDeviceId(deviceId); + (nnApplications[lane].model_class).initEnvironment(); + } + if (nnApplications[lane].modelsUsed[1]) { + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + (nnApplications[lane].model_reg_1).initEnvironment(); + } + if (nnApplications[lane].modelsUsed[2]) { + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + (nnApplications[lane].model_reg_2).initEnvironment(); + } if (doGPU){ - clustererNNShadow.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNNShadow.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNNShadow.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNNShadow.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNNShadow.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNNShadow.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + (nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNNShadow.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNNShadow.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + clustererNNShadow.deviceId = deviceId; + clustererNNShadow.mISector = lane; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; - clustererNNShadow.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNNShadow.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNNShadow.nnSigmoidTrafoClassThreshold) { - clustererNNShadow.nnClassThreshold = (float)std::log(clustererNNShadow.nnClassThreshold / (1.f - clustererNNShadow.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNNShadow.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; - } - clustererNNShadow.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initModels(); - nnApplication.initClusterizer(nn_settings, clustererNNShadow); + nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } else { - // not sure if this part is needed at all - clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; - clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; - clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; - clustererNN.nnClusterizerSizeInputTime = nn_settings.nnClusterizerSizeInputTime; - clustererNN.nnClusterizerAddIndexData = nn_settings.nnClusterizerAddIndexData; - clustererNN.nnClusterizerElementSize = ((2 * nn_settings.nnClusterizerSizeInputRow + 1) * (2 * nn_settings.nnClusterizerSizeInputPad + 1) * (2 * nn_settings.nnClusterizerSizeInputTime + 1)) + 
(nn_settings.nnClusterizerAddIndexData ? 3 : 0); - clustererNN.nnClusterizerBatchedMode = nn_settings.nnClusterizerBatchedMode; - clustererNN.nnClusterizerBoundaryFillValue = nn_settings.nnClusterizerBoundaryFillValue; + // TODO: not sure if this part is needed at all + clustererNN.deviceId = deviceId; + clustererNN.mISector = lane; clustererNN.nnClusterizerTotalClusters = maxClusters; - clustererNN.nnClassThreshold = nn_settings.nnClassThreshold; - clustererNN.nnSigmoidTrafoClassThreshold = nn_settings.nnSigmoidTrafoClassThreshold; - if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); - } - if (nn_settings.nnClusterizerVerbosity < 0) { - clustererNN.nnClusterizerVerbosity = nn_settings.nnInferenceVerbosity; - } else { - clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; - } - clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; - nnApplication.initModels(); - nnApplication.initClusterizer(nn_settings, clustererNN); + nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -945,32 +930,15 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { #ifdef GPUCA_HAS_ONNX - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[iSector] : clustererNN; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - - int32_t deviceId = -1; - GPUTPCNNClusterizerHost nnApplication(nn_settings); - LOG(info) << "Allocating ONNX stream for lane " << lane << " and sector " << iSector; - if (nnApplication.modelsUsed[0]) { - SetONNXGPUStream((nnApplication.model_class).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_class).initEnvironment(); - } - if (nnApplication.modelsUsed[1]) { - SetONNXGPUStream((nnApplication.model_reg_1).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_reg_1).initEnvironment(); - } - if (nnApplication.modelsUsed[2]) { - SetONNXGPUStream((nnApplication.model_reg_2).updateSessionOptions(), lane, &deviceId); - (nnApplication.model_reg_2).initEnvironment(); - } + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; + GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; int withMC = (doGPU && propagateMCLabels); if (doGPU){ // SetupGPUProcessor(&clustererNN, true); - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer[lane] - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - LOG(info) << "Successfully allocated for stream " << lane << " and sector " << iSector << " with memory size " << sizeof(clustererNN) << " and shadow size " << sizeof(clustererNNShadow); } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { @@ -988,7 +956,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { @@ -996,10 +964,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType, deviceId); + nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } @@ -1008,12 +976,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNNShadow.nnClusterizerUseCfRegression) { + auto start1 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: 
batchStart = 0 + auto stop1 = std::chrono::high_resolution_clock::now(); + time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; if (clustererNNShadow.nnClusterizerVerbosity < 3) { int acceptedClusters = 0; for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0457534b3f903..9bf89e6337c6e 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -57,6 +57,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelReg2NumOutputNodes = -1; int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; + int deviceId = -1; // Memory allocation for neural network float* inputData32 = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index bd17d27edb3c4..0a2c35a6f6623 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -97,7 +97,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set OrtOptions = { {"model-path", class_model_path}, - {"device", settings.nnInferenceDevice}, + {"device-type", settings.nnInferenceDevice}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, @@ -128,36 +128,43 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set } } -void GPUTPCNNClusterizerHost::initModels() { - if (!model_class.isInitialized() && modelsUsed[0]) { - model_class.initEnvironment(); - } - if (!model_reg_1.isInitialized() && modelsUsed[1]) { - model_reg_1.initEnvironment(); +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clustererNN) +{ + clustererNN.nnClusterizerUseCfRegression = settings.nnClusterizerUseCfRegression; + clustererNN.nnClusterizerSizeInputRow = settings.nnClusterizerSizeInputRow; + clustererNN.nnClusterizerSizeInputPad = settings.nnClusterizerSizeInputPad; + clustererNN.nnClusterizerSizeInputTime = settings.nnClusterizerSizeInputTime; + clustererNN.nnClusterizerAddIndexData = settings.nnClusterizerAddIndexData; + clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; + clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; + clustererNN.nnClassThreshold = settings.nnClassThreshold; + clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + if (clustererNN.nnSigmoidTrafoClassThreshold) { + clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); } - if (!model_reg_2.isInitialized() && modelsUsed[2]) { - model_reg_2.initEnvironment(); + if (settings.nnClusterizerVerbosity < 0) { + clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; + } else { + clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; } -} - -void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) -{ - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; - clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + clustererNN.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype, int32_t deviceId) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output, deviceId); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clustererNN.inputData32, size, output, deviceId); + model.inference(clustererNN.inputData32, size, output); } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a383cbfd2bc7f..2c0e704595933 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,14 +41,14 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void initModels(); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t, int32_t); + void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + int32_t deviceId = -1; std::vector reg_model_paths; private: From 2801c2e4a73e2cf85ff88cf51d77816210d73c10 Mon Sep 17 00:00:00 2001 
From: Christian Sonnabend Date: Wed, 2 Apr 2025 11:31:55 +0200 Subject: [PATCH 20/40] Fixing memory assignment issue. Reconstruction runs through with FP32 networks --- .../Global/GPUChainTrackingClusterizer.cxx | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 67ed38ee04aa8..532ea169cd006 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -617,18 +617,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; - for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + int32_t deviceId = -1; + int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + for (uint32_t lane = 0; lane < NSECTORS; lane++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } - for (uint32_t lane = 0; lane < GetProcessingSettings().nTPCClustererLanes; lane++) { + mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; - int32_t deviceId = -1; - if (clustererNNShadow.nnClusterizerVerbosity < 3) { - LOG(info) << "Allocating ONNX stream for lane " << lane << " and lane " << lane; - } if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); @@ -644,21 +642,32 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_2).setDeviceId(deviceId); (nnApplications[lane].model_reg_2).initEnvironment(); } - + if (clustererNNShadow.nnClusterizerVerbosity < 3) { + LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; + } + }); + mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { + GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; + GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[sector] : clustererNN; + int32_t lane = sector % numLanes; if (doGPU){ clustererNNShadow.deviceId = deviceId; - clustererNNShadow.mISector = lane; + clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } else { // TODO: not sure if this part is needed at all clustererNN.deviceId = deviceId; - clustererNN.mISector = lane; + clustererNN.mISector = sector; clustererNN.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); - } + if (doGPU){ + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); + TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); + } + }); } #endif @@ -934,12 +943,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[lane] : clustererNN; GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; + LOG(info) << "clustererNNShadow.inputData32: " << clustererNNShadow.inputData32; + LOG(info) << "clustererShadow.mPclusterInRow: " << clustererShadow.mPclusterInRow; + int withMC = (doGPU && propagateMCLabels); - if (doGPU){ - // SetupGPUProcessor(&clustererNN, true); - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); - TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - } if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); From 1dcb1daf2e670620b43ddb27ddc19efef2c0f5f1 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 2 Apr 2025 15:39:58 +0200 Subject: [PATCH 21/40] Major reworkings to add FP16 support --- Common/ML/include/ML/3rdparty/GPUORTFloat16.h | 2 +- Common/ML/src/OrtInterface.cxx | 11 +- .../Global/GPUChainTrackingClusterizer.cxx | 58 +++++++-- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 35 ++++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 25 ++-- .../GPUTPCNNClusterizerHost.cxx | 10 +- .../GPUTPCNNClusterizerHost.h | 2 - .../GPUTPCNNClusterizerKernels.cxx | 112 ++++++++++++------ 8 files changed, 177 insertions(+), 78 deletions(-) diff --git a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 76fd6734cf9db..9516ba5dad573 100644 --- a/Common/ML/include/ML/3rdparty/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -882,4 +882,4 @@ static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); } // namespace OrtDataType } // namespace o2 -#endif \ No newline at end of file +#endif diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 49ca969c811df..1a729a97c6952 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -244,13 +244,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } std::vector outputShape{input_size, mOutputShapes[0][1]}; - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } else { + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + } (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); } +template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); + template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); +template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); + template void OrtModel::inference(float*, size_t, float*); template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 532ea169cd006..d9bc4ac30190b 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -943,9 +943,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; GPUTPCNNClusterizerHost& nnApplication = nnApplications[lane]; - LOG(info) << "clustererNNShadow.inputData32: " << clustererNNShadow.inputData32; - LOG(info) << "clustererShadow.mPclusterInRow: " << clustererShadow.mPclusterInRow; - int withMC = (doGPU && propagateMCLabels); if (clustererNNShadow.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { @@ -963,19 +960,58 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); + + // nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); + } + } if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } - if (!clustererNNShadow.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 + // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + 
(nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg1_32); + } + } + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 + // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); + if (clustererNNShadow.nnInferenceInputDType == 0) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_32); + } + } else if (clustererNNShadow.nnInferenceInputDType == 1) { + if (clustererNNShadow.nnInferenceOutputDType == 0) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_16); + } else if (clustererNNShadow.nnInferenceOutputDType == 1) { + (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); + } + } + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 208e8c6428cb5..4ae5e0d9b49a7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -26,20 +26,35 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + computePointerWithAlignment(mem, inputData_16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); + computePointerWithAlignment(mem, inputData_32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * 
nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + + if (nnInferenceOutputDType == 0 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_16, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_16, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_16, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } else if (nnInferenceOutputDType == 1 && nnClusterizerElementSize > 0) { + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities_32, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); + } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1_32, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2_32, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } } } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 9bf89e6337c6e..70c9e9c20d18b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -56,20 +56,27 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceOutputDType = 0; // 0: float16, 1: float32 int mISector = -1; int deviceId = -1; // Memory allocation for neural network - float* inputData32 = nullptr; - OrtDataType::Float16_t* inputData16 = nullptr; - float* outputDataClass = nullptr; - float* modelProbabilities = nullptr; - float* outputDataReg1 = nullptr; - float* outputDataReg2 = nullptr; - ChargePos* peakPositions = nullptr; - bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptrx - float* centralCharges = nullptr; + bool* clusterFlags = nullptr; // mSplitInTime, mSplitInPad. 
Techincally both flags are set in the same way -> ClusterAccumulator.cx=nullptr + int* outputDataClass = nullptr; + + // FP32 + float* inputData_32 = nullptr; + float* modelProbabilities_32 = nullptr; + float* outputDataReg1_32 = nullptr; + float* outputDataReg2_32 = nullptr; + + // FP16 + OrtDataType::Float16_t* inputData_16 = nullptr; + OrtDataType::Float16_t* modelProbabilities_16 = nullptr; + OrtDataType::Float16_t* outputDataReg1_16 = nullptr; + OrtDataType::Float16_t* outputDataReg2_16 = nullptr; + int16_t mMemoryId = -1; }; // class GPUTPCNNClusterizer diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 0a2c35a6f6623..4372fea7ed9e5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -149,6 +149,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.nnClusterizerVerbosity = settings.nnClusterizerVerbosity; } clustererNN.nnInferenceInputDType = settings.nnInferenceInputDType.find("32") != std::string::npos; + clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { @@ -159,12 +160,3 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } } - -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) -{ - if (dtype == 0) { - model.inference(clustererNN.inputData16, size, output); - } else { - model.inference(clustererNN.inputData32, size, output); - } -} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 2c0e704595933..87532deff9917 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -43,8 +43,6 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); - void networkInference(o2::ml::OrtModel, GPUTPCNNClusterizer&, size_t, float*, int32_t); - std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index ef75e1c1af19e..70d605ac72fc7 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -81,16 +81,16 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[tmp_pos].unpack()) / central_charge); - } else { - clustererNN.inputData32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[write_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { // Filling boundary just to make sure that no values are left unintentionally if (dtype == 0) { - 
clustererNN.inputData16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[write_idx] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } write_idx++; @@ -99,13 +99,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.inputData_16[write_idx] = (OrtDataType::Float16_t)(clusterer.mISector / 36.f); + clustererNN.inputData_16[write_idx + 1] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData32[write_idx] = clusterer.mISector / 36.f; - clustererNN.inputData32[write_idx + 1] = row / 152.f; - clustererNN.inputData32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.inputData_32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData_32[write_idx + 1] = row / 152.f; + clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } } @@ -114,7 +114,11 @@ template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + if (dtype == 0) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)((processors.tpcNNClusterer[sector].modelProbabilities_16[glo_idx]).ToFloat() > processors.tpcNNClusterer[sector].nnClassThreshold); + } else if (dtype == 1) { + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities_32[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); + } } template <> @@ -127,9 +131,17 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(clustererNN.modelProbabilities_16[pIdx]); + } else if (dtype == 1) { + current_max_prob = clustererNN.modelProbabilities_32[pIdx]; + } } else { - class_label = (clustererNN.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + if (dtype == 0) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_16[pIdx].ToFloat()); + } else if (dtype == 1) { + current_max_prob = CAMath::Max(current_max_prob, clustererNN.modelProbabilities_32[pIdx]); + } } } // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" @@ -184,13 +196,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg1[model_output_index], - clustererNN.outputDataReg1[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1[model_output_index + 1], - clustererNN.outputDataReg1[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg1_16[model_output_index + 4].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); @@ -267,13 +289,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2[model_output_index], - clustererNN.outputDataReg2[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 2], - clustererNN.outputDataReg2[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 8].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } tpc::ClusterNative myCluster; bool rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); @@ -302,13 +334,23 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcommit(peak.row(), rowIndex, clusterer.mNMaxClusterPerRow)); // Cluster 2 - pc.setFull(central_charge * clustererNN.outputDataReg2[model_output_index + 9], 
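// Note for readers (not part of the patch): the per-peak layout of the class-2 regression
// output implied by the indices used in these setFull(...) calls appears to be ten values,
// interleaved for the two split clusters:
//   [0]/[1] pad offset cluster 1/2, [2]/[3] time offset cluster 1/2,
//   [4]/[5] sigma_pad cluster 1/2,  [6]/[7] sigma_time cluster 1/2,
//   [8]/[9] charge scale cluster 1/2 (multiplied onto central_charge).
// This reading is inferred from the code above and below; the network output contract itself
// is not documented in this patch series.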
- static_cast(peak.pad()) + clustererNN.outputDataReg2[model_output_index + 1], - clustererNN.outputDataReg2[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2[model_output_index + 3], - clustererNN.outputDataReg2[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + if (dtype == 0) { + pc.setFull(central_charge * clustererNN.outputDataReg2_16[model_output_index + 9].ToFloat(), + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } else if (dtype == 1) { + pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); + } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); if (rejectCluster) { From 381955a57f5ce34f3318eb974263f9d0a7e6f7c3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 15:17:13 +0200 Subject: [PATCH 22/40] Bug-fixes --- Common/ML/src/OrtInterface.cxx | 4 +- .../Global/GPUChainTrackingClusterizer.cxx | 40 ++++++-------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 52 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 -- .../GPUTPCNNClusterizerHost.cxx | 7 +-- 5 files changed, 49 insertions(+), 57 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 1a729a97c6952..4e41fec0c8ca9 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -246,9 +246,9 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } else { - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index d9bc4ac30190b..e313f30c07656 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ 
b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -611,6 +611,14 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) RunTPCClusterizer_prepare(true); // Restore some pointers, allocated by the other pipeline, and set to 0 by SetupGPUProcessor (since not allocated in this pipeline) } + if (doGPU && mIOPtrs.tpcZS) { + processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); + } + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } + #ifdef GPUCA_HAS_ONNX const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; @@ -624,9 +632,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); - GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[lane]; - GPUTPCNNClusterizer& clustererNNShadow = doGPU ? processorsShadow()->tpcNNClusterer[lane] : clustererNN; - if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); @@ -642,7 +647,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_2).setDeviceId(deviceId); (nnApplications[lane].model_reg_2).initEnvironment(); } - if (clustererNNShadow.nnClusterizerVerbosity < 3) { + if (nn_settings.nnClusterizerVerbosity < 3) { LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; } }); @@ -650,35 +655,24 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[sector]; GPUTPCNNClusterizer& clustererNNShadow = doGPU ? 
processorsShadow()->tpcNNClusterer[sector] : clustererNN; int32_t lane = sector % numLanes; + clustererNN.deviceId = deviceId; + clustererNN.mISector = sector; + clustererNN.nnClusterizerTotalClusters = maxClusters; + nnApplications[lane].initClusterizer(nn_settings, clustererNN); if (doGPU){ clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = maxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); - } else { - // TODO: not sure if this part is needed at all - clustererNN.deviceId = deviceId; - clustererNN.mISector = sector; - clustererNN.nnClusterizerTotalClusters = maxClusters; - nnApplications[lane].initClusterizer(nn_settings, clustererNN); } AllocateRegisteredMemory(clustererNN.mMemoryId); - if (doGPU){ - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&clustererNN - (char*)processors(), &clustererNNShadow, sizeof(clustererNN), lane); - TransferMemoryResourcesToGPU(RecoStep::TPCClusterFinding, &clustererNNShadow, lane); - } }); + if (doGPU){ + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer)*NSECTORS, mRec->NStreams() - 1, &mEvents->init); + } } #endif - if (doGPU && mIOPtrs.tpcZS) { - processorsShadow()->ioPtrs.tpcZS = mInputsShadow->mPzsMeta; - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), mRec->NStreams() - 1); - } - if (doGPU) { - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)processors()->tpcClusterer - (char*)processors(), processorsShadow()->tpcClusterer, sizeof(GPUTPCClusterFinder) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); - } - size_t nClsTotal = 0; ClusterNativeAccess* tmpNativeAccess = mClusterNativeAccess.get(); ClusterNative* tmpNativeClusters = nullptr; @@ -961,7 +955,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - // nnApplication.networkInference(nnApplication.model_class, clustererNNShadow, iSize, clustererNNShadow.modelProbabilities, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); @@ -975,6 +968,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); } } + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 4ae5e0d9b49a7..fe3202fe7b439 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -64,32 +64,32 @@ void* GPUTPCNNClusterizer::setIOPointers(void* mem) return mem; } -std::vector GPUTPCNNClusterizer::pointerSizes() { - std::vector sizes(7, -1); - if (nnClusterizerBatchedMode > 0) { - if (nnInferenceInputDType == 0 && 
nnClusterizerElementSize > 0) { - sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 - } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { - sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 - } - sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags - if (nnClusterizerModelClassNumOutputNodes > 0) { - sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 - } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 - } - } - } - if (nnClusterizerTotalClusters > 0) { - sizes[6] = nnClusterizerTotalClusters; // outputDataClass - } - return sizes; -} +// std::vector GPUTPCNNClusterizer::pointerSizes() { +// std::vector sizes(7, -1); +// if (nnClusterizerBatchedMode > 0) { +// if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { +// sizes[0] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData16 +// } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { +// sizes[1] = nnClusterizerBatchedMode * nnClusterizerElementSize; // inputData32 +// } +// sizes[2] = 2 * nnClusterizerBatchedMode; // clusterFlags +// if (nnClusterizerModelClassNumOutputNodes > 0) { +// sizes[3] = nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes; // modelProbabilities +// } +// if (!nnClusterizerUseCfRegression) { +// if (nnClusterizerModelReg1NumOutputNodes > 0) { +// sizes[4] = nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes; // outputDataReg1 +// } +// if (nnClusterizerModelReg2NumOutputNodes > 0) { +// sizes[5] = nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes; // outputDataReg2 +// } +// } +// } +// if (nnClusterizerTotalClusters > 0) { +// sizes[6] = nnClusterizerTotalClusters; // outputDataClass +// } +// return sizes; +// } void GPUTPCNNClusterizer::RegisterMemoryAllocation() { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 70c9e9c20d18b..e9b2061bea36a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -34,7 +34,6 @@ class GPUTPCNNClusterizer : public GPUProcessor void RegisterMemoryAllocation(); void InitializeProcessor(); void SetMaxData(const GPUTrackingInOutPointers&); - std::vector pointerSizes(); // Neural network clusterization @@ -50,8 +49,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerTotalClusters = 1; int nnClusterizerVerbosity = 0; int nnClusterizerBoundaryFillValue = -1; - int nnClusterizerDumpDigits = 0; - int nnClusterizerApplyCfDeconvolution = 0; int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 4372fea7ed9e5..29fdaada06855 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -138,10 +138,11 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust 
clustererNN.nnClusterizerElementSize = ((2 * settings.nnClusterizerSizeInputRow + 1) * (2 * settings.nnClusterizerSizeInputPad + 1) * (2 * settings.nnClusterizerSizeInputTime + 1)) + (settings.nnClusterizerAddIndexData ? 3 : 0); clustererNN.nnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.nnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; - clustererNN.nnClassThreshold = settings.nnClassThreshold; clustererNN.nnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; if (clustererNN.nnSigmoidTrafoClassThreshold) { - clustererNN.nnClassThreshold = (float)std::log(clustererNN.nnClassThreshold / (1.f - clustererNN.nnClassThreshold)); + clustererNN.nnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); + } else { + clustererNN.nnClassThreshold = settings.nnClassThreshold; } if (settings.nnClusterizerVerbosity < 0) { clustererNN.nnClusterizerVerbosity = settings.nnInferenceVerbosity; @@ -152,7 +153,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.nnInferenceOutputDType = settings.nnInferenceOutputDType.find("32") != std::string::npos; clustererNN.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || model_reg_2.isInitialized()) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clustererNN.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; From 19b5bd596ce191f177884cbe880029c8f9fe65a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 21:12:02 +0200 Subject: [PATCH 23/40] Improved data filling speeds by factor 3 --- .../Definitions/GPUDefParametersDefault.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 4 +- .../GPUTPCNNClusterizerKernels.cxx | 74 ++++++++++++++++++- .../GPUTPCNNClusterizerKernels.h | 9 ++- GPU/GPUTracking/kernels.cmake | 1 + 5 files changed, 80 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h index 4435e69c60ff6..8249bbb70832d 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefault.h @@ -500,6 +500,7 @@ #ifdef GPUCA_HAS_ONNX #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index e313f30c07656..0077cc3cda9aa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -950,9 +950,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = 
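// Note for readers (not part of the patch): on the nnSigmoidTrafoClassThreshold handling in
// initClusterizer above, the configured probability cut p is mapped to its logit
// log(p / (1 - p)), so the kernels can compare the network's raw (pre-sigmoid) output against
// the transformed threshold instead of evaluating a sigmoid per cluster candidate.
// Small worked example: p = 0.5 maps to 0.0 and p = 0.9 maps to log(9), roughly 2.197; a raw
// output of 2.5 would then pass the p = 0.9 cut, while a raw output of 1.0 would not.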
CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - + runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); + auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNNShadow.nnInferenceInputDType == 0) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 70d605ac72fc7..202860733b839 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -77,7 +77,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row)); } else { - clustererNN.inputData_32[write_idx] = clusterer.mISector / 36.f; + clustererNN.inputData_32[write_idx] = sector / 36.f; clustererNN.inputData_32[write_idx + 1] = row / 152.f; clustererNN.inputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); } } } +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + uint base_idx = CAMath::Floor(glo_idx / clustererNN.nnClusterizerElementSize); + uint transient_index = glo_idx % clustererNN.nnClusterizerElementSize; + + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + Array2D isPeakMap(clusterer.mPpeakMap); + ChargePos peak = clusterer.mPfilteredPeakPositions[base_idx + batchStart]; + int row = static_cast(peak.row()), pad = static_cast(peak.pad()); + + if (clustererNN.nnClusterizerAddIndexData && transient_index == 0) { + uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; + for (uint16_t i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + ChargePos tmp_pos = peak.delta(d); + clustererNN.clusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]); + clustererNN.clusterFlags[2 * glo_idx + 1] = clustererNN.clusterFlags[2 * glo_idx]; + } + if (dtype == 0) { + clustererNN.inputData_16[top_idx - 3] = (OrtDataType::Float16_t)(sector / 36.f); + clustererNN.inputData_16[top_idx - 2] = (OrtDataType::Float16_t)(row / 152.f); + clustererNN.inputData_16[top_idx - 1] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + } else { + clustererNN.inputData_32[top_idx - 3] = sector / 36.f; + clustererNN.inputData_32[top_idx - 2] = row / 152.f; + clustererNN.inputData_32[top_idx - 1] = static_cast(pad) / GPUTPCGeometry::NPads(row); + } + } else { + int time = static_cast(peak.time()); + int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; + bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); + if (is_row_boundary) { + if (dtype == 0) { + 
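// Note for readers (not part of the patch): index bookkeeping of fillInputNNSingleElement.
// One GPU thread now fills a single element of one input vector instead of a whole vector,
// i.e. glo_idx runs over iSize * nnClusterizerElementSize threads, with
//   base_idx        = glo_idx / nnClusterizerElementSize  -> which peak,
//   transient_index = glo_idx % nnClusterizerElementSize  -> which entry of that peak's input.
// The charge window is linearised in (row, pad, time) order. For half-window sizes
// (R, P, T) = (1, 1, 1), the window holds 27 charges (+3 index entries if
// nnClusterizerAddIndexData), and e.g. transient_index = 16 decodes to
// r = 16/9 - 1 = 0, p = (16%9)/3 - 1 = +1, t = (16%9)%3 - 1 = 0.
// The (1, 1, 1) values are only an illustration, not the configured defaults.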
clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } else { + int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); + int pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); + int rest_1 = transient_index % ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1)); + int p = CAMath::Floor(rest_1 / (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputPad + pad_offset; + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.nnClusterizerSizeInputRow); + + if (!is_boundary) { + float central_charge = static_cast(chargeMap[peak].unpack()); + int t = (rest_1 % (2 * clustererNN.nnClusterizerSizeInputTime + 1)) - clustererNN.nnClusterizerSizeInputTime; + ChargePos tmp_pos(row + r, pad + p, time + t); + if (dtype == 0) { + clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + } else if (dtype == 1) { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + } + } else { + if (dtype == 0) { + clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + } else { + clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + } + } + } + } +} + template <> GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index e6c1dc508d6e4..0e15913dcee0c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -53,10 +53,11 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate enum K : int32_t { runCfClusterizer = 0, fillInputNN = 1, - determineClass1Labels = 2, - determineClass2Labels = 3, - publishClass1Regression = 4, - publishClass2Regression = 5, + fillInputNNSingleElement = 2, + determineClass1Labels = 3, + determineClass2Labels = 4, + publishClass1Regression = 5, + publishClass2Regression = 6, }; template diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 994f10a516b10..3e435fe7e74bc 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -116,6 +116,7 @@ o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUS if(NOT ALIGPU_BUILD_TYPE STREQUAL "Standalone") o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) 
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) From 83d02579b0f1804dc1084a3c51c07e2fc85a3fcc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 21:49:53 +0200 Subject: [PATCH 24/40] Limiting threads for ONNX evaluation --- Common/ML/CMakeLists.txt | 13 +++++++------ Common/ML/include/ML/OrtInterface.h | 4 ++++ .../Global/GPUChainTrackingClusterizer.cxx | 10 ++++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 5bfa05b716123..7e2107651cf10 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -9,13 +9,14 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. -# Pass ORT variables as a preprocessor definition -add_compile_definitions(ORT_ROCM_BUILD=${ORT_ROCM_BUILD}) -add_compile_definitions(ORT_CUDA_BUILD=${ORT_CUDA_BUILD}) -add_compile_definitions(ORT_MIGRAPHX_BUILD=${ORT_MIGRAPHX_BUILD}) -add_compile_definitions(ORT_TENSORRT_BUILD=${ORT_TENSORRT_BUILD}) - o2_add_library(ML SOURCES src/OrtInterface.cxx TARGETVARNAME targetName PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) + +# Pass ORT variables as a preprocessor definition +target_compile_definitions(${targetName} PRIVATE + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 56be450fb2ff1..bd81ecca109c9 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -64,11 +64,15 @@ class OrtModel std::vector getOutputNames() const { return mOutputNames; } Ort::SessionOptions& getSessionOptions(); Ort::MemoryInfo& getMemoryInfo(); + int32_t getIntraOpNumThreads() const { return intraOpNumThreads; } + int32_t getInterOpNumThreads() const { return interOpNumThreads; } // Setters void setDeviceId(int32_t id) { deviceId = id; } void setIO(); void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setIntraOpNumThreads(int threads) { if(deviceType == "CPU") { intraOpNumThreads = threads; } } + void setInterOpNumThreads(int threads) { if(deviceType == "CPU") { interOpNumThreads = threads; } } // Conversion template diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 0077cc3cda9aa..621952b1ac654 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -627,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) uint32_t maxClusters = 0; int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; + int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; for (uint32_t lane = 0; lane < NSECTORS; lane++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); } @@ -635,16 +636,25 
@@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (nnApplications[lane].modelsUsed[0]) { SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); + if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); + if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { + nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); + } (nnApplications[lane].model_reg_2).initEnvironment(); } if (nn_settings.nnClusterizerVerbosity < 3) { From fff6dc39c42f72afeed643ffe11ec1e1a55052f4 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Apr 2025 23:42:20 +0200 Subject: [PATCH 25/40] Bug-fix for correct thread assignment and input data filling --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 202860733b839..d1be1d00027e2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -124,7 +124,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.row()), pad = static_cast(peak.pad()); - if (clustererNN.nnClusterizerAddIndexData && transient_index == 0) { + if (clustererNN.nnClusterizerAddIndexData && transient_index == (clustererNN.nnClusterizerElementSize - 1)) { uint top_idx = (base_idx + 1) * clustererNN.nnClusterizerElementSize; for (uint16_t i = 0; i < 8; i++) { Delta2 d = cfconsts::InnerNeighbors[i]; @@ -141,7 +141,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(pad) / GPUTPCGeometry::NPads(row); } - } else { + } else if (transient_index < (clustererNN.nnClusterizerElementSize - 3)) { int time = static_cast(peak.time()); int r = CAMath::Floor(transient_index / ((2 * clustererNN.nnClusterizerSizeInputPad + 1) * (2 * clustererNN.nnClusterizerSizeInputTime + 1))) - clustererNN.nnClusterizerSizeInputRow; bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); From b437e38aa18b46c54432a9560002abc98eb22869 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 4 Apr 2025 08:59:57 +0200 Subject: [PATCH 26/40] Minor changes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 621952b1ac654..b03a27867f8fa 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -624,13 +624,9 @@ int32_t 
GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplications[GetProcessingSettings().nTPCClustererLanes]; if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = 0; int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; - for (uint32_t lane = 0; lane < NSECTORS; lane++) { - maxClusters = std::max(maxClusters, processors()->tpcClusterer[lane].mNMaxClusters); - } mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { @@ -667,12 +663,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int32_t lane = sector % numLanes; clustererNN.deviceId = deviceId; clustererNN.mISector = sector; - clustererNN.nnClusterizerTotalClusters = maxClusters; + clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); if (doGPU){ clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; - clustererNNShadow.nnClusterizerTotalClusters = maxClusters; + clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNNShadow); } AllocateRegisteredMemory(clustererNN.mMemoryId); @@ -1034,7 +1030,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { acceptedClusters += clustererNNShadow.outputDataClass[i]; } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); From 710993a7df62a37e0d73c2ec484caa872a39aa76 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 5 Apr 2025 13:55:40 +0200 Subject: [PATCH 27/40] Adding I** inference, potentally needed for CNN + FC inference --- Common/ML/include/ML/OrtInterface.h | 8 +- Common/ML/src/OrtInterface.cxx | 173 ++++++++++++++++++++++++---- 2 files changed, 158 insertions(+), 23 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index bd81ecca109c9..d496de866da7f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -88,6 +88,9 @@ class OrtModel template void inference(I*, size_t, O*); + template + void inference(I**, size_t, O*); + private: // ORT variables -> need to be hidden as pImpl struct OrtVariables; @@ -96,7 +99,9 @@ class OrtModel // Input & Output specifications of the loaded network std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; + std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; @@ -104,6 +109,7 @@ class OrtModel int32_t intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = -1, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); + std::string printShape(const std::vector>&, std::vector&); }; } // namespace ml diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 4e41fec0c8ca9..e525fc1d2709f 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -124,11 +124,11 @@ void OrtModel::initEnvironment() (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + setIO(); + if (loggingLevel < 2) { - LOG(info) << "(ORT) Model loaded successfully! (input: " << printShape(mInputShapes[0]) << ", output: " << printShape(mOutputShapes[0]) << ")"; + LOG(info) << "(ORT) Model loaded successfully! 
(inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; } - - setIO(); } void OrtModel::memoryOnDevice(int32_t deviceIndex) @@ -201,13 +201,45 @@ void OrtModel::setIO() { outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), [&](const std::string& str) { return str.c_str(); }); + + inputShapesCopy = mInputShapes; + outputShapesCopy = mOutputShapes; + inputSizePerNode.resize(mInputShapes.size(), 1); + outputSizePerNode.resize(mOutputShapes.size(), 1); + mInputsTotal = 1; + for (size_t i = 0; i < mInputShapes.size(); ++i) { + if(mInputShapes[i].size() > 0) { + for (size_t j = 1; j < mInputShapes[i].size(); ++j) { + if (mInputShapes[i][j] > 0) { + mInputsTotal *= mInputShapes[i][j]; + inputSizePerNode[i] *= mInputShapes[i][j]; + } + } + } + } + mOutputsTotal = 1; + for (size_t i = 0; i < mOutputShapes.size(); ++i) { + if(mOutputShapes[i].size() > 0) { + for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { + if (mOutputShapes[i][j] > 0) { + mOutputsTotal *= mOutputShapes[i][j]; + outputSizePerNode[i] *= mOutputShapes[i][j]; + } + } + } + } } // Inference template std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape = mInputShapes[0]; + inputShape[0] = input.size(); + for (size_t i = 1; i < mInputShapes[0].size(); ++i) + { + inputShape[0] /= mInputShapes[0][i]; + } std::vector inputTensor; if constexpr (std::is_same_v) { inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -223,9 +255,7 @@ std::vector OrtModel::inference(std::vector& input) } template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); - template std::vector OrtModel::inference(std::vector&); template @@ -255,33 +285,119 @@ void OrtModel::inference(I* input, size_t input_size, O* output) } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); - template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); - template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); - template void OrtModel::inference(float*, size_t, float*); template -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; +void OrtModel::inference(I** input, size_t input_size, O* output) { + std::vector inputTensors(inputShapesCopy.size()); + + for (size_t i = 0; i < inputShapesCopy.size(); ++i) { + + inputShapesCopy[i][0] = input_size; // batch-size + outputShapesCopy[i][0] = input_size; // batch-size + if constexpr (std::is_same_v) { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i]*input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } else { - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, i.data(), i.size(), inputShape.data(), inputShape.size())); + inputTensors[i] = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + input[i], + 
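// Note for readers (not part of the patch): the new I** overload binds one host buffer per
// input node and a single buffer for the first output node; dimension 0 of every copied shape
// is overwritten with the batch size passed as input_size, and the output tensor size assumes
// exactly one output node. A hypothetical call for a two-input, one-output model (the buffer
// names below are illustrative only, not existing variables):
//   std::vector<float*> nodeBuffers{denseFeatures, cnnPatch}; // one pointer per input node
//   model.inference<float, float>(nodeBuffers.data(), batchSize, outputBuffer);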
inputSizePerNode[i]*input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; + + Ort::Value outputTensor = Ort::Value(nullptr); + if constexpr (std::is_same_v) { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(output), + outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } else { + outputTensor = Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + output, + outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputShapesCopy[0].data(), + outputShapesCopy[0].size()); + } + + // === Run inference === + pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + inputTensors.data(), + inputNamesChar.size(), + outputNamesChar.data(), + &outputTensor, + outputNamesChar.size() + ); +} + +template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, size_t, float*); +template void OrtModel::inference(float**, size_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, size_t, float*); + +template +std::vector OrtModel::inference(std::vector>& inputs) +{ + std::vector input_tensors; + + for (size_t i = 0; i < inputs.size(); ++i) { + + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size + + if constexpr (std::is_same_v) { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i]*inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } else { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i]*inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } + } + + int32_t totalOutputSize = mOutputsTotal*inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); + + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; } +template std::vector OrtModel::inference(std::vector>&); +template std::vector OrtModel::inference(std::vector>&); + // private std::string OrtModel::printShape(const std::vector& v) { @@ -293,6 +409,19 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } +std::string OrtModel::printShape(const std::vector>& v, std::vector& n) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size(); i++) { + ss << n[i] << " -> ("; + for (size_t j = 0; j < v[i].size() - 1; j++) { + ss << v[i][j] << "x"; + } + ss << v[i][v[i].size() - 1] << "); "; + } + return ss.str(); +} + } // namespace ml } // namespace o2 From 
77c1691202239c3f0a60e7dd930aa8cacd0dc760 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 7 Apr 2025 21:04:23 +0200 Subject: [PATCH 28/40] CCDB fetching of NNs ported to GPUWorkflowSpec --- Detectors/TPC/calibration/CMakeLists.txt | 2 + .../TPCCalibration/NeuralNetworkClusterizer.h | 39 ++++++++++ .../src/NeuralNetworkClusterizer.cxx | 47 ++++++++++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 75 +++---------------- .../GPUTPCNNClusterizerHost.h | 4 - .../include/GPUWorkflow/GPUWorkflowSpec.h | 3 + GPU/Workflow/src/GPUWorkflowSpec.cxx | 45 +++++++++++ 8 files changed, 148 insertions(+), 70 deletions(-) create mode 100644 Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h create mode 100644 Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 0ec62e5f323b3..7722fc4e2884f 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,6 +25,7 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx + src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -82,6 +83,7 @@ o2_target_root_dictionary(TPCCalibration include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h + include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h new file mode 100644 index 0000000000000..e4fcfa56df438 --- /dev/null +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -0,0 +1,39 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.h +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h +#define AliceO2_TPC_NeuralNetworkClusterizer_h + +#include "CCDB/CcdbApi.h" + +namespace o2::tpc +{ + +class NeuralNetworkClusterizer +{ + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; + +}; + +} // namespace o2::tpc +#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx new file mode 100644 index 0000000000000..8a2e739b772fb --- /dev/null +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -0,0 +1,47 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file NeuralNetworkClusterizer.cxx +/// \brief Fetching neural networks for clusterization from CCDB +/// \author Christian Sonnabend + +#include +#include "TPCCalibration/NeuralNetworkClusterizer.h" + +using namespace o2::tpc; + +void NeuralNetworkClusterizer::initCcdbApi(std::string url) { + ccdbApi.init(url); +} + +void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) +{ + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 78c77c37dd511..a978e657e76b7 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -268,11 +268,12 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. 
c1:r1 (classification class 1, regression class 1)") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") -AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 29fdaada06855..5125d7a3fd364 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,82 +16,27 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" -#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" using namespace o2::gpu; -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) -{ - o2::ccdb::CcdbApi ccdbApi; - ccdbApi.init(settings["nnCCDBURL"]); - - metadata["inputDType"] = settings["inputDType"]; - metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN - if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; - } - if (settings["nnCCDBBeamType"] != "") { - metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; - } - - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables - - if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; - } else { - LOG(error) << "Failed to retrieve network from CCDB"; - } -} - void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - - if (settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", settings.nnCCDBURL}, - {"nnCCDBPath", settings.nnCCDBPath}, - {"inputDType", settings.nnInferenceInputDType}, - {"outputDType", settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; - - 
std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - loadFromCCDB(networkRetrieval); + std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); + + if(settings.nnLoadFromCCDB) { + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + if (evalMode[0] == "c1") { + class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; + } else if (evalMode[0] == "c2") { + class_model_path = settings.nnLocalFolder + "/net_classification_c2.onnx"; } - class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - loadFromCCDB(networkRetrieval); - reg_model_path = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - loadFromCCDB(networkRetrieval); - reg_model_path += ":", networkRetrieval["outputFile"]; + if (evalMode[1] == "r2") { + reg_model_path += ":" + settings.nnLocalFolder + "/net_regression_c2.onnx"; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 87532deff9917..8001ecc96fcfd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -48,10 +48,6 @@ class GPUTPCNNClusterizerHost std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; - - private: - std::map metadata; - std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index eda3b28c6cff6..6d471da2879cb 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -83,6 +83,7 @@ class GPUO2Interface; struct TPCPadGainCalib; struct TPCZSLinkMapping; struct GPUSettingsO2; +struct GPUSettingsProcessingNNclusterizer; class GPUO2InterfaceQA; struct GPUTrackingInOutPointers; struct GPUTrackingInOutZS; @@ -224,6 +225,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; + std::unique_ptr mNNClusterizerSettings; + Config mSpecConfig; std::shared_ptr mGGR; bool mGRPGeomUpdated = false; diff --git 
a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index aa4f3cfca1289..f2cc2806115fb 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,6 +78,7 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" +#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -118,6 +119,7 @@ GPURecoWorkflowSpec::GPURecoWorkflowSpec(GPURecoWorkflowSpec::CompletionPolicyDa mConfig.reset(new GPUO2InterfaceConfiguration); mConfParam.reset(new GPUSettingsO2); mTFSettings.reset(new GPUSettingsTF); + mNNClusterizerSettings.reset(new GPUSettingsProcessingNNclusterizer); mTimer.reset(new TStopwatch); mPipeline.reset(new GPURecoWorkflowSpec_PipelineInternals); @@ -133,6 +135,49 @@ void GPURecoWorkflowSpec::init(InitContext& ic) GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + if (mNNClusterizerSettings->nnLoadFromCCDB){ + LOG(info) << "Loading neural networks from CCDB"; + o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); + std::map ccdbSettings = { + {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); + + if (evalMode[0] == "c1") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c1"; + ccdbSettings["outputFile"] = "net_classification_c1.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } else if (evalMode[0] == "c2") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBEvalType"] = "classification_c2"; + ccdbSettings["outputFile"] = "net_classification_c2.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } + + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBEvalType"] = "regression_c1"; + ccdbSettings["outputFile"] = "net_regression_c1.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + if (evalMode[1] == "r2") { + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBEvalType"] = "regression_c2"; + ccdbSettings["outputFile"] = "net_regression_c2.onnx"; + nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); + } + LOG(info) << "Neural network loading done!"; + } + // Create configuration object and fill settings mConfig->configGRP.solenoidBzNominalGPU = 0; mTFSettings->hasSimStartOrbit = 1; From a9857986166b76fd898d845e9b42a20c98eee368 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 10 Apr 2025 10:22:30 +0200 Subject: [PATCH 29/40] Adjusting CPU threads and ORT copmile definitions --- 
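Note on this patch: the ORT_*_BUILD flags are now set directly on the CUDA and HIP backend targets (in addition to GPUCA_HAS_ONNX), so the backend sources see the same ONNX Runtime execution-provider selection as Common/ML; the intra-op thread cap for the ORT sessions is also taken from mRec->getNKernelHostThreads(true) instead of the earlier nTPCdigits-based heuristic. Below is a minimal sketch of how such compile definitions are typically consumed on the C++ side; ortDeviceAllocatorName() is a hypothetical helper for illustration only (the "Hip" name is taken from OrtModel::memoryOnDevice, "Cuda" is assumed for the CUDA provider) and is not part of this patch.

// Sketch only: preprocessor pattern for the ORT_*_BUILD compile definitions.
// The real code in OrtInterface.cxx guards the whole device-memory path with
// one combined #if over ORT_ROCM_BUILD / ORT_MIGRAPHX_BUILD / ORT_CUDA_BUILD.
static const char* ortDeviceAllocatorName()
{
#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1
  return "Cuda"; // assumed allocator name for the CUDA execution provider
#elif (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1)
  return "Hip"; // matches the device memory name used in OrtModel::memoryOnDevice
#else
  return "Cpu"; // CPU-only build: keep the default host Ort::MemoryInfo
#endif
}
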
GPU/GPUTracking/Base/cuda/CMakeLists.txt | 13 +++++++++++++ GPU/GPUTracking/Base/hip/CMakeLists.txt | 13 +++++++++++++ GPU/GPUTracking/CMakeLists.txt | 13 +------------ .../Global/GPUChainTrackingClusterizer.cxx | 2 +- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 6c88c69cbe35f..2611af88ad113 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -122,6 +122,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) + message("Compile definitions for ONNX runtime (CUDA):") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) + install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index 2a3c0ea7d9eb1..570b9c4bd2683 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -160,6 +160,19 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) + message("Compile definitions for ONNX runtime (HIP / ROCM):") + message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") + message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") + message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") + message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") + + target_compile_definitions(${targetName} PRIVATE + GPUCA_HAS_ONNX=1 + ORT_ROCM_BUILD=$ + ORT_CUDA_BUILD=$ + ORT_MIGRAPHX_BUILD=$ + ORT_TENSORRT_BUILD=$) + install(FILES ${HDRS} DESTINATION include/GPU) # o2_add_test(GPUsortHIP NAME test_GPUsortHIP diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index c4b084e260ea8..c6cccfb71a27a 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -352,19 +352,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - message("Compile definitions for ONNX runtime:") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - - target_compile_definitions(${targetName} PRIVATE - GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS ${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL} diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b03a27867f8fa..5565958d8d9ab 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -626,7 +626,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; - int32_t maxThreads = mRec->MemoryScalers()->nTPCdigits / 6000; + int32_t maxThreads = mRec->getNKernelHostThreads(true); 
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { From fb08f18d45df31b34aa969387f38ad898d2f6faa Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 10 Apr 2025 23:22:15 +0200 Subject: [PATCH 30/40] About 10x speed-up due to explicit io binding --- Common/ML/include/ML/OrtInterface.h | 2 + Common/ML/src/OrtInterface.cxx | 23 +++++++--- .../Base/cuda/GPUReconstructionCUDA.cu | 4 +- .../Global/GPUChainTrackingClusterizer.cxx | 44 ++++++++++++------- 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index d496de866da7f..625f506684fd8 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -91,6 +91,8 @@ class OrtModel template void inference(I**, size_t, O*); + void release(); + private: // ORT variables -> need to be hidden as pImpl struct OrtVariables; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index e525fc1d2709f..52ab22b5d1f87 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -33,6 +33,7 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the .c Ort::SessionOptions sessionOptions; Ort::AllocatorWithDefaultOptions allocator; Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + std::unique_ptr ioBinding = nullptr; }; // General purpose @@ -122,7 +123,8 @@ void OrtModel::initEnvironment() }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); + pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); setIO(); @@ -135,6 +137,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) { #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (deviceIndex >= 0) { + (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); std::string dev_mem_str = ""; if (deviceType == "ROCM") { dev_mem_str = "Hip"; @@ -268,20 +271,22 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size()); } + (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, 
reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size()); } else { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size()); } + (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); - (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); + (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); @@ -398,6 +403,12 @@ std::vector OrtModel::inference(std::vector>& inputs) template std::vector OrtModel::inference(std::vector>&); template std::vector OrtModel::inference(std::vector>&); +// Release session +void OrtModel::release() +{ + LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; +} + // private std::string OrtModel::printShape(const std::vector& v) { diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index d71c5f3211774..4e36b3fd3380a 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -673,6 +673,7 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), keys.size()); // this implicitly sets "has_user_compute_stream" + cuda_options.has_user_compute_stream = 1; UpdateCUDAProviderOptionsWithValue(cuda_options, "user_compute_stream", mInternals->Streams[stream]); session_options.AppendExecutionProvider_CUDA_V2(cuda_options); @@ -698,10 +699,9 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream + rocm_options.arena_extend_strategy = 0; rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); - // OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, *deviceId); - // api.ReleaseROCMProviderOptions(rocm_options); } #endif // GPUCA_HAS_ONNX diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 5565958d8d9ab..6c0b9140297b1 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -630,7 +630,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > 
maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); @@ -638,7 +638,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); @@ -646,7 +646,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); @@ -950,7 +950,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); } - float time_clusterizer = 0, time_fill = 0; + float time_clusterizer = 0, time_fill = 0, time_networks = 0; for (int batch = 0; batch < std::ceil((float)clusterer.mPmemory->counters.nClusters / clustererNNShadow.nnClusterizerBatchedMode); batch++) { uint batchStart = batch * clustererNNShadow.nnClusterizerBatchedMode; size_t iSize = CAMath::Min((uint)clustererNNShadow.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); @@ -961,6 +961,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start1 = std::chrono::high_resolution_clock::now(); + // NN evaluations if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_class).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.modelProbabilities_16); @@ -974,14 +975,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_class).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.modelProbabilities_32); } } - - if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels - } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels - } if (!clustererNNShadow.nnClusterizerUseCfRegression) { - // nnApplication.networkInference(nnApplication.model_reg_1, clustererNNShadow, iSize, clustererNNShadow.outputDataReg1, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg1_16); @@ -995,9 +989,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_reg_1).inference(clustererNNShadow.inputData_32, iSize, 
clustererNNShadow.outputDataReg1_32); } } - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - // nnApplication.networkInference(nnApplication.model_reg_2, clustererNNShadow, iSize, clustererNNShadow.outputDataReg2, clustererNNShadow.nnInferenceInputDType); if (clustererNNShadow.nnInferenceInputDType == 0) { if (clustererNNShadow.nnInferenceOutputDType == 0) { (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_16, iSize, clustererNNShadow.outputDataReg2_16); @@ -1011,11 +1003,26 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplication.model_reg_2).inference(clustererNNShadow.inputData_32, iSize, clustererNNShadow.outputDataReg2_32); } } - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Running the NN for regression class 2 + } + } + + auto stopNNs = std::chrono::high_resolution_clock::now(); + + // Publishing kernels + if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + } else { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Assigning class labels + } + if (!clustererNNShadow.nnClusterizerUseCfRegression) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 1 regression results + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } auto stop1 = std::chrono::high_resolution_clock::now(); + time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } @@ -1030,8 +1037,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { acceptedClusters += clustererNNShadow.outputDataClass[i]; } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. 
--> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } + TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane); #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif @@ -1132,6 +1140,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { + if (GetProcessingSettings().nn.applyNNclusterizer) { + GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + nnApplication.model_class.release(); + nnApplication.model_reg_1.release(); + nnApplication.model_reg_2.release(); + } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } From b1c88f09a758c3e0cb67cbbac063fc4c82071d82 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 11 Apr 2025 14:08:13 +0200 Subject: [PATCH 31/40] Changes for synchronization and consistency. No performance loss. --- Common/ML/src/OrtInterface.cxx | 8 ++++---- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 7 +++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 52ab22b5d1f87..bfbd8343efedf 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -271,18 +271,18 @@ void OrtModel::inference(I* input, size_t input_size, O* output) std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1] * sizeof(Ort::Float16_t), inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1] * sizeof(float), inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } (pImplOrt->ioBinding)->BindInput(mInputNames[0].c_str(), inputTensor); std::vector outputShape{input_size, mOutputShapes[0][1]}; Ort::Value outputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1] * sizeof(Ort::Float16_t), outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(output), input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } else { - outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1] * sizeof(float), outputShape.data(), outputShape.size()); + outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); } (pImplOrt->ioBinding)->BindOutput(mOutputNames[0].c_str(), outputTensor); diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 6c0b9140297b1..7026a5ea01a1a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -630,7 +630,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool 
synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane + numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); @@ -638,7 +638,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_class).initEnvironment(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane + 2*numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); @@ -646,7 +646,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) (nnApplications[lane].model_reg_1).initEnvironment(); } if (nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane + 3*numLanes, &deviceId); + SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); @@ -1039,7 +1039,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; } - TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clustererNN, lane); #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif From 32cab70fa540e3976e9b1d9cc5e0664cb21001b7 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 11 Apr 2025 12:08:51 +0000 Subject: [PATCH 32/40] Please consider the following formatting changes --- Common/ML/include/ML/OrtInterface.h | 30 +++-- Common/ML/src/OrtInterface.cxx | 116 +++++++++--------- .../TPCCalibration/NeuralNetworkClusterizer.h | 19 ++- .../src/NeuralNetworkClusterizer.cxx | 3 +- GPU/GPUTracking/Base/GPUReconstructionCPU.h | 5 +- .../Base/GPUReconstructionProcessing.h | 3 +- .../Base/cuda/GPUReconstructionCUDA.h | 5 +- .../Global/GPUChainTrackingClusterizer.cxx | 6 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 2 +- .../GPUTPCNNClusterizerHost.h | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 84 ++++++------- GPU/Workflow/src/GPUWorkflowSpec.cxx | 20 +-- 13 files changed, 156 insertions(+), 141 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 625f506684fd8..e44b56e62a04e 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -26,10 +26,11 @@ // O2 includes #include "Framework/Logger.h" -namespace Ort { - struct SessionOptions; - struct MemoryInfo; -} +namespace Ort +{ +struct SessionOptions; +struct MemoryInfo; +} // namespace Ort namespace o2 { @@ -44,7 +45,8 @@ class OrtModel // Constructors & destructors OrtModel() = default; OrtModel(std::unordered_map optionsMap) { init(optionsMap); } - void init(std::unordered_map optionsMap) { + void init(std::unordered_map optionsMap) + { initOptions(optionsMap); initEnvironment(); } @@ -71,8 +73,18 @@ class OrtModel void setDeviceId(int32_t id) { deviceId = id; } void setIO(); void setActiveThreads(int threads) { intraOpNumThreads = threads; } - void setIntraOpNumThreads(int threads) { if(deviceType == "CPU") { intraOpNumThreads = threads; } } - void setInterOpNumThreads(int threads) { if(deviceType == "CPU") { interOpNumThreads = threads; } } + void setIntraOpNumThreads(int threads) + { + if (deviceType == "CPU") { + intraOpNumThreads = threads; + } + } + void setInterOpNumThreads(int threads) + { + if (deviceType == "CPU") { + interOpNumThreads = threads; + } + } // Conversion template @@ -102,8 +114,8 @@ class OrtModel std::vector inputNamesChar, outputNamesChar; std::vector mInputNames, mOutputNames; std::vector> mInputShapes, mOutputShapes, inputShapesCopy, outputShapesCopy; // Input shapes - std::vector inputSizePerNode, outputSizePerNode; // Output shapes - int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs + std::vector inputSizePerNode, outputSizePerNode; // Output shapes + int32_t mInputsTotal = 0, mOutputsTotal = 0; // Total number of inputs and outputs // Environment settings bool mInitialized = false; diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index bfbd8343efedf..8771a312a7e45 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -99,7 +99,7 @@ void OrtModel::initOptions(std::unordered_map optionsM void OrtModel::initEnvironment() { - if(allocateDeviceMemory) { + if (allocateDeviceMemory) { memoryOnDevice(deviceId); } pImplOrt->env = std::make_shared( @@ -184,7 +184,8 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } } -void OrtModel::setIO() { +void OrtModel::setIO() +{ for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } @@ -211,7 
+212,7 @@ void OrtModel::setIO() { outputSizePerNode.resize(mOutputShapes.size(), 1); mInputsTotal = 1; for (size_t i = 0; i < mInputShapes.size(); ++i) { - if(mInputShapes[i].size() > 0) { + if (mInputShapes[i].size() > 0) { for (size_t j = 1; j < mInputShapes[i].size(); ++j) { if (mInputShapes[i][j] > 0) { mInputsTotal *= mInputShapes[i][j]; @@ -222,7 +223,7 @@ void OrtModel::setIO() { } mOutputsTotal = 1; for (size_t i = 0; i < mOutputShapes.size(); ++i) { - if(mOutputShapes[i].size() > 0) { + if (mOutputShapes[i].size() > 0) { for (size_t j = 1; j < mOutputShapes[i].size(); ++j) { if (mOutputShapes[i][j] > 0) { mOutputsTotal *= mOutputShapes[i][j]; @@ -239,8 +240,7 @@ std::vector OrtModel::inference(std::vector& input) { std::vector inputShape = mInputShapes[0]; inputShape[0] = input.size(); - for (size_t i = 1; i < mInputShapes[0].size(); ++i) - { + for (size_t i = 1; i < mInputShapes[0].size(); ++i) { inputShape[0] /= mInputShapes[0][i]; } std::vector inputTensor; @@ -295,28 +295,29 @@ template void OrtModel::inference(float*, size_t, template void OrtModel::inference(float*, size_t, float*); template -void OrtModel::inference(I** input, size_t input_size, O* output) { +void OrtModel::inference(I** input, size_t input_size, O* output) +{ std::vector inputTensors(inputShapesCopy.size()); for (size_t i = 0; i < inputShapesCopy.size(); ++i) { - inputShapesCopy[i][0] = input_size; // batch-size + inputShapesCopy[i][0] = input_size; // batch-size outputShapesCopy[i][0] = input_size; // batch-size if constexpr (std::is_same_v) { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - reinterpret_cast(input[i]), - inputSizePerNode[i]*input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + pImplOrt->memoryInfo, + reinterpret_cast(input[i]), + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } else { inputTensors[i] = Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - input[i], - inputSizePerNode[i]*input_size, - inputShapesCopy[i].data(), - inputShapesCopy[i].size()); + pImplOrt->memoryInfo, + input[i], + inputSizePerNode[i] * input_size, + inputShapesCopy[i].data(), + inputShapesCopy[i].size()); } } @@ -325,14 +326,14 @@ void OrtModel::inference(I** input, size_t input_size, O* output) { outputTensor = Ort::Value::CreateTensor( pImplOrt->memoryInfo, reinterpret_cast(output), - outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputSizePerNode[0] * input_size, // assumes that there is only one output node outputShapesCopy[0].data(), outputShapesCopy[0].size()); } else { outputTensor = Ort::Value::CreateTensor( pImplOrt->memoryInfo, output, - outputSizePerNode[0]*input_size, // assumes that there is only one output node + outputSizePerNode[0] * input_size, // assumes that there is only one output node outputShapesCopy[0].data(), outputShapesCopy[0].size()); } @@ -345,8 +346,7 @@ void OrtModel::inference(I** input, size_t input_size, O* output) { inputNamesChar.size(), outputNamesChar.data(), &outputTensor, - outputNamesChar.size() - ); + outputNamesChar.size()); } template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); @@ -357,47 +357,47 @@ template void OrtModel::inference(float**, size_t, float*); template std::vector OrtModel::inference(std::vector>& inputs) { - std::vector input_tensors; + std::vector input_tensors; - for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t i = 0; i < inputs.size(); ++i) { - inputShapesCopy[i][0] = 
inputs[i].size() / inputSizePerNode[i]; // batch-size + inputShapesCopy[i][0] = inputs[i].size() / inputSizePerNode[i]; // batch-size - if constexpr (std::is_same_v) { - input_tensors.emplace_back( - Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - reinterpret_cast(inputs[i].data()), - inputSizePerNode[i]*inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); - } else { - input_tensors.emplace_back( - Ort::Value::CreateTensor( - pImplOrt->memoryInfo, - inputs[i].data(), - inputSizePerNode[i]*inputShapesCopy[i][0], - inputShapesCopy[i].data(), - inputShapesCopy[i].size())); - } + if constexpr (std::is_same_v) { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + reinterpret_cast(inputs[i].data()), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); + } else { + input_tensors.emplace_back( + Ort::Value::CreateTensor( + pImplOrt->memoryInfo, + inputs[i].data(), + inputSizePerNode[i] * inputShapesCopy[i][0], + inputShapesCopy[i].data(), + inputShapesCopy[i].size())); } + } + + int32_t totalOutputSize = mOutputsTotal * inputShapesCopy[0][0]; + + // === Run inference === + auto output_tensors = pImplOrt->session->Run( + pImplOrt->runOptions, + inputNamesChar.data(), + input_tensors.data(), + input_tensors.size(), + outputNamesChar.data(), + outputNamesChar.size()); - int32_t totalOutputSize = mOutputsTotal*inputShapesCopy[0][0]; - - // === Run inference === - auto output_tensors = pImplOrt->session->Run( - pImplOrt->runOptions, - inputNamesChar.data(), - input_tensors.data(), - input_tensors.size(), - outputNamesChar.data(), - outputNamesChar.size()); - - // === Extract output values === - O* output_data = output_tensors[0].template GetTensorMutableData(); - std::vector output_vec(output_data, output_data + totalOutputSize); - output_tensors.clear(); - return output_vec; + // === Extract output values === + O* output_data = output_tensors[0].template GetTensorMutableData(); + std::vector output_vec(output_data, output_data + totalOutputSize); + output_tensors.clear(); + return output_vec; } template std::vector OrtModel::inference(std::vector>&); diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h index e4fcfa56df438..196bba644714c 100644 --- a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h +++ b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h @@ -23,16 +23,15 @@ namespace o2::tpc class NeuralNetworkClusterizer { - public: - NeuralNetworkClusterizer() = default; - void initCcdbApi(std::string url); - void loadIndividualFromCCDB(std::map settings); - - private: - o2::ccdb::CcdbApi ccdbApi; - std::map metadata; - std::map headers; - + public: + NeuralNetworkClusterizer() = default; + void initCcdbApi(std::string url); + void loadIndividualFromCCDB(std::map settings); + + private: + o2::ccdb::CcdbApi ccdbApi; + std::map metadata; + std::map headers; }; } // namespace o2::tpc diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx index 8a2e739b772fb..bfbb7afc946f8 100644 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -18,7 +18,8 @@ using namespace o2::tpc; -void NeuralNetworkClusterizer::initCcdbApi(std::string url) { +void 
NeuralNetworkClusterizer::initCcdbApi(std::string url) +{ ccdbApi.init(url); } diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index ec02015fc91cb..1174fcd8a38d7 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -24,8 +24,9 @@ #include "GPUReconstructionKernelIncludes.h" #include "GPUReconstructionKernels.h" -namespace Ort { - struct SessionOptions; +namespace Ort +{ +struct SessionOptions; } namespace o2::gpu diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index b83de29ef4af2..94e13d15d9c89 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -22,7 +22,8 @@ #include #include -namespace Ort { +namespace Ort +{ struct SessionOptions; } diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h index 08e3078f767e6..d4712f2c0ed25 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.h @@ -25,8 +25,9 @@ extern "C" __declspec(dllexport) o2::gpu::GPUReconstruction* GPUReconstruction_C extern "C" o2::gpu::GPUReconstruction* GPUReconstruction_Create_CUDA(const o2::gpu::GPUSettingsDeviceBackend& cfg); #endif -namespace Ort { - struct SessionOptions; +namespace Ort +{ +struct SessionOptions; } namespace o2::gpu diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 7026a5ea01a1a..1d71c92bcdfe9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -665,7 +665,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.mISector = sector; clustererNN.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; nnApplications[lane].initClusterizer(nn_settings, clustererNN); - if (doGPU){ + if (doGPU) { clustererNNShadow.deviceId = deviceId; clustererNNShadow.mISector = sector; clustererNNShadow.nnClusterizerTotalClusters = processors()->tpcClusterer[lane].mNMaxClusters; @@ -673,8 +673,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } AllocateRegisteredMemory(clustererNN.mMemoryId); }); - if (doGPU){ - WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer)*NSECTORS, mRec->NStreams() - 1, &mEvents->init); + if (doGPU) { + WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } } #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index e9b2061bea36a..da490b0f94d58 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -52,7 +52,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnInferenceInputDType = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int nnInferenceOutputDType = 0; // 0: float16, 
1: float32 int mISector = -1; int deviceId = -1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5125d7a3fd364..8c6d4fff67528 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,7 +27,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::vector reg_model_paths; std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place if (evalMode[0] == "c1") { class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 8001ecc96fcfd..bae9e5fa677b2 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -45,7 +45,7 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters - std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 + std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; }; // class GPUTPCNNClusterizerHost diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index d1be1d00027e2..2cf9ab2037007 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -147,9 +147,9 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); if (is_row_boundary) { if (dtype == 0) { - clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } else { int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); @@ -163,15 +163,15 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } 
} else { if (dtype == 0) { - clustererNN.inputData_16[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); + clustererNN.inputData_16[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.nnClusterizerBoundaryFillValue)); } else { - clustererNN.inputData_32[base_idx*clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); + clustererNN.inputData_32[base_idx * clustererNN.nnClusterizerElementSize + transient_index] = static_cast(clustererNN.nnClusterizerBoundaryFillValue); } } } @@ -266,20 +266,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg1_16[model_output_index].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 2].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg1_16[model_output_index + 3].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { pc.setFull(central_charge * clustererNN.outputDataReg1_32[model_output_index + 4], - static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], - clustererNN.outputDataReg1_32[model_output_index + 2], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], - clustererNN.outputDataReg1_32[model_output_index + 3], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg1_32[model_output_index], + clustererNN.outputDataReg1_32[model_output_index + 2], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg1_32[model_output_index + 1], + clustererNN.outputDataReg1_32[model_output_index + 3], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -359,20 +359,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 4].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 2].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 6].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 
1) { pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 8], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], - clustererNN.outputDataReg2_32[model_output_index + 4], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], - clustererNN.outputDataReg2_32[model_output_index + 6], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index], + clustererNN.outputDataReg2_32[model_output_index + 4], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 2], + clustererNN.outputDataReg2_32[model_output_index + 6], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } tpc::ClusterNative myCluster; @@ -404,20 +404,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), - clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_16[model_output_index + 1].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 5].ToFloat(), + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_16[model_output_index + 3].ToFloat(), + clustererNN.outputDataReg2_16[model_output_index + 7].ToFloat(), + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } else if (dtype == 1) { pc.setFull(central_charge * clustererNN.outputDataReg2_32[model_output_index + 9], - static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], - clustererNN.outputDataReg2_32[model_output_index + 5], - (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], - clustererNN.outputDataReg2_32[model_output_index + 7], - clustererNN.clusterFlags[2 * glo_idx], - clustererNN.clusterFlags[2 * glo_idx + 1]); + static_cast(peak.pad()) + clustererNN.outputDataReg2_32[model_output_index + 1], + clustererNN.outputDataReg2_32[model_output_index + 5], + (clusterer.mPmemory->fragment).start + static_cast(peak.time()) + clustererNN.outputDataReg2_32[model_output_index + 3], + clustererNN.outputDataReg2_32[model_output_index + 7], + clustererNN.clusterFlags[2 * glo_idx], + clustererNN.clusterFlags[2 * glo_idx + 1]); } rejectCluster = !pc.toNative(peak, central_charge, myCluster, clusterer.Param(), chargeMap); diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index f2cc2806115fb..7aae3c176db74 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -135,20 +135,20 @@ void GPURecoWorkflowSpec::init(InitContext& ic) GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); - if (mNNClusterizerSettings->nnLoadFromCCDB){ + if (mNNClusterizerSettings->nnLoadFromCCDB) { LOG(info) << "Loading neural networks from CCDB"; o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); 
std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, - {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; + {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); From 70907aa03b7dbadf7f43cdd56f6ee58625db2c49 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 11 Apr 2025 20:46:06 +0200 Subject: [PATCH 33/40] Fixing warnings (errors due to size_t) --- Common/ML/include/ML/OrtInterface.h | 4 ++-- Common/ML/src/OrtInterface.cxx | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index e44b56e62a04e..a1d8123073ef5 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -98,10 +98,10 @@ class OrtModel std::vector inference(std::vector>&); template - void inference(I*, size_t, O*); + void inference(I*, int64_t, O*); template - void inference(I**, size_t, O*); + void inference(I**, int64_t, O*); void release(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8771a312a7e45..000ffdfa39d94 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -262,7 +262,7 @@ template std::vector OrtModel::inference(s template std::vector OrtModel::inference(std::vector&); template -void OrtModel::inference(I* input, size_t input_size, O* output) +void OrtModel::inference(I* input, int64_t input_size, O* output) { // std::vector providers = Ort::GetAvailableProviders(); // for (const auto& provider : providers) { @@ -289,13 +289,13 @@ void OrtModel::inference(I* input, size_t input_size, O* output) (pImplOrt->session)->Run(pImplOrt->runOptions, *pImplOrt->ioBinding); } -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); -template void OrtModel::inference(float*, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(float*, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t*, int64_t, float*); +template void OrtModel::inference(float*, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float*, int64_t, float*); template -void 
OrtModel::inference(I** input, size_t input_size, O* output) +void OrtModel::inference(I** input, int64_t input_size, O* output) { std::vector inputTensors(inputShapesCopy.size()); @@ -349,10 +349,10 @@ void OrtModel::inference(I** input, size_t input_size, O* output) outputNamesChar.size()); } -template void OrtModel::inference(OrtDataType::Float16_t**, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(OrtDataType::Float16_t**, size_t, float*); -template void OrtModel::inference(float**, size_t, OrtDataType::Float16_t*); -template void OrtModel::inference(float**, size_t, float*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(OrtDataType::Float16_t**, int64_t, float*); +template void OrtModel::inference(float**, int64_t, OrtDataType::Float16_t*); +template void OrtModel::inference(float**, int64_t, float*); template std::vector OrtModel::inference(std::vector>& inputs) From e46cdfa184efcb58b534f6d32482ce3dd9a22e47 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 13 Apr 2025 11:33:58 +0200 Subject: [PATCH 34/40] Fixing linker issues --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 4e36b3fd3380a..e06c43db814fe 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -661,9 +661,9 @@ void GPUReconstructionCUDA::endGPUProfiling() GPUChkErr(cudaProfilerStop()); } -#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { +#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); @@ -679,8 +679,8 @@ void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_option // Finally, don't forget to release the provider options ReleaseCUDAProviderOptions(cuda_options); +#endif // ORT_CUDA_BUILD } -#endif // GPUCA_HAS_ONNX #else // HIP void* GPUReconstructionHIP::getGPUPointer(void* ptr) @@ -690,9 +690,9 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) return retVal; } -#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { +#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 // Create ROCm provider options cudaGetDevice(deviceId); // const auto& api = Ort::GetApi(); @@ -702,9 +702,8 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options rocm_options.arena_extend_strategy = 0; rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); +#endif // ORT_ROCM_BUILD } - -#endif // GPUCA_HAS_ONNX #endif // __HIPCC__ namespace o2::gpu From 4b0825ac8d86909442981849ff7619aadceb5b5d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Apr 2025 13:54:20 +0200 Subject: [PATCH 35/40] Adding volatile memory allocation and MockedOrtAllocator. 
Removing print statements and time measurements
---
 Common/ML/include/ML/OrtInterface.h           | 10 +-
 Common/ML/src/OrtInterface.cxx                | 40 ++++++--
 .../Base/GPUReconstructionProcessing.h        |  1 -
 .../Base/cuda/GPUReconstructionCUDA.cu        |  2 +-
 GPU/GPUTracking/CMakeLists.txt                |  1 +
 .../Global/GPUChainTrackingClusterizer.cxx    | 85 +++++++++++------
 .../GPUTPCNNClusterizerHost.cxx               | 93 ++++++++++++++++++-
 .../GPUTPCNNClusterizerHost.h                 | 16 +++-
 8 files changed, 203 insertions(+), 45 deletions(-)

diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h
index a1d8123073ef5..e37b6a69b6036 100644
--- a/Common/ML/include/ML/OrtInterface.h
+++ b/Common/ML/include/ML/OrtInterface.h
@@ -30,6 +30,7 @@ namespace Ort
 {
 struct SessionOptions;
 struct MemoryInfo;
+struct Env;
 } // namespace Ort

 namespace o2
 {
@@ -55,6 +56,7 @@ class OrtModel
   // General purpose
   void initOptions(std::unordered_map optionsMap);
   void initEnvironment();
+  void initSession();
   void memoryOnDevice(int32_t = 0);
   bool isInitialized() { return mInitialized; }
   void resetSession();
@@ -64,8 +66,9 @@ class OrtModel
   std::vector> getNumOutputNodes() const { return mOutputShapes; }
   std::vector getInputNames() const { return mInputNames; }
   std::vector getOutputNames() const { return mOutputNames; }
-  Ort::SessionOptions& getSessionOptions();
-  Ort::MemoryInfo& getMemoryInfo();
+  Ort::SessionOptions* getSessionOptions();
+  Ort::MemoryInfo* getMemoryInfo();
+  Ort::Env* getEnv();
   int32_t getIntraOpNumThreads() const { return intraOpNumThreads; }
   int32_t getInterOpNumThreads() const { return interOpNumThreads; }
@@ -85,6 +88,7 @@ class OrtModel
       interOpNumThreads = threads;
     }
   }
+  void setEnv(Ort::Env*);

   // Conversion
   template
@@ -103,7 +107,7 @@ class OrtModel
   template
   void inference(I**, int64_t, O*);

-  void release();
+  void release(bool = false);

  private:
   // ORT variables -> need to be hidden as pImpl
diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx
index 000ffdfa39d94..6dd3887c82417 100644
--- a/Common/ML/src/OrtInterface.cxx
+++ b/Common/ML/src/OrtInterface.cxx
@@ -99,9 +99,6 @@ void OrtModel::initOptions(std::unordered_map optionsM

 void OrtModel::initEnvironment()
 {
-  if (allocateDeviceMemory) {
-    memoryOnDevice(deviceId);
-  }
   pImplOrt->env = std::make_shared(
     OrtLoggingLevel(loggingLevel),
     (envName.empty() ?
"ORT" : envName.c_str()), @@ -123,6 +120,13 @@ void OrtModel::initEnvironment() }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events +} + +void OrtModel::initSession() +{ + if (allocateDeviceMemory) { + memoryOnDevice(deviceId); + } pImplOrt->session = std::make_shared(*pImplOrt->env, modelPath.c_str(), pImplOrt->sessionOptions); pImplOrt->ioBinding = std::make_unique(*pImplOrt->session); @@ -138,6 +142,13 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) #if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) if (deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); + (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + + // Arena memory shrinkage comes at performance cost + /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; + // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 + std::string dev_mem_str = ""; if (deviceType == "ROCM") { dev_mem_str = "Hip"; @@ -159,14 +170,19 @@ void OrtModel::resetSession() } // Getters -Ort::SessionOptions& OrtModel::getSessionOptions() +Ort::SessionOptions* OrtModel::getSessionOptions() +{ + return &pImplOrt->sessionOptions; +} + +Ort::MemoryInfo* OrtModel::getMemoryInfo() { - return pImplOrt->sessionOptions; + return &pImplOrt->memoryInfo; } -Ort::MemoryInfo& OrtModel::getMemoryInfo() +Ort::Env* OrtModel::getEnv() { - return pImplOrt->memoryInfo; + return (pImplOrt->env).get(); } template @@ -234,6 +250,11 @@ void OrtModel::setIO() } } +void OrtModel::setEnv(Ort::Env* env) +{ + pImplOrt->env = std::shared_ptr(env); +} + // Inference template std::vector OrtModel::inference(std::vector& input) @@ -404,8 +425,11 @@ template std::vector OrtModel::inference(std::vector OrtModel::inference(std::vector>&); // Release session -void OrtModel::release() +void OrtModel::release(bool profilingEnabled) { + // if (profilingEnabled) { + // pImplOrt->session->EndProfiling(); + // } LOG(info) << "(ORT) Size of pImplOrt: " << sizeof(*pImplOrt) << " bytes"; } diff --git a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h index 94e13d15d9c89..2339ee9fb6b83 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionProcessing.h +++ b/GPU/GPUTracking/Base/GPUReconstructionProcessing.h @@ -95,7 +95,6 @@ class GPUReconstructionProcessing : public GPUReconstruction void AddGPUEvents(T*& events); virtual std::unique_ptr GetThreadContext() override; - // virtual void SetONNXGPUStream(Ort::SessionOptions&, int32_t, int32_t*) {} 
struct RecoStepTimerMeta { HighResTimer timerToGPU; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index e06c43db814fe..247438fa8a13f 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index eb7481819ea89..673c93cddb8ca 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -336,6 +336,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") O2::DetectorsRaw O2::Steer O2::ML + PRIVATE_LINK_LIBRARIES ONNXRuntime::ONNXRuntime PUBLIC_INCLUDE_DIRECTORIES ${INCDIRS} SOURCES ${SRCS} ${SRCS_NO_CINT} ${SRCS_NO_H}) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 1d71c92bcdfe9..0b9897977cc98 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -42,6 +42,7 @@ #ifdef GPUCA_HAS_ONNX #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" +// #include "ML/3rdparty/GPUORTFloat16.h" #endif using namespace o2::gpu; @@ -630,31 +631,39 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) { nnApplications[lane].init(nn_settings); if (nnApplications[lane].modelsUsed[0]) { - SetONNXGPUStream((nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_class).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_class).setDeviceId(deviceId); if (nnApplications[lane].model_class.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads); } (nnApplications[lane].model_class).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0); + (nnApplications[lane].model_class).initSession(); } if (nnApplications[lane].modelsUsed[1]) { - SetONNXGPUStream((nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_reg_1).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_1).setDeviceId(deviceId); if (nnApplications[lane].model_reg_1.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_1.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv()); (nnApplications[lane].model_reg_1).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1); + (nnApplications[lane].model_reg_1).initSession(); } if 
(nnApplications[lane].modelsUsed[2]) { - SetONNXGPUStream((nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); + SetONNXGPUStream(*(nnApplications[lane].model_reg_2).getSessionOptions(), lane, &deviceId); (nnApplications[lane].model_reg_2).setDeviceId(deviceId); if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) { nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads); } + // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv()); (nnApplications[lane].model_reg_2).initEnvironment(); + // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2); + (nnApplications[lane].model_reg_2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { - LOG(info) << "Allocated ONNX stream for lane " << lane << " and device " << deviceId; + LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; } }); mRec->runParallelOuterLoop(doGPU, NSECTORS, [&](uint32_t sector) { @@ -957,9 +966,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - auto stop0 = std::chrono::high_resolution_clock::now(); + // auto stop0 = std::chrono::high_resolution_clock::now(); - auto start1 = std::chrono::high_resolution_clock::now(); + // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations if (clustererNNShadow.nnInferenceInputDType == 0) { @@ -1006,7 +1015,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } - auto stopNNs = std::chrono::high_resolution_clock::now(); + // auto stopNNs = std::chrono::high_resolution_clock::now(); // Publishing kernels if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { @@ -1020,25 +1029,41 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceOutputDType, withMC, batchStart); // Publishing class 2 regression results } } - auto stop1 = std::chrono::high_resolution_clock::now(); - time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; - } - if (clustererNNShadow.nnClusterizerUseCfRegression) { - auto start1 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - auto stop1 = std::chrono::high_resolution_clock::now(); - time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - } - if (clustererNNShadow.nnClusterizerVerbosity < 3) { - int acceptedClusters = 0; - for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { - acceptedClusters += clustererNNShadow.outputDataClass[i]; - } - LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << 
clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // for(int i = 0; i < iSize; ++i) { + // if(clustererNNShadow.outputDataClass[i + batchStart] > 1) { + // LOG(info) << "WARNING ORT: Output of " << i + batchStart << " / " << clusterer.mPmemory->counters.nClusters << " is " << clustererNNShadow.modelProbabilities_16[i].ToFloat() << " and " << clustererNNShadow.outputDataClass[i + batchStart] << " thresh " << clustererNNShadow.nnClassThreshold << " instead of 0 or 1. Please check the model and the input data."; + // // std::string input = "["; + // // for(int j = 0; j < clustererNNShadow.nnClusterizerElementSize; j++){ + // // input += std::to_string(clustererNNShadow.inputData_16[i * clustererNNShadow.nnClusterizerElementSize + j].ToFloat()) + ", "; + // // } + // // input += "]"; + // // LOG(info) << "Input is: " << input; + // } + // } + + // auto stop1 = std::chrono::high_resolution_clock::now(); + + // time_networks += std::chrono::duration_cast(stopNNs - start1).count() / 1e9; + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } + // if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + // } + // if (clustererNNShadow.nnClusterizerVerbosity < 3) { + // int acceptedClusters = 0; + // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { + // if(clustererNNShadow.outputDataClass[i] > 1 || clustererNNShadow.outputDataClass[i] < 0) { + // LOG(info) << "WARNING ORT 2: " << clustererNNShadow.outputDataClass[i] << " for index " << i << " / " << clusterer.mPmemory->counters.nClusters; + // } + // acceptedClusters += clustererNNShadow.outputDataClass[i]; + // } + // LOG(info) << "[NN CF] Apply NN (fragment " << fragment.index << ", lane: " << lane << ", sector: " << iSector << "): filling data " << time_fill << "s ; networks: " << time_networks << "s ; clusterizer: " << time_clusterizer << "s ; " << clusterer.mPmemory->counters.nClusters << " clusters, " << acceptedClusters << " accepted. --> " << (int32_t)clusterer.mPmemory->counters.nClusters / (time_fill + time_clusterizer) << " clusters/s"; + // } #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif @@ -1139,12 +1164,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } } for (int32_t i = 0; i < GetProcessingSettings().nTPCClustererLanes; i++) { - if (GetProcessingSettings().nn.applyNNclusterizer) { - GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; - nnApplication.model_class.release(); - nnApplication.model_reg_1.release(); - nnApplication.model_reg_2.release(); - } + // if (GetProcessingSettings().nn.applyNNclusterizer) { + // GPUTPCNNClusterizerHost& nnApplication = nnApplications[i]; + // nnApplication.model_class.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_1.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // nnApplication.model_reg_2.release(GetProcessingSettings().nn.nnInferenceOrtProfiling); + // } if (transferRunning[i]) { ReleaseEvent(mEvents->stream[i], doGPU); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 8c6d4fff67528..bda4c70d79c9d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -18,6 +18,11 @@ #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" +#include "GPUReconstruction.h" + +#ifdef GPUCA_HAS_ONNX +#include +#endif using namespace o2::gpu; @@ -51,7 +56,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; - LOG(info) << "Model path: " << class_model_path; model_class.initOptions(OrtOptions); modelsUsed[0] = true; @@ -106,3 +110,90 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust } } } + +// MockedOrtAllocator implementation to be able to use volatile assignment +struct MockedOrtAllocator : OrtAllocator { + MockedOrtAllocator(GPUReconstruction* = nullptr, OrtMemoryInfo* = nullptr); + ~MockedOrtAllocator(); + + void* Alloc(size_t size); + void Free(void* p); + const OrtMemoryInfo* Info() const; + void* Reserve(size_t size); + size_t NumAllocations() const; + size_t NumReserveAllocations() const; + + void LeakCheck(); + +private: + MockedOrtAllocator(const MockedOrtAllocator&) = delete; + MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; + + std::atomic memory_inuse{0}; + std::atomic num_allocations{0}; + std::atomic num_reserve_allocations{0}; + OrtMemoryInfo* memory_info; + GPUReconstruction* rec; +}; + +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) { + OrtAllocator::version = ORT_API_VERSION; + OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; + OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; + OrtAllocator::Info = [](const OrtAllocator* this_) { return static_cast(this_)->Info(); }; + OrtAllocator::Reserve = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Reserve(size); }; + rec = r; + memory_info = info; +} + +MockedOrtAllocator::~MockedOrtAllocator() { + // Ort::GetApi().ReleaseMemoryInfo(memory_info); +} + +void* MockedOrtAllocator::Alloc(size_t size) { + return rec->AllocateVolatileDeviceMemory(size); +} + +void* MockedOrtAllocator::Reserve(size_t size) { + return rec->AllocateVolatileDeviceMemory(size); +} + +void MockedOrtAllocator::Free(void* p) { + 
rec->ReturnVolatileDeviceMemory(); +} + +const OrtMemoryInfo* MockedOrtAllocator::Info() const { + return memory_info; +} + +size_t MockedOrtAllocator::NumAllocations() const { + return num_allocations.load(); +} + +size_t MockedOrtAllocator::NumReserveAllocations() const { + return num_reserve_allocations.load(); +} + +void MockedOrtAllocator::LeakCheck() { + if (memory_inuse.load()) + LOG(warning) << "memory leak!!!"; +} + +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) +{ + if(chooseMockedAlloc == 0) { + mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); + LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; + } else if (chooseMockedAlloc == 1) { + mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get()); + LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; + } else if (chooseMockedAlloc == 2) { + mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); + Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get()); + LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; + } else { + LOG(fatal) << "Invalid choice for mocked allocator"; + } +} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index bae9e5fa677b2..1e0df7ea578f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,6 +22,15 @@ using namespace o2::ml; +struct OrtAllocator; +struct OrtMemoryInfo; +struct MockedOrtAllocator; +namespace Ort +{ +struct Env; +struct MemoryInfo; +} // namespace Ort + namespace o2::OrtDataType { struct Float16_t; @@ -30,6 +39,7 @@ struct Float16_t; namespace o2::gpu { +class GPUReconstruction; class GPUTPCNNClusterizer; struct GPUSettingsProcessingNNclusterizer; @@ -41,13 +51,17 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::map); + + // ONNX + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, int32_t = 0); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector modelsUsed = {false, false, false}; // 0: class, 1: reg_1, 2: reg_2 int32_t deviceId = -1; std::vector reg_model_paths; + + std::shared_ptr mockedAlloc_class = nullptr, mockedAlloc_reg_1 = nullptr, mockedAlloc_reg_2 = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From 497a9d421671f78f93c5b266235e5c0742aa4df1 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Wed, 16 Apr 2025 11:54:58 +0000 Subject: [PATCH 36/40] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 2 +- .../Base/cuda/GPUReconstructionCUDA.cu | 2 +- .../GPUTPCNNClusterizerHost.cxx | 31 ++++++++++++------- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 6dd3887c82417..520d2273e2185 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -143,7 +143,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) if 
(deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h - (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 247438fa8a13f..382e93f06aea8 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -699,7 +699,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options // api.GetCurrentGpuDeviceId(deviceId); OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream - rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index bda4c70d79c9d..ceda3acd7db46 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -125,7 +125,7 @@ struct MockedOrtAllocator : OrtAllocator { void LeakCheck(); -private: + private: MockedOrtAllocator(const MockedOrtAllocator&) = delete; MockedOrtAllocator& operator=(const MockedOrtAllocator&) = delete; @@ -136,7 +136,8 @@ struct MockedOrtAllocator : OrtAllocator { GPUReconstruction* rec; }; -MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) { +MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info) +{ OrtAllocator::version = ORT_API_VERSION; OrtAllocator::Alloc = [](OrtAllocator* this_, size_t size) { return static_cast(this_)->Alloc(size); }; OrtAllocator::Free = [](OrtAllocator* this_, void* p) { static_cast(this_)->Free(p); }; @@ -146,42 +147,50 @@ MockedOrtAllocator::MockedOrtAllocator(GPUReconstruction* r, OrtMemoryInfo* info memory_info = info; } -MockedOrtAllocator::~MockedOrtAllocator() { +MockedOrtAllocator::~MockedOrtAllocator() +{ // Ort::GetApi().ReleaseMemoryInfo(memory_info); } -void* MockedOrtAllocator::Alloc(size_t 
size) { +void* MockedOrtAllocator::Alloc(size_t size) +{ return rec->AllocateVolatileDeviceMemory(size); } -void* MockedOrtAllocator::Reserve(size_t size) { +void* MockedOrtAllocator::Reserve(size_t size) +{ return rec->AllocateVolatileDeviceMemory(size); } -void MockedOrtAllocator::Free(void* p) { +void MockedOrtAllocator::Free(void* p) +{ rec->ReturnVolatileDeviceMemory(); } -const OrtMemoryInfo* MockedOrtAllocator::Info() const { +const OrtMemoryInfo* MockedOrtAllocator::Info() const +{ return memory_info; } -size_t MockedOrtAllocator::NumAllocations() const { +size_t MockedOrtAllocator::NumAllocations() const +{ return num_allocations.load(); } -size_t MockedOrtAllocator::NumReserveAllocations() const { +size_t MockedOrtAllocator::NumReserveAllocations() const +{ return num_reserve_allocations.load(); } -void MockedOrtAllocator::LeakCheck() { +void MockedOrtAllocator::LeakCheck() +{ if (memory_inuse.load()) LOG(warning) << "memory leak!!!"; } void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) { - if(chooseMockedAlloc == 0) { + if (chooseMockedAlloc == 0) { mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; From a67b634643064c9fa5ab6504a540a0687a513a4e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 16 Apr 2025 15:05:06 +0200 Subject: [PATCH 37/40] Circumvent "unused result" warning and build failure --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ceda3acd7db46..9ca899158c199 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -192,15 +192,15 @@ void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInf { if (chooseMockedAlloc == 0) { mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get())); LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; } else if (chooseMockedAlloc == 1) { mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get())); LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; } else if (chooseMockedAlloc == 2) { mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get()); + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get())); LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; } else { LOG(fatal) << "Invalid choice for mocked allocator"; From 938a1edbe6695280e475560d3315b034ba2db754 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 20 Apr 2025 00:38:19 +0200 Subject: [PATCH 38/40] Adjust for comments --- Common/ML/CMakeLists.txt | 8 ++-- Common/ML/src/OrtInterface.cxx | 4 +- 
GPU/GPUTracking/Base/cuda/CMakeLists.txt | 8 ---- .../Base/cuda/GPUReconstructionCUDA.cu | 5 ++- GPU/GPUTracking/Base/hip/CMakeLists.txt | 10 +---- GPU/GPUTracking/CMakeLists.txt | 3 +- .../Global/GPUChainTrackingClusterizer.cxx | 30 +++++++++------ .../GPUTPCNNClusterizerHost.cxx | 38 +++++++++++-------- .../GPUTPCNNClusterizerHost.h | 10 +++-- GPU/Workflow/src/GPUWorkflowSpec.cxx | 38 +++++++++---------- 10 files changed, 76 insertions(+), 78 deletions(-) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 7e2107651cf10..2db91fc4f4320 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -16,7 +16,7 @@ o2_add_library(ML # Pass ORT variables as a preprocessor definition target_compile_definitions(${targetName} PRIVATE - ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + $<$:ORT_ROCM_BUILD> + $<$:ORT_CUDA_BUILD> + $<$:ORT_MIGRAPHX_BUILD> + $<$:ORT_TENSORRT_BUILD>) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 520d2273e2185..a0665841bec31 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -144,7 +144,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time - + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", ("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 @@ -158,7 +158,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) } pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceIndex, OrtMemType::OrtMemTypeDefault); if (loggingLevel < 2) { - LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex; + LOG(info) << "(ORT) Memory info set to on-device memory for device type " << deviceType << " with ID " << deviceIndex << " and pImplOrt pointer " << pImplOrt; } } #endif diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index e4877d8ccef25..554f700bd57df 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -122,17 +122,9 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") 
${CMAKE_CURRENT_SOURCE_DIR} TARGETVARNAME targetName) - message("Compile definitions for ONNX runtime (CUDA):") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ ORT_TENSORRT_BUILD=$) install(FILES ${HDRS} DESTINATION include/GPU) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 32e3ae76abe7d..741f160158b43 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -655,7 +655,7 @@ void GPUReconstructionCUDA::endGPUProfiling() void GPUReconstructionCUDA::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { -#if defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1 +#ifdef ORT_CUDA_BUILD cudaGetDevice(deviceId); OrtCUDAProviderOptionsV2* cuda_options = nullptr; CreateCUDAProviderOptions(&cuda_options); @@ -684,7 +684,7 @@ void* GPUReconstructionHIP::getGPUPointer(void* ptr) void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options, int32_t stream, int32_t* deviceId) { -#if defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1 +#ifdef ORT_ROCM_BUILD // Create ROCm provider options cudaGetDevice(deviceId); // const auto& api = Ort::GetApi(); @@ -692,6 +692,7 @@ void GPUReconstructionHIP::SetONNXGPUStream(Ort::SessionOptions& session_options OrtROCMProviderOptions rocm_options; rocm_options.has_user_compute_stream = 1; // Indicate that we are passing a user stream rocm_options.arena_extend_strategy = 0; // kNextPowerOfTwo = 0, kSameAsRequested = 1 -> https://github.com/search?q=repo%3Amicrosoft%2Fonnxruntime%20kSameAsRequested&type=code + // rocm_options.gpu_mem_limit = 1073741824; // 0 means no limit rocm_options.user_compute_stream = mInternals->Streams[stream]; session_options.AppendExecutionProvider_ROCM(rocm_options); #endif // ORT_ROCM_BUILD diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index c3cee1e4ebf18..bd3ebe6bc667f 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -170,18 +170,10 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${GPUCA_HIP_SOURCE_DIR} TARGETVARNAME targetName) - message("Compile definitions for ONNX runtime (HIP / ROCM):") - message(STATUS "ORT_ROCM_BUILD: ${ORT_ROCM_BUILD}") - message(STATUS "ORT_CUDA_BUILD: ${ORT_CUDA_BUILD}") - message(STATUS "ORT_MIGRAPHX_BUILD: ${ORT_MIGRAPHX_BUILD}") - message(STATUS "ORT_TENSORRT_BUILD: ${ORT_TENSORRT_BUILD}") - target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 ORT_ROCM_BUILD=$ - ORT_CUDA_BUILD=$ - ORT_MIGRAPHX_BUILD=$ - ORT_TENSORRT_BUILD=$) + ORT_MIGRAPHX_BUILD=$) install(FILES ${HDRS} DESTINATION include/GPU) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 414f72eef7329..e82799b9e59c3 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -349,8 +349,7 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") ${targetName} PRIVATE $) - target_compile_definitions(${targetName} PRIVATE - GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) + target_compile_definitions(${targetName} PRIVATE GPUCA_O2_LIB GPUCA_TPC_GEOMETRY_O2 GPUCA_HAS_ONNX=1) o2_target_root_dictionary(${MODULE} HEADERS 
${HDRS_CINT_O2} ${HDRS_CINT_O2_ADDITIONAL}
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 0b9897977cc98..7db0ba66305e9 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -42,7 +42,6 @@
 #ifdef GPUCA_HAS_ONNX
 #include "GPUTPCNNClusterizerKernels.h"
 #include "GPUTPCNNClusterizerHost.h"
-// #include "ML/3rdparty/GPUORTFloat16.h"
 #endif

 using namespace o2::gpu;
@@ -628,6 +627,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       int32_t deviceId = -1;
       int32_t numLanes = GetProcessingSettings().nTPCClustererLanes;
       int32_t maxThreads = mRec->getNKernelHostThreads(true);
+      // bool recreateMemoryAllocator = false;
       mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
         nnApplications[lane].init(nn_settings);
         if (nnApplications[lane].modelsUsed[0]) {
@@ -637,7 +637,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
             nnApplications[lane].model_class.setIntraOpNumThreads(maxThreads);
           }
           (nnApplications[lane].model_class).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, 0);
+          // Registering this once seems to be enough, even with different environments / models. ONNX apparently uses this per device and stores the OrtAllocator internally. All models will then use the volatile allocation.
+          // But the environment must be valid, so we init the model environment first and use it here afterwards.
+          // Either this is done in one environment with lane == 0 or by recreating the allocator using recreateMemoryAllocator.
+          // TODO: Volatile allocation works for reserving, but not yet for allocations when binding the input tensor
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
+          // recreateMemoryAllocator = true;
           (nnApplications[lane].model_class).initSession();
         }
         if (nnApplications[lane].modelsUsed[1]) {
@@ -648,7 +653,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           }
           // (nnApplications[lane].model_reg_1).setEnv((nnApplications[lane].model_class).getEnv());
           (nnApplications[lane].model_reg_1).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, 1);
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_1).getEnv(), (nnApplications[lane].model_reg_1).getMemoryInfo(), mRec, recreateMemoryAllocator);
           (nnApplications[lane].model_reg_1).initSession();
         }
         if (nnApplications[lane].modelsUsed[2]) {
@@ -657,9 +662,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
           if (nnApplications[lane].model_reg_2.getIntraOpNumThreads() > maxThreads) {
             nnApplications[lane].model_reg_2.setIntraOpNumThreads(maxThreads);
           }
-          // (nnApplications[lane].model_reg_2).setEnv((nnApplications[lane].model_class).getEnv());
           (nnApplications[lane].model_reg_2).initEnvironment();
-          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_reg_2).getEnv(), (nnApplications[lane].model_reg_2).getMemoryInfo(), mRec, 2);
+          // nnApplications[lane].volatileOrtAllocator((nnApplications[lane].model_class).getEnv(), (nnApplications[lane].model_class).getMemoryInfo(), mRec, recreateMemoryAllocator);
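// Sketch of the wiring described in the comments above, assembled from the commented-out calls in
// this patch; it is not active code here, since volatile allocation does not yet work for the input
// tensor binding. Names (volatileOrtAllocator, getEnv, getMemoryInfo, initEnvironment, initSession,
// recreateMemoryAllocator) come from this series; the "host" alias is introduced only for brevity.
bool recreateMemoryAllocator = false;
mRec->runParallelOuterLoop(doGPU, numLanes, [&](uint32_t lane) {
  auto& host = nnApplications[lane];
  host.model_class.initEnvironment();                        // the Ort::Env must exist before registering
  host.volatileOrtAllocator(host.model_class.getEnv(),       // register the mocked allocator once per device
                            host.model_class.getMemoryInfo(),
                            mRec, recreateMemoryAllocator);  // recreate == true unregisters and re-registers
  recreateMemoryAllocator = true;
  host.model_class.initSession();                            // sessions pick it up via session.use_env_allocators
});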
(nnApplications[lane].model_reg_2).initSession(); } if (nn_settings.nnClusterizerVerbosity < 3) { @@ -685,6 +689,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (doGPU) { WriteToConstantMemory(RecoStep::TPCClusterFinding, (char*)&processors()->tpcNNClusterer - (char*)processors(), &processorsShadow()->tpcNNClusterer, sizeof(GPUTPCNNClusterizer) * NSECTORS, mRec->NStreams() - 1, &mEvents->init); } + LOG(info) << "Size of nnApplications[lane]: " << sizeof(nnApplications[0]) << " bytes"; + LOG(info) << "Size of nnApplications: " << sizeof(GPUTPCNNClusterizerHost) * GetProcessingSettings().nTPCClustererLanes << " bytes"; } #endif @@ -966,8 +972,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.nnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, batchStart); // Filling the data - // auto stop0 = std::chrono::high_resolution_clock::now(); + // auto stop0 = std::chrono::high_resolution_clock::now(); // auto start1 = std::chrono::high_resolution_clock::now(); // NN evaluations @@ -1048,12 +1054,12 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; // time_fill += std::chrono::duration_cast(stop0 - start0).count() / 1e9; } - // if (clustererNNShadow.nnClusterizerUseCfRegression) { - // auto start1 = std::chrono::high_resolution_clock::now(); - // runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 - // auto stop1 = std::chrono::high_resolution_clock::now(); - // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; - // } + if (clustererNNShadow.nnClusterizerUseCfRegression) { + // auto start1 = std::chrono::high_resolution_clock::now(); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNNShadow.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + // auto stop1 = std::chrono::high_resolution_clock::now(); + // time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; + } // if (clustererNNShadow.nnClusterizerVerbosity < 3) { // int acceptedClusters = 0; // for (size_t i = 0; i < clusterer.mPmemory->counters.nClusters; ++i) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 9ca899158c199..2e98ca1982ad5 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -54,7 +54,8 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, {"enable-profiling", std::to_string(settings.nnInferenceOrtProfiling)}, {"profiling-output-path", settings.nnInferenceOrtProfilingPath}, - {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; + {"logging-level", std::to_string(settings.nnInferenceVerbosity)}, + {"onnx-environment-name", "c1"}}; model_class.initOptions(OrtOptions); modelsUsed[0] = true; @@ -64,13 +65,16 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set 
if (!settings.nnClusterizerUseCfRegression) { if (reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; + OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; } else { OrtOptions["model-path"] = reg_model_paths[0]; + OrtOptions["onnx-environment-name"] = "r1"; model_reg_1.initOptions(OrtOptions); modelsUsed[1] = true; OrtOptions["model-path"] = reg_model_paths[1]; + OrtOptions["onnx-environment-name"] = "r2"; model_reg_2.initOptions(OrtOptions); modelsUsed[2] = true; } @@ -154,16 +158,19 @@ MockedOrtAllocator::~MockedOrtAllocator() void* MockedOrtAllocator::Alloc(size_t size) { + // LOG(info) << "(ORT) Allocating volatile memory of size " << size << " bytes"; return rec->AllocateVolatileDeviceMemory(size); } void* MockedOrtAllocator::Reserve(size_t size) { + // LOG(info) << "(ORT) Reserving volatile memory of size " << size << " bytes"; return rec->AllocateVolatileDeviceMemory(size); } void MockedOrtAllocator::Free(void* p) { + // LOG(info) << "(ORT) Freeing volatile memory " << p; rec->ReturnVolatileDeviceMemory(); } @@ -188,21 +195,20 @@ void MockedOrtAllocator::LeakCheck() LOG(warning) << "memory leak!!!"; } -void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, int32_t chooseMockedAlloc) +void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInfo* memInfo, GPUReconstruction* rec, bool recreate) { - if (chooseMockedAlloc == 0) { - mockedAlloc_class = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_class.get())); - LOG(info) << "(ORT) Mocked ORT allocator for classification network registered"; - } else if (chooseMockedAlloc == 1) { - mockedAlloc_reg_1 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_1.get())); - LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 1) registered"; - } else if (chooseMockedAlloc == 2) { - mockedAlloc_reg_2 = std::make_shared(rec, (OrtMemoryInfo*)memInfo); - Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc_reg_2.get())); - LOG(info) << "(ORT) Mocked ORT allocator for regression network (class 2) registered"; - } else { - LOG(fatal) << "Invalid choice for mocked allocator"; + mockedAlloc = std::make_shared(rec, (OrtMemoryInfo*)(*memInfo)); + if (recreate) { + Ort::ThrowOnError(Ort::GetApi().UnregisterAllocator((OrtEnv*)(*env), (OrtMemoryInfo*)(*memInfo))); } + Ort::ThrowOnError(Ort::GetApi().RegisterAllocator((OrtEnv*)(*env), mockedAlloc.get())); + memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); +} + +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() { + return mockedAlloc->Info(); +} + +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() { + return mockedAlloc.get(); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 1e0df7ea578f1..0379b83d0ae02 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -22,8 +22,8 @@ using namespace o2::ml; -struct OrtAllocator; -struct OrtMemoryInfo; +class OrtMemoryInfo; +class OrtAllocator; struct MockedOrtAllocator; namespace Ort { @@ -53,7 +53,9 @@ class GPUTPCNNClusterizerHost void initClusterizer(const GPUSettingsProcessingNNclusterizer&, 
GPUTPCNNClusterizer&); // ONNX - void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, int32_t = 0); + void volatileOrtAllocator(Ort::Env*, Ort::MemoryInfo*, GPUReconstruction*, bool = false); + MockedOrtAllocator* getMockedAllocator(); + const OrtMemoryInfo* getMockedMemoryInfo(); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters @@ -61,7 +63,7 @@ class GPUTPCNNClusterizerHost int32_t deviceId = -1; std::vector reg_model_paths; - std::shared_ptr mockedAlloc_class = nullptr, mockedAlloc_reg_1 = nullptr, mockedAlloc_reg_2 = nullptr; + std::shared_ptr mockedAlloc = nullptr; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index dde5810b89c82..8a755a703705f 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -119,7 +119,6 @@ GPURecoWorkflowSpec::GPURecoWorkflowSpec(GPURecoWorkflowSpec::CompletionPolicyDa mConfig.reset(new GPUO2InterfaceConfiguration); mConfParam.reset(new GPUSettingsO2); mTFSettings.reset(new GPUSettingsTF); - mNNClusterizerSettings.reset(new GPUSettingsProcessingNNclusterizer); mTimer.reset(new TStopwatch); mPipeline.reset(new GPURecoWorkflowSpec_PipelineInternals); @@ -134,43 +133,44 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); + GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; - if (mNNClusterizerSettings->nnLoadFromCCDB) { + if (mNNClusterizerSettings.nnLoadFromCCDB) { LOG(info) << "Loading neural networks from CCDB"; o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; - nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings->nnCCDBURL); + nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings->nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"inputDType", mNNClusterizerSettings->nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings->nnInferenceOutputDType}, - {"outputFolder", mNNClusterizerSettings->nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings->nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings->nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings->nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings->nnCCDBInteractionRate)}}; - - std::string nnFetchFolder = mNNClusterizerSettings->nnLocalFolder; - std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings->nnEvalMode, ':'); + {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, + {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, + {"outputFolder", mNNClusterizerSettings.nnLocalFolder}, + {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, + {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; + + std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; + std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); if (evalMode[0] == "c1") { - ccdbSettings["nnCCDBLayerType"] = 
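The CCDB fetch below is steered by the nnEvalMode string, e.g. "c1:r1" or "c2:r2": the first token selects the classification network, the second decides whether a second regression network is fetched in addition to regression_c1. A compact sketch of that selection, using a plain standard-library tokenizer instead of o2::utils::Str::tokenize (the helpers splitEvalMode and modelFilesFor are illustrative, not O2 code):

#include <sstream>
#include <string>
#include <vector>

std::vector<std::string> splitEvalMode(const std::string& mode, char delim = ':')
{
  std::vector<std::string> tokens;
  std::stringstream ss(mode);
  for (std::string tok; std::getline(ss, tok, delim);) {
    tokens.push_back(tok);
  }
  return tokens;
}

std::vector<std::string> modelFilesFor(const std::string& nnEvalMode)
{
  const auto mode = splitEvalMode(nnEvalMode);                 // e.g. {"c1", "r1"}
  std::vector<std::string> files;
  files.push_back("net_classification_" + mode[0] + ".onnx");  // "c1" or "c2"
  files.push_back("net_regression_c1.onnx");                   // first regression net is always fetched
  if (mode.size() > 1 && mode[1] == "r2") {
    files.push_back("net_regression_c2.onnx");                 // optional split-cluster regression net
  }
  return files;
}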
mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; ccdbSettings["nnCCDBEvalType"] = "classification_c1"; ccdbSettings["outputFile"] = "net_classification_c1.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); } else if (evalMode[0] == "c2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBClassificationLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; ccdbSettings["nnCCDBEvalType"] = "classification_c2"; ccdbSettings["outputFile"] = "net_classification_c2.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); } - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; ccdbSettings["nnCCDBEvalType"] = "regression_c1"; ccdbSettings["outputFile"] = "net_regression_c1.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); if (evalMode[1] == "r2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings->nnCCDBRegressionLayerType; + ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; ccdbSettings["nnCCDBEvalType"] = "regression_c2"; ccdbSettings["outputFile"] = "net_regression_c2.onnx"; nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); From 7b07496138b0f17bef84628ec21ddd0e93b0cb17 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 19 Apr 2025 22:39:06 +0000 Subject: [PATCH 39/40] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 2 +- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index a0665841bec31..8ce6b673660fb 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -144,7 +144,7 @@ void OrtModel::memoryOnDevice(int32_t deviceIndex) (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (pImplOrt->sessionOptions).AddConfigEntry("session.use_env_allocators", "1"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time - (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time + (pImplOrt->sessionOptions).AddConfigEntry("session_options.enable_cpu_mem_arena", "0"); // This should enable to use the volatile memory allocation defined in O2/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx; not working yet: ONNX still assigns new memory at init time // Arena memory shrinkage comes at performance cost /// For now prefer to use single allocation, enabled by O2/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu -> SetONNXGPUStream -> rocm_options.arena_extend_strategy = 0; // (pImplOrt->runOptions).AddConfigEntry("memory.enable_memory_arena_shrinkage", 
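The run and session options referenced in the comments above are standard ONNX Runtime configuration keys. A minimal sketch of applying them through the public C++ API, assuming a hypothetical helper configureDeviceMemoryOptions and a caller-supplied deviceIndex (this is not the OrtModel::memoryOnDevice implementation):

#include <onnxruntime_cxx_api.h>
#include <string>

void configureDeviceMemoryOptions(Ort::SessionOptions& so, Ort::RunOptions& ro, int deviceIndex)
{
  // Allocate initializers through the device allocator instead of staging them on the CPU.
  so.AddConfigEntry("session.use_device_allocator_for_initializers", "1");
  // Let a RegisterAllocator()-registered environment allocator (the mocked one) be used.
  so.AddConfigEntry("session.use_env_allocators", "1");
  // Same intent as the enable_cpu_mem_arena entry: do not build a CPU arena at all.
  so.DisableCpuMemArena();
  // Per-run options: skip execution-provider synchronization, optionally shrink the arena.
  ro.AddConfigEntry("disable_synchronize_execution_providers", "1");
  const std::string shrink = "gpu:" + std::to_string(deviceIndex);
  ro.AddConfigEntry("memory.enable_memory_arena_shrinkage", shrink.c_str());
}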
("gpu:" + std::to_string(deviceIndex)).c_str()); // See kOrtRunOptionsConfigEnableMemoryArenaShrinkage, https://github.com/microsoft/onnxruntime/blob/90c263f471bbce724e77d8e62831d3a9fa838b2f/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h#L27 diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 2e98ca1982ad5..db2f05711f537 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -205,10 +205,12 @@ void GPUTPCNNClusterizerHost::volatileOrtAllocator(Ort::Env* env, Ort::MemoryInf memInfo = (Ort::MemoryInfo*)mockedAlloc->Info(); } -const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() { +const OrtMemoryInfo* GPUTPCNNClusterizerHost::getMockedMemoryInfo() +{ return mockedAlloc->Info(); } -MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() { +MockedOrtAllocator* GPUTPCNNClusterizerHost::getMockedAllocator() +{ return mockedAlloc.get(); } From af89c9a63b025828b6d83a0598846aca00a1498d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 20 Apr 2025 09:01:47 +0200 Subject: [PATCH 40/40] Fixing build flags --- Common/ML/src/OrtInterface.cxx | 2 +- GPU/GPUTracking/Base/cuda/CMakeLists.txt | 4 ++-- GPU/GPUTracking/Base/hip/CMakeLists.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8ce6b673660fb..24a2fbffb252c 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -139,7 +139,7 @@ void OrtModel::initSession() void OrtModel::memoryOnDevice(int32_t deviceIndex) { -#if (defined(ORT_ROCM_BUILD) && ORT_ROCM_BUILD == 1) || (defined(ORT_MIGRAPHX_BUILD) && ORT_MIGRAPHX_BUILD == 1) || (defined(ORT_CUDA_BUILD) && ORT_CUDA_BUILD == 1) +#if (defined(ORT_ROCM_BUILD) || defined(ORT_MIGRAPHX_BUILD) || defined(ORT_CUDA_BUILD) || defined(ORT_TENSORRT_BUILD)) if (deviceIndex >= 0) { (pImplOrt->runOptions).AddConfigEntry("disable_synchronize_execution_providers", "1"); (pImplOrt->sessionOptions).AddConfigEntry("session.use_device_allocator_for_initializers", "1"); // See kOrtSessionOptionsUseDeviceAllocatorForInitializers, https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 554f700bd57df..f595fb051db54 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -124,8 +124,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_CUDA_BUILD=$ - ORT_TENSORRT_BUILD=$) + $<$:ORT_CUDA_BUILD> + $<$:ORT_TENSORRT_BUILD>) install(FILES ${HDRS} DESTINATION include/GPU) endif() diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index bd3ebe6bc667f..d7adb222d547b 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -172,8 +172,8 @@ if(ALIGPU_BUILD_TYPE STREQUAL "O2") target_compile_definitions(${targetName} PRIVATE GPUCA_HAS_ONNX=1 - ORT_ROCM_BUILD=$ - ORT_MIGRAPHX_BUILD=$) + $<$:ORT_ROCM_BUILD> + $<$:ORT_MIGRAPHX_BUILD>) install(FILES ${HDRS} DESTINATION include/GPU)