From 84eac06b0bb1562d826b4ace3a8435c3385b91a0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 15 Mar 2025 21:36:05 +0100 Subject: [PATCH 01/10] Initial set of bug.fixes and cosmetic changes --- .../Global/GPUChainTrackingClusterizer.cxx | 15 +- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 38 +++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 3 +- .../GPUTPCNNClusterizerHost.cxx | 4 +- .../GPUTPCNNClusterizerHost.h | 17 -- .../GPUTPCNNClusterizerKernels.cxx | 160 ++++++++---------- 6 files changed, 101 insertions(+), 136 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 63d56da37595b..546f62b6c35d6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -614,7 +614,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX if (GetProcessingSettings().nn.applyNNclusterizer) { - uint32_t maxClusters = -1; + uint32_t maxClusters = 0; for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } @@ -918,6 +918,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); + int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); @@ -930,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = 
std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 1 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, 
iSector, clustererNN.nnClusterizerDtype, 0, batchStart); // Running the NN for regression class 2 + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -956,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, 0, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 6a9b6f546ae07..df0f895cd5976 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,25 +24,29 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { - computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); - } - computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, clusterFlags, 2 * 
nnClusterizerBatchedMode); - computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); - computePointerWithAlignment(mem, outputDataClass, nnClusterizerTotalClusters); - if (nnClusterizerModelClassNumOutputNodes > 0) { - computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); - } - if (!nnClusterizerUseCfRegression) { - if (nnClusterizerModelReg1NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); + } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } - if (nnClusterizerModelReg2NumOutputNodes > 0) { - computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); + computePointerWithAlignment(mem, clusterFlags, 2 * nnClusterizerBatchedMode); + computePointerWithAlignment(mem, centralCharges, nnClusterizerBatchedMode); + if (nnClusterizerModelClassNumOutputNodes > 0) { + computePointerWithAlignment(mem, modelProbabilities, nnClusterizerBatchedMode * nnClusterizerModelClassNumOutputNodes); } + if (!nnClusterizerUseCfRegression) { + if (nnClusterizerModelReg1NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg1, nnClusterizerBatchedMode * nnClusterizerModelReg1NumOutputNodes); + } + if (nnClusterizerModelReg2NumOutputNodes > 0) { + computePointerWithAlignment(mem, outputDataReg2, nnClusterizerBatchedMode * nnClusterizerModelReg2NumOutputNodes); + } + } + } + if (nnClusterizerTotalClusters > 0) { + computePointerWithAlignment(mem, 
outputDataClass, nnClusterizerTotalClusters); } return mem; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index ea6340dfd48bc..01d1873f3b351 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -42,7 +42,7 @@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerSizeInputTime = 3; int nnClusterizerElementSize = -1; bool nnClusterizerAddIndexData = true; - float nnClassThreshold = 0.16; + float nnClassThreshold = 0.01; bool nnSigmoidTrafoClassThreshold = 1; int nnClusterizerUseCfRegression = 0; int nnClusterizerBatchedMode = 1; @@ -58,7 +58,6 @@ class GPUTPCNNClusterizer : public GPUProcessor int mISector = -1; // Memory allocation for neural network - uint class2_elements = 0; float* inputData32 = nullptr; OrtDataType::Float16_t* inputData16 = nullptr; float* outputDataClass = nullptr; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 5002c63524020..321fad3d039db 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -12,6 +12,8 @@ /// \file GPUTPCNNClusterizerHost.cxx /// \author Christian Sonnabend +#include + #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" #include "GPUSettings.h" @@ -37,7 +39,7 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl model_class.init(OrtOptions); clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 7efa0edecb893..430d78d0bb2fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -44,23 +44,6 @@ class GPUTPCNNClusterizerHost std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - - private: - // Avoid including CommonUtils/StringUtils.h - std::vector splitString(const std::string& input, const std::string& delimiter) - { - std::vector tokens; - std::size_t pos = 0; - std::size_t found; - - while ((found = input.find(delimiter, pos)) != std::string::npos) { - tokens.push_back(input.substr(pos, found - pos)); - pos = found + delimiter.length(); - } - tokens.push_back(input.substr(pos)); - - return tokens; - } }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 25cd2497fbf62..c536303147ae6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -34,7 +34,7 @@ using namespace o2::gpu::tpccf; // Defining individual thread functions for data filling, determining the class label and running the CF clusterizer template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = 
processors.tpcClusterer[sector]; @@ -44,91 +44,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAcc(clusterer)); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; o2::gpu::GPUTPCCFClusterizer::GPUSharedMemory smem_new; GPUTPCCFClusterizer::computeClustersImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), clusterer, clusterer.mPmemory->fragment, smem_new, chargeMap, clusterer.mPfilteredPeakPositions, clusterer.Param().rec, CPU_PTR(&labelAcc), clusterer.mPmemory->counters.nClusters, clusterer.mNMaxClusterPerRow, clusterer.mPclusterInRow, clusterOut, clusterer.mPclusterPosInRow); } template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - GPUTPCNNClusterizerKernels::fillInputData(nBlocks, nThreads, iBlock, iThread, processors, sector, dtype, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - auto& clusterer = processors.tpcNNClusterer[sector]; - uint glo_idx = 
get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; - float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] - uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { - if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; - } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); - } - } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg1(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -template <> -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) -{ - uint glo_idx = get_global_id(0); - if (glo_idx >= processors.tpcClusterer[sector].mPmemory->counters.nClusters) { - return; - } - GPUTPCNNClusterizerKernels::publishClustersReg2(glo_idx, smem, processors, sector, dtype, onlyMC, batchStart); -} - -// THe following arithmetic is done because the network is trained with a split between IROC and OROC 
boundary -GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int row_current, const GPUTPCGeometry& geo) -{ - return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); -} - -GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) -{ - return (row > 62 ? global_shift : 0); -} - -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) -{ - if (pad < 0 || row < 0) { // Faster short-circuit - return true; - } else if (row < 63) { - return (pad >= static_cast(geo.NPads(row))); - } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network - return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { - return (pad >= static_cast(geo.NPads(row - global_shift))); - } else { - return true; - } -} - -// Filling the input data for the neural network where there is no boundary -GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, processorType& processors, uint8_t sector, int8_t dtype, uint batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -144,7 +66,7 @@ GPUd() void GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n clustererNN.peakPositions[glo_idx] = peak; clustererNN.centralCharges[glo_idx] = central_charge; - clustererNN.outputDataClass[glo_idx + batchStart] = -1; + clustererNN.outputDataClass[glo_idx + batchStart] = -1.f; int row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.nnClusterizerSizeInputRow); #ifndef GPUCA_GPUCODE @@ -192,14 +114,43 @@ GPUd() void 
GPUTPCNNClusterizerKernels::fillInputData(int32_t nBlocks, int32_t n } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + uint glo_idx = get_global_id(0); + processors.tpcNNClusterer[sector].outputDataClass[glo_idx + batchStart] = (int)(processors.tpcNNClusterer[sector].modelProbabilities[glo_idx] > processors.tpcNNClusterer[sector].nnClassThreshold); +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + auto& clusterer = processors.tpcNNClusterer[sector]; + uint glo_idx = get_global_id(0); + uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] + uint class_label = 0; + for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + if (pIdx == elem_iterator) { + current_max_prob = clusterer.modelProbabilities[pIdx]; + } else { + class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + } + } + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" + clusterer.outputDataClass[glo_idx + batchStart] = class_label; +} + +template <> +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg1NumOutputNodes; @@ -210,7 +161,7 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha ClusterAccumulator pc; // Publishing logic is taken from default clusterizer - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -223,7 +174,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -272,24 +222,25 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg1(uint glo_idx, GPUSha } } -GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) +template <> +GPUdii() void 
GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) { + uint glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; + Array2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CPU_ONLY(MCLabelAccumulator labelAccElem(clusterer)); MCLabelAccumulator* labelAcc = CPU_PTR(&labelAccElem); - tpc::ClusterNative* clusterOut = (onlyMC) ? nullptr : clusterer.mPclusterByRow; + tpc::ClusterNative* clusterOut = (withMC) ? nullptr : clusterer.mPclusterByRow; uint full_glo_idx = glo_idx + batchStart; int model_output_index = glo_idx * clustererNN.nnClusterizerModelReg2NumOutputNodes; - // LOG(info) << glo_idx << " -- " << model_output_index << " / " << clustererNN.outputDataReg1.size() << " / " << clustererNN.nnClusterizerModelReg2NumOutputNodes << " -- " << clustererNN.peakPositions.size() << " -- " << clustererNN.centralCharges.size(); - if (clustererNN.outputDataClass[full_glo_idx] > 0) { ClusterAccumulator pc; - if (onlyMC) { + if (withMC) { ClusterAccumulator dummy_pc; CPU_ONLY(labelAcc->collect(clustererNN.peakPositions[glo_idx], chargeMap[clustererNN.peakPositions[glo_idx]].unpack())); GPUTPCCFClusterizer::buildCluster( @@ -302,7 +253,6 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha &dummy_pc, labelAcc); } - if ((clusterer.mPmemory->fragment).isOverlap(clustererNN.peakPositions[glo_idx].time())) { if (clusterer.mPclusterPosInRow) { clusterer.mPclusterPosInRow[full_glo_idx] = clusterer.mNMaxClusterPerRow; @@ -384,3 +334,29 @@ GPUd() void GPUTPCNNClusterizerKernels::publishClustersReg2(uint glo_idx, GPUSha return; } } + +// THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary +GPUd() int GPUTPCNNClusterizerKernels::padOffset(int row_ref, int 
row_current, const GPUTPCGeometry& geo) +{ + return (int)((geo.NPads(row_current) - geo.NPads(row_ref)) / 2); +} + +GPUd() int GPUTPCNNClusterizerKernels::rowOffset(int row, int global_shift) +{ + return (row > 62 ? global_shift : 0); +} + +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int row, int pad, int global_shift, const GPUTPCGeometry& geo) +{ + if (pad < 0 || row < 0) { // Faster short-circuit + return true; + } else if (row < 63) { + return (pad >= static_cast(geo.NPads(row))); + } else if (row < (63 + global_shift)) { // to account for the gap between IROC and OROC. Charge will be set to -1 in order to signal boundary to the neural network + return true; + } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + global_shift)) { + return (pad >= static_cast(geo.NPads(row - global_shift))); + } else { + return true; + } +} From 219164923257101b6084bd97700314ea4f109d30 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sat, 15 Mar 2025 20:37:59 +0000 Subject: [PATCH 02/10] Please consider the following formatting changes --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index df0f895cd5976..655e2bf5a933c 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -24,7 +24,7 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { - if (nnClusterizerBatchedMode > 0){ + if (nnClusterizerBatchedMode > 0) { if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { From b742c50537a7aa73812e62079306830884fce271 Mon Sep 17 00:00:00 2001 From: 
Christian Sonnabend Date: Sat, 15 Mar 2025 21:42:35 +0100 Subject: [PATCH 03/10] Adjusting eval sizes. Makes code neater and avoids some calculations --- Common/ML/src/OrtInterface.cxx | 13 ++++++------- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 6 +++--- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index fc784dd14d2dc..ae809a2ba5c1a 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -226,19 +226,18 @@ template std::vector OrtModel::inference void OrtModel::inference(I* input, size_t input_size, O* output) { - std::vector inputShape{(int64_t)(input_size / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + std::vector inputShape{input_size, (int64_t)mInputShapes[0][1]}; Ort::Value inputTensor = Ort::Value(nullptr); if constexpr (std::is_same_v) { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input), input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } else { - inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size, inputShape.data(), inputShape.size()); + inputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, input, input_size * mInputShapes[0][1], inputShape.data(), inputShape.size()); } - std::vector outputShape{inputShape[0], mOutputShapes[0][1]}; - size_t outputSize = (int64_t)(input_size * mOutputShapes[0][1] / mInputShapes[0][1]); - Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, outputSize, outputShape.data(), outputShape.size()); + std::vector outputShape{input_size, mOutputShapes[0][1]}; + Ort::Value outputTensor = Ort::Value::CreateTensor(pImplOrt->memoryInfo, output, input_size * mOutputShapes[0][1], outputShape.data(), outputShape.size()); - 
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is correct here + (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), &inputTensor, 1, outputNamesChar.data(), &outputTensor, outputNamesChar.size()); // TODO: Not sure if 1 is always correct here } template void OrtModel::inference(OrtDataType::Float16_t*, size_t, float*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 321fad3d039db..b32d042ebd1fa 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -57,11 +57,11 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl } } -void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype) +void GPUTPCNNClusterizerHost::networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clustererNN, size_t size, float* output, int32_t dtype) { if (dtype == 0) { - model.inference(clusterer.inputData16, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData16, size, output); } else { - model.inference(clusterer.inputData32, size * clusterer.nnClusterizerElementSize, output); + model.inference(clustererNN.inputData32, size, output); } } From 0c1cfb742e987ab50d87c0f5023a63841531335d Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 20 Mar 2025 13:13:25 +0100 Subject: [PATCH 04/10] Adding separate functions. 
Now the host process only needs one instance and one initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +++--- .../GPUTPCNNClusterizerHost.cxx | 24 +++++++++++++++---- .../GPUTPCNNClusterizerHost.h | 3 +++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b928ed7c177eb..916f2634fb2f6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -612,14 +612,16 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { uint32_t maxClusters = 0; + nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { maxClusters = std::max(maxClusters, processors()->tpcClusterer[iSector].mNMaxClusters); } for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; clustererNN.nnClusterizerUseCfRegression = nn_settings.nnClusterizerUseCfRegression; clustererNN.nnClusterizerSizeInputRow = nn_settings.nnClusterizerSizeInputRow; clustererNN.nnClusterizerSizeInputPad = nn_settings.nnClusterizerSizeInputPad; @@ -640,7 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; - GPUTPCNNClusterizerHost nnApplication(nn_settings, 
clustererNN); + nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } } @@ -916,7 +918,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) #ifdef GPUCA_HAS_ONNX GPUTPCNNClusterizer& clustererNN = processors()->tpcNNClusterer[iSector]; const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; - GPUTPCNNClusterizerHost nnApplication(nn_settings, clustererNN); int withMC = (doGPU && propagateMCLabels); if (clustererNN.nnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b32d042ebd1fa..a1f78ca787282 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -21,7 +21,12 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +{ + init(settings); +} + +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { {"model-path", settings.nnClassificationPath}, @@ -37,21 +42,30 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl {"logging-level", std::to_string(settings.nnInferenceVerbosity)}}; model_class.init(OrtOptions); - clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = splitString(settings.nnRegressionPath, ":"); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { OrtOptions["model-path"] = reg_model_paths[0]; 
model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { OrtOptions["model-path"] = reg_model_paths[0]; model_reg_1.init(OrtOptions); - clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; OrtOptions["model-path"] = reg_model_paths[1]; model_reg_2.init(OrtOptions); + } + } +} + +void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclusterizer& settings, GPUTPCNNClusterizer& clusterer) +{ + clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; + if (!settings.nnClusterizerUseCfRegression) { + if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; + } else { + clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; clusterer.nnClusterizerModelReg2NumOutputNodes = model_reg_2.getNumOutputNodes()[0][1]; } } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 430d78d0bb2fb..1f31567dc42f1 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -39,6 +39,9 @@ class GPUTPCNNClusterizerHost GPUTPCNNClusterizerHost() = default; GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void init(const GPUSettingsProcessingNNclusterizer&); + void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; From 83c004fa0f84ab8fec7a7458e210ad63ab7a489f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 22 Mar 2025 16:55:50 +0100 Subject: [PATCH 05/10] First version of CCDB implementation 
--- GPU/GPUTracking/Definitions/GPUSettingsList.h | 8 ++++++ .../Global/GPUChainTrackingClusterizer.cxx | 21 +++++++++++++++ .../GPUTPCNNClusterizerHost.cxx | 26 +++++++++++++++++++ .../GPUTPCNNClusterizerHost.h | 1 + 4 files changed, 56 insertions(+) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 40a7fc71cbb4d..7611e810768fe 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -249,6 +249,14 @@ AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The c AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") +// CCDB +AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") +AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") +AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. 
Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 916f2634fb2f6..c7816bb9ec17c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -615,6 +615,27 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { + if(nn_settings.nnLoadFromCCDB) { + std::unordered_map ccdbSettings = { + {"nnCCDBPath", nn_settings.nnCCDBPath}, + {"inputDType", nn_settings.inputDType}, + {"outputDType", nn_settings.outputDType}, + {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, + {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, + {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} + }; + + std::unordered_map networkRetrieval = ccdbSettings; + + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx 
b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index a1f78ca787282..20190994b97ba 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -26,6 +26,32 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } +void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { + o2::ccdb::CcdbApi ccdbApi; + ccdbApi.init(settings["nnCCDBURL"]); + + metadata[settings["inputDType"]] = settings["inputDType"]; + metadata[settings["outputDType"]] = settings["outputDType"]; + metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata[settings["nnCCDBWithMomentum"]] = std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model + metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { + metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + } + if (settings["nnCCDBBeamType"] != "") { + metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + } + + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + + if (retrieveSuccess) { + LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + } else { + LOG(error) << "Failed to retrieve network from CCDB"; + } +} + void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { OrtOptions = { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 
1f31567dc42f1..a3f3ecd72ffca 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -41,6 +41,7 @@ class GPUTPCNNClusterizerHost void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + void loadFromCCDB(std::unordered_map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); From d767ed1b636f97c0fe8447e8e3ebfc854a4aa214 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 00:16:18 +0100 Subject: [PATCH 06/10] Working CCDB API calls (tested with test-ccdb) --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 11 +++++--- .../Global/GPUChainTrackingClusterizer.cxx | 13 ++++++---- .../GPUTPCNNClusterizerHost.cxx | 25 ++++++++++--------- .../GPUTPCNNClusterizerHost.h | 8 ++++-- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 7611e810768fe..5b4d08f5ffe67 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -229,6 +229,8 @@ AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify i AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 +AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) 
Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceInterOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetInterOpNumThreads). 0 = auto-detect, can lead to problems on SLURM systems.") AddOption(nnInferenceEnableOrtOptimization, unsigned int, 99, "", 0, "Enables graph optimizations in ONNX Runtime. Can be [0, 1, 2, 99] -> see https://github.com/microsoft/onnxruntime/blob/3f71d637a83dc3540753a8bb06740f67e926dc13/include/onnxruntime/core/session/onnxruntime_c_api.h#L347") @@ -251,12 +253,13 @@ AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regress AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") // CCDB AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBURL, std::string, "http://alice-ccdb.cern.ch", "", 0, "The CCDB URL from where the network files are fetched") +AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") -AddOption(nnCCDBLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") -AddOption(nnCCDBInteractionRate, int, -1, "", 0, "Distinguishes between networks for different interaction rates [kHz].") +AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") +AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp") +AddOption(nnCCDBInteractionRate, int, 50, "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c7816bb9ec17c..fb6bffe51a160 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -616,26 +616,29 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { - std::unordered_map ccdbSettings = { + std::map ccdbSettings = { + {"nnCCDBURL", nn_settings.nnCCDBURL}, {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.inputDType}, - {"outputDType", nn_settings.outputDType}, + {"inputDType", nn_settings.nnInferenceInputDType}, + {"outputDType", nn_settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBLayerType", nn_settings.nnCCDBLayerType}, 
{"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; - std::unordered_map networkRetrieval = ccdbSettings; + std::map networkRetrieval = ccdbSettings; + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; networkRetrieval["nnCCDBEvalType"] = "classification_c1"; networkRetrieval["outputFile"] = "net_classification_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; networkRetrieval["outputFile"] = "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); } + uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 20190994b97ba..b4ee558b1e201 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -16,6 +16,7 @@ #include "GPUTPCNNClusterizerHost.h" #include "GPUTPCNNClusterizer.h" +#include "CCDB/CcdbApi.h" #include "GPUSettings.h" #include "ML/3rdparty/GPUORTFloat16.h" @@ -26,27 +27,27 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::unordered_map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); - metadata[settings["inputDType"]] = settings["inputDType"]; - metadata[settings["outputDType"]] = settings["outputDType"]; - metadata[settings["nnCCDBEvalType"]] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata[settings["nnCCDBWithMomentum"]] = 
std::stoi(settings["nnCCDBWithMomentum"]); // 0, 1 -> Only for regression model - metadata[settings["nnCCDBLayerType"]] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["inputDType"] = settings["inputDType"]; + metadata["outputDType"] = settings["outputDType"]; + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata[settings["nnCCDBInteractionRate"]] = settings["nnCCDBInteractionRate"]; + metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } if (settings["nnCCDBBeamType"] != "") { - metadata[settings["nnCCDBBeamType"]] = settings["nnCCDBBeamType"]; + metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; } - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnPathCCDB"], ".", metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(nnPathCCDB, metadata, ccdbTimestamp); // potentially needed to init some local variables + bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], ".", metadata, 1, false, settings["outputFile"]); + // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnPathCCDB"] << " retrieved from CCDB, stored at " << settings["networkPathLocal"]; + LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; } else { LOG(error) << "Failed to retrieve network from CCDB"; } @@ -69,7 +70,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set model_class.init(OrtOptions); - reg_model_paths = 
splitString(settings.nnRegressionPath, ":"); + reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index a3f3ecd72ffca..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,17 +37,21 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); - void loadFromCCDB(std::unordered_map); + void loadFromCCDB(std::map); void networkInference(o2::ml::OrtModel model, GPUTPCNNClusterizer& clusterer, size_t size, float* output, int32_t dtype); std::unordered_map OrtOptions; o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; + + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu From ad4b22be3c457fbcb6e0cca852bf8800d7b9929e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 10:38:51 +0100 Subject: [PATCH 07/10] Improve fetching, but have to pass settings by value, not const ref --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 31 +++++++++++++++---- .../GPUTPCNNClusterizerHost.cxx | 4 +-- .../GPUTPCNNClusterizerHost.h | 4 +-- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 
5b4d08f5ffe67..a8a4ae566f485 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -255,6 +255,7 @@ AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If true (default), then AddOption(nnLoadFromCCDB, int, 1, "", 0, "If 1 networks are fetched from ccdb, else locally") AddOption(nnCCDBURL, std::string, "http://ccdb-test.cern.ch:8080", "", 0, "The CCDB URL from where the network files are fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") +AddOption(nnCCDBFetchMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnCCDBWithMomentum, int, 1, "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "CNN", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index fb6bffe51a160..dcd5cc2197e3c 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,6 +40,7 @@ #endif #ifdef GPUCA_HAS_ONNX +#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -612,7 +613,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; + GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { if(nn_settings.nnLoadFromCCDB) { @@ -626,17 +627,35 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} }; + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); std::map networkRetrieval = ccdbSettings; - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = 
nn_settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + } + nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = "net_regression_c1.onnx"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; + if (fetchMode[1] == "r2") { + networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + nnApplication.loadFromCCDB(networkRetrieval); + nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; + } } uint32_t maxClusters = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index b4ee558b1e201..da32d4938ebed 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) { init(settings); } @@ -53,7 +53,7 @@ void GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) +void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) { OrtOptions = { 
{"model-path", settings.nnClassificationPath}, diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..b6d5e48304e0d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); + GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); - void init(const GPUSettingsProcessingNNclusterizer&); + void init(GPUSettingsProcessingNNclusterizer); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 81c646be0d2e27af8707a0b67a651cccc9de5b64 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 24 Mar 2025 11:04:57 +0100 Subject: [PATCH 08/10] Using const ref and moving CCDB calls to host initialization --- .../Global/GPUChainTrackingClusterizer.cxx | 47 +--------------- .../GPUTPCNNClusterizerHost.cxx | 53 +++++++++++++++++-- .../GPUTPCNNClusterizerHost.h | 4 +- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index dcd5cc2197e3c..98a0ec16495c5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -40,7 +40,6 @@ #endif #ifdef GPUCA_HAS_ONNX -#include #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" #endif @@ -613,51 +612,9 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } #ifdef GPUCA_HAS_ONNX - GPUSettingsProcessingNNclusterizer nn_settings = GetProcessingSettings().nn; + const GPUSettingsProcessingNNclusterizer& nn_settings = GetProcessingSettings().nn; GPUTPCNNClusterizerHost nnApplication; // potentially 
this needs to be GPUTPCNNClusterizerHost nnApplication[NSECTORS]; Technically ONNX ->Run() is threadsafe at inference time since its read-only if (GetProcessingSettings().nn.applyNNclusterizer) { - if(nn_settings.nnLoadFromCCDB) { - std::map ccdbSettings = { - {"nnCCDBURL", nn_settings.nnCCDBURL}, - {"nnCCDBPath", nn_settings.nnCCDBPath}, - {"inputDType", nn_settings.nnInferenceInputDType}, - {"outputDType", nn_settings.nnInferenceOutputDType}, - {"nnCCDBWithMomentum", std::to_string(nn_settings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", nn_settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(nn_settings.nnCCDBInteractionRate)} - }; - - std::string nnFetchFolder = ""; - std::vector fetchMode = o2::utils::Str::tokenize(nn_settings.nnCCDBFetchMode, ':'); - std::map networkRetrieval = ccdbSettings; - - if (fetchMode[0] == "c1") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } else if (fetchMode[0] == "c2") { - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBClassificationLayerType; - networkRetrieval["nnCCDBEvalType"] = "classification_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - } - nn_settings.nnClassificationPath = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally - - networkRetrieval["nnCCDBLayerType"] = nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c1"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath = networkRetrieval["outputFile"]; - if (fetchMode[1] == "r2") { - networkRetrieval["nnCCDBLayerType"] = 
nn_settings.nnCCDBRegressionLayerType; - networkRetrieval["nnCCDBEvalType"] = "regression_c2"; - networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; - nnApplication.loadFromCCDB(networkRetrieval); - nn_settings.nnRegressionPath += ":", networkRetrieval["outputFile"]; - } - } - uint32_t maxClusters = 0; nnApplication.init(nn_settings); for (uint32_t iSector = 0; iSector < NSECTORS; iSector++) { @@ -988,7 +945,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!clustererNN.nnClusterizerUseCfRegression) { nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 - if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.reg_model_paths.size() > 1) { + if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index da32d4938ebed..533ac0c7481ff 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -22,7 +22,7 @@ using namespace o2::gpu; -GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer settings) +GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer& settings) { init(settings); } @@ -53,10 +53,55 @@ void 
GPUTPCNNClusterizerHost::loadFromCCDB(std::map se } } -void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) +void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& settings) { + std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; + std::vector reg_model_paths; + + if(settings.nnLoadFromCCDB) { + std::map ccdbSettings = { + {"nnCCDBURL", settings.nnCCDBURL}, + {"nnCCDBPath", settings.nnCCDBPath}, + {"inputDType", settings.nnInferenceInputDType}, + {"outputDType", settings.nnInferenceOutputDType}, + {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, + {"nnCCDBBeamType", settings.nnCCDBBeamType}, + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} + }; + + std::string nnFetchFolder = ""; + std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); + std::map networkRetrieval = ccdbSettings; + + if (fetchMode[0] == "c1") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c1.onnx"; + loadFromCCDB(networkRetrieval); + } else if (fetchMode[0] == "c2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBClassificationLayerType; + networkRetrieval["nnCCDBEvalType"] = "classification_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_classification_c2.onnx"; + loadFromCCDB(networkRetrieval); + } + class_model_path = networkRetrieval["outputFile"]; // Setting the proper path from the where the models will be initialized locally + + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c1"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c1.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path = networkRetrieval["outputFile"]; + if (fetchMode[1] 
== "r2") { + networkRetrieval["nnCCDBLayerType"] = settings.nnCCDBRegressionLayerType; + networkRetrieval["nnCCDBEvalType"] = "regression_c2"; + networkRetrieval["outputFile"] = nnFetchFolder + "net_regression_c2.onnx"; + loadFromCCDB(networkRetrieval); + reg_model_path += ":", networkRetrieval["outputFile"]; + } + } + OrtOptions = { - {"model-path", settings.nnClassificationPath}, + {"model-path", class_model_path}, {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, @@ -70,7 +115,7 @@ void GPUTPCNNClusterizerHost::init(GPUSettingsProcessingNNclusterizer settings) model_class.init(OrtOptions); - reg_model_paths = o2::utils::Str::tokenize(settings.nnRegressionPath, ':'); + reg_model_paths = o2::utils::Str::tokenize(reg_model_path, ':'); if (!settings.nnClusterizerUseCfRegression) { if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index b6d5e48304e0d..798d4af2826b3 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -37,9 +37,9 @@ class GPUTPCNNClusterizerHost { public: GPUTPCNNClusterizerHost() = default; - GPUTPCNNClusterizerHost(GPUSettingsProcessingNNclusterizer); + GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNclusterizer&); - void init(GPUSettingsProcessingNNclusterizer); + void init(const GPUSettingsProcessingNNclusterizer&); void initClusterizer(const GPUSettingsProcessingNNclusterizer&, GPUTPCNNClusterizer&); void loadFromCCDB(std::map); From 566ddb7b0b6133cde807ef5526a2efa66be1a785 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Mar 2025 09:51:18 +0100 Subject: [PATCH 09/10] Simplifications and renaming --- Common/ML/include/ML/OrtInterface.h | 2 +- 
Common/ML/src/OrtInterface.cxx | 1 - GPU/GPUTracking/Definitions/GPUSettingsList.h | 1 - .../Global/GPUChainTrackingClusterizer.cxx | 20 +++++++++---------- .../TPCClusterFinder/GPUTPCNNClusterizer.cxx | 4 ++-- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 2 +- .../GPUTPCNNClusterizerHost.cxx | 3 +-- .../GPUTPCNNClusterizerKernels.cxx | 20 +++++++++++-------- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 93549178848ca..cbd8501f9898f 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -84,7 +84,7 @@ class OrtModel // Environment settings bool mInitialized = false; - std::string modelPath, device = "cpu", dtype = "float", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda + std::string modelPath, device = "cpu", thread_affinity = ""; // device options should be cpu, rocm, migraphx, cuda int intraOpNumThreads = 1, interOpNumThreads = 1, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; std::string printShape(const std::vector&); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index ae809a2ba5c1a..5e9f1a8b0a5b6 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -48,7 +48,6 @@ void OrtModel::reset(std::unordered_map optionsMap) if (!optionsMap["model-path"].empty()) { modelPath = optionsMap["model-path"]; device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? 
std::stoi(optionsMap["intra-op-num-threads"]) : 0); diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index a8a4ae566f485..83f6e320b8f5b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -228,7 +228,6 @@ AddOption(applyNNclusterizer, int, 0, "", 0, "(bool, default = 0), if the neural AddOption(nnInferenceDevice, std::string, "CPU", "", 0, "(std::string) Specify inference device (cpu (default), rocm, cuda)") AddOption(nnInferenceDeviceId, unsigned int, 0, "", 0, "(unsigned int) Specify inference device id") AddOption(nnInferenceAllocateDevMem, int, 0, "", 0, "(bool, default = 0), if the device memory should be allocated for inference") -AddOption(nnInferenceDtype, std::string, "fp32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceInputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (FP32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceOutputDType, std::string, "FP32", "", 0, "(std::string) Specify the datatype for which inference is performed (fp32: default, fp16)") // fp32 or fp16 AddOption(nnInferenceIntraOpNumThreads, int, 1, "", 0, "Number of threads used to evaluate one neural network (ONNX: SetIntraOpNumThreads). 
0 = auto-detect, can lead to problems on SLURM systems.") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 98a0ec16495c5..1638e134a4d6a 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -641,7 +641,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } else { clustererNN.nnClusterizerVerbosity = nn_settings.nnClusterizerVerbosity; } - clustererNN.nnClusterizerDtype = nn_settings.nnInferenceDtype.find("32") != std::string::npos; + clustererNN.nnInferenceInputDType = nn_settings.nnInferenceInputDType.find("32") != std::string::npos; nnApplication.initClusterizer(nn_settings, clustererNN); AllocateRegisteredMemory(clustererNN.mMemoryId); } @@ -931,23 +931,23 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) size_t iSize = CAMath::Min((uint)clustererNN.nnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart)); auto start0 = std::chrono::high_resolution_clock::now(); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Filling the data + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Filling the data auto stop0 = std::chrono::high_resolution_clock::now(); auto start1 = std::chrono::high_resolution_clock::now(); - nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnClusterizerDtype); + nnApplication.networkInference(nnApplication.model_class, clustererNN, iSize, clustererNN.modelProbabilities, clustererNN.nnInferenceInputDType); if (nnApplication.model_class.getNumOutputNodes()[0][1] == 1) { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + 
runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } else { - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Assigning class labels + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Assigning class labels } if (!clustererNN.nnClusterizerUseCfRegression) { - nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 1 + nnApplication.networkInference(nnApplication.model_reg_1, clustererNN, iSize, clustererNN.outputDataReg1, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 1 if (nnApplication.model_class.getNumOutputNodes()[0][1] > 1 && nnApplication.model_reg_2.isInitialized()) { - nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnClusterizerDtype); - runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, batchStart); // Running the NN for regression class 2 + nnApplication.networkInference(nnApplication.model_reg_2, clustererNN, iSize, clustererNN.outputDataReg2, clustererNN.nnInferenceInputDType); + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, batchStart); // Running the NN for regression class 2 } } auto stop1 = std::chrono::high_resolution_clock::now(); @@ -957,7 +957,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) } auto start1 = 
std::chrono::high_resolution_clock::now(); if (clustererNN.nnClusterizerUseCfRegression) { - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnClusterizerDtype, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), krnlRunRangeNone}, iSector, clustererNN.nnInferenceInputDType, withMC, 0); // Running the CF regression kernel - no batching needed: batchStart = 0 } auto stop1 = std::chrono::high_resolution_clock::now(); time_clusterizer += std::chrono::duration_cast(stop1 - start1).count() / 1e9; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx index 655e2bf5a933c..cc3f29434615f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.cxx @@ -25,9 +25,9 @@ void GPUTPCNNClusterizer::SetMaxData(const GPUTrackingInOutPointers& io) {} void* GPUTPCNNClusterizer::setIOPointers(void* mem) { if (nnClusterizerBatchedMode > 0) { - if (nnClusterizerDtype == 0 && nnClusterizerElementSize > 0) { + if (nnInferenceInputDType == 0 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData16, nnClusterizerBatchedMode * nnClusterizerElementSize); - } else if (nnClusterizerDtype == 1 && nnClusterizerElementSize > 0) { + } else if (nnInferenceInputDType == 1 && nnClusterizerElementSize > 0) { computePointerWithAlignment(mem, inputData32, nnClusterizerBatchedMode * nnClusterizerElementSize); } computePointerWithAlignment(mem, peakPositions, nnClusterizerBatchedMode); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 01d1873f3b351..0b9e3a6572684 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -54,7 +54,7 
@@ class GPUTPCNNClusterizer : public GPUProcessor int nnClusterizerModelClassNumOutputNodes = -1; int nnClusterizerModelReg1NumOutputNodes = -1; int nnClusterizerModelReg2NumOutputNodes = -1; - int nnClusterizerDtype = 0; // 0: float16, 1: float32 + int nnInferenceInputDType = 0; // 0: float16, 1: float32 int mISector = -1; // Memory allocation for neural network diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 533ac0c7481ff..3463740cf7918 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -105,7 +105,6 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"device", settings.nnInferenceDevice}, {"device-id", std::to_string(settings.nnInferenceDeviceId)}, {"allocate-device-memory", std::to_string(settings.nnInferenceAllocateDevMem)}, - {"dtype", settings.nnInferenceDtype}, {"intra-op-num-threads", std::to_string(settings.nnInferenceIntraOpNumThreads)}, {"inter-op-num-threads", std::to_string(settings.nnInferenceInterOpNumThreads)}, {"enable-optimizations", std::to_string(settings.nnInferenceEnableOrtOptimization)}, @@ -134,7 +133,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust { clusterer.nnClusterizerModelClassNumOutputNodes = model_class.getNumOutputNodes()[0][1]; if (!settings.nnClusterizerUseCfRegression) { - if (model_class.getNumOutputNodes()[0][1] == 1 || reg_model_paths.size() == 1) { + if (model_class.getNumOutputNodes()[0][1] == 1 || !model_reg_2.isInitialized()) { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; } else { clusterer.nnClusterizerModelReg1NumOutputNodes = model_reg_1.getNumOutputNodes()[0][1]; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 
763119444bf7c..73051bd8477fd 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -125,20 +125,24 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint batchStart) { - auto& clusterer = processors.tpcNNClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; uint glo_idx = get_global_id(0); - uint elem_iterator = glo_idx * clusterer.nnClusterizerModelClassNumOutputNodes; + uint elem_iterator = glo_idx * clustererNN.nnClusterizerModelClassNumOutputNodes; float current_max_prob = 0.f; // If the neural network doesn't contain the softmax as a last layer, the outputs can range in [-infty, infty] uint class_label = 0; - for (int pIdx = elem_iterator; pIdx < elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes; pIdx++) { + for (int pIdx = elem_iterator; pIdx < elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes; pIdx++) { if (pIdx == elem_iterator) { - current_max_prob = clusterer.modelProbabilities[pIdx]; + current_max_prob = clustererNN.modelProbabilities[pIdx]; } else { - class_label = (clusterer.modelProbabilities[pIdx] > current_max_prob ? pIdx : class_label); + if (clustererNN.modelProbabilities[pIdx] > current_max_prob) { current_max_prob = clustererNN.modelProbabilities[pIdx]; class_label = pIdx - elem_iterator; } } } - // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clusterer.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. 
The highest one "wins" - clusterer.outputDataClass[glo_idx + batchStart] = class_label; + // uint class_label = std::distance(elem_iterator, std::max_element(elem_iterator, elem_iterator + clustererNN.nnClusterizerModelClassNumOutputNodes)); // Multiple outputs of the class network are the probabilities for each class. The highest one "wins" + clustererNN.outputDataClass[glo_idx + batchStart] = class_label; + if (class_label > 1) { + clustererNN.clusterFlags[2 * glo_idx] = 1; + clustererNN.clusterFlags[2 * glo_idx + 1] = 1; + } } template <> @@ -157,7 +161,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 1)) { ClusterAccumulator pc; From a9c33b5b7775123283b0de118b99ae2945b0c669 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Mar 2025 08:52:20 +0000 Subject: [PATCH 10/10] Please consider the following formatting changes --- .../TPCClusterFinder/GPUTPCNNClusterizerHost.cxx | 12 ++++++------ .../TPCClusterFinder/GPUTPCNNClusterizerHost.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 3463740cf7918..35db3f2107e7d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -27,15 +27,16 @@ GPUTPCNNClusterizerHost::GPUTPCNNClusterizerHost(const GPUSettingsProcessingNNcl init(settings); } -void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) { +void GPUTPCNNClusterizerHost::loadFromCCDB(std::map settings) +{ o2::ccdb::CcdbApi ccdbApi; ccdbApi.init(settings["nnCCDBURL"]); metadata["inputDType"] = settings["inputDType"]; metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C + metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, 
regression_2C metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN + metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; } @@ -58,7 +59,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::string class_model_path = settings.nnClassificationPath, reg_model_path = settings.nnRegressionPath; std::vector reg_model_paths; - if(settings.nnLoadFromCCDB) { + if (settings.nnLoadFromCCDB) { std::map ccdbSettings = { {"nnCCDBURL", settings.nnCCDBURL}, {"nnCCDBPath", settings.nnCCDBPath}, @@ -66,8 +67,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set {"outputDType", settings.nnInferenceOutputDType}, {"nnCCDBWithMomentum", std::to_string(settings.nnCCDBWithMomentum)}, {"nnCCDBBeamType", settings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)} - }; + {"nnCCDBInteractionRate", std::to_string(settings.nnCCDBInteractionRate)}}; std::string nnFetchFolder = ""; std::vector fetchMode = o2::utils::Str::tokenize(settings.nnCCDBFetchMode, ':'); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h index 798d4af2826b3..210d5f94dd503 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.h @@ -49,9 +49,9 @@ class GPUTPCNNClusterizerHost o2::ml::OrtModel model_class, model_reg_1, model_reg_2; // For splitting clusters std::vector reg_model_paths; - private: - std::map metadata; - std::map headers; + private: + std::map metadata; + std::map headers; }; // class GPUTPCNNClusterizerHost } // namespace o2::gpu