From 9c8984c53d01d851f4294d5a68d6a8bbefd39295 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 00:55:54 +0200 Subject: [PATCH 01/16] Improve GPU filling kernel speed --- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../TPCClusterFinder/GPUTPCNNClusterizer.h | 1 + .../GPUTPCNNClusterizerHost.cxx | 1 + .../GPUTPCNNClusterizerKernels.cxx | 181 +++++++++--------- .../GPUTPCNNClusterizerKernels.h | 4 +- 5 files changed, 104 insertions(+), 90 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index b0d466f13e5ef..c4b7b2e47cbd9 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,6 +642,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; + int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t deviceId = -1; @@ -1035,7 +1036,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeFull, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + countLoops++; + } } else { // Fills the whole input matrix at once -> better 
performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1138,6 +1142,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } + LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index 0b9553437765c..b7bc1575d349a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -71,6 +71,7 @@ class GPUTPCNNClusterizer : public GPUProcessor uint32_t mNnClusterizerPadTimeSize = 0; uint32_t mNnClusterizerRowTimeSize = 0; uint32_t mNnClusterizerRowTimeSizeFull = 0; + uint32_t mNnClusterizerRowTimeSizeThreads = 0; // Boundary lookup table // int32_t mBoundaryMapSizeRow = 0; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index ae833ace2f648..582a0c6d7435a 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -98,6 +98,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerPadTimeSize = clustererNN.mNnClusterizerFullPadSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSize = clustererNN.mNnClusterizerFullRowSize * clustererNN.mNnClusterizerFullTimeSize; clustererNN.mNnClusterizerRowTimeSizeFull = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 
3 : 0); + clustererNN.mNnClusterizerRowTimeSizeThreads = clustererNN.mNnClusterizerRowTimeSize + (settings.nnClusterizerAddIndexData ? 1 : 0); clustererNN.mNnClusterizerElementSize = clustererNN.mNnClusterizerChargeArraySize + (settings.nnClusterizerAddIndexData ? 3 : 0); // clustererNN.mBoundaryMapSizeRow = 3 * clustererNN.mNnClusterizerSizeInputRow + o2::tpc::constants::MAXGLOBALPADROW; // clustererNN.mBoundaryPadding = 11; // padding on each side to account for pad_offset. N=11 since then mIsBoundary = 24320 ~< (1.5 x 2^14 = 24576) && N must be bigger than (NPads[row(end_iroc + 1)] - NPads[row(end_iroc)])/2 (=6) for pad_offset to work diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 55fefa7dcf149..1c76fc3f1fa57 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,15 +72,19 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { int32_t target_row = row + r; bool is_row_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); + int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(target_row + row_offset); for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow); + bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { int32_t target_time = time + t; @@ -143,125 +147,119 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeFull) { + if (glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerRowTimeSizeThreads) { return; } - uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeFull; - uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeFull); + uint32_t base_idx = glo_idx / clustererNN.mNnClusterizerRowTimeSizeThreads; + uint32_t transient_index = glo_idx - (base_idx * clustererNN.mNnClusterizerRowTimeSizeThreads); // Early exit for out-of-bounds threads - if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { - return; - } + // if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + // return; + // } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); // Use dedicated neural network shared memory arrays for warp-level caching // First thread in each warp loads shared data CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(base_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - float central_charge = static_cast(chargeMap[peak].unpack()); + float 
central_charge = chargeMap[peak].unpack(); int32_t row = static_cast(peak.row()); int32_t pad = static_cast(peak.pad()); int32_t time = static_cast(peak.time()); + const int32_t npads_row = GPUTPCGeometry::NPads(row); + // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; - - float index_values[3] = { - static_cast(sector) / o2::tpc::constants::MAXSECTOR, - static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - static_cast(pad) / GPUTPCGeometry::NPads(row)}; - + // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; + // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; +// + // float index_values[3] = { + // static_cast(sector) / o2::tpc::constants::MAXSECTOR, + // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, + // static_cast(pad) / GPUTPCGeometry::NPads(row)}; +// + // if (dtype == 0) { + // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + // } else { + // clustererNN.mInputData_32[write_idx] = index_values[data_idx]; + // } +// + // // Handle deconvolution flags only once per cluster (last thread in element) + // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { + // uint8_t cluster_flags = 0; + // for (uint16_t i = 0; i < 8; i++) { + // Delta2 d = cfconsts::InnerNeighbors[i]; + // CfChargePos tmp_pos = peak.delta(d); + // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); + // } + // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; + // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + // } + // return; + uint32_t write_idx = base_idx * 
clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); + clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { - clustererNN.mInputData_32[write_idx] = index_values[data_idx]; - } - - // Handle deconvolution flags only once per cluster (last thread in element) - if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - uint8_t cluster_flags = 0; - for (uint16_t i = 0; i < 8; i++) { - Delta2 d = cfconsts::InnerNeighbors[i]; - CfChargePos tmp_pos = peak.delta(d); - cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - } - clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; + clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; + clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } - return; } // Main data processing - optimize index calculations if (transient_index < clustererNN.mNnClusterizerRowTimeSize) { // Optimize 3D index calculation - int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; - int32_t r_local = row_idx - clustererNN.mNnClusterizerSizeInputRow; - int32_t time_idx = transient_index - row_idx * clustererNN.mNnClusterizerFullTimeSize; - int32_t t_local = time_idx - clustererNN.mNnClusterizerSizeInputTime; + const int32_t row_idx = transient_index / clustererNN.mNnClusterizerFullTimeSize; + const int32_t time_idx = transient_index - row_idx * 
clustererNN.mNnClusterizerFullTimeSize; int32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + row_idx * clustererNN.mNnClusterizerPadTimeSize + time_idx; // Early boundary check for row - int32_t target_row = row + r_local; - int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_row = row + row_idx - clustererNN.mNnClusterizerSizeInputRow; + const int8_t is_row_boundary = (target_row < 0) || (target_row > (o2::tpc::constants::MAXGLOBALPADROW - 1)); + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide // Calculate offsets - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - for (int32_t p_local = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p_local++) { + // int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t iroc_row = 63 + row_offset; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t boundary_row = target_row + row_offset; + const int32_t npads_reference = is_row_boundary ? 
0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); + const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; + + float output_value = boundary_val; + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { if (is_row_boundary) { - // Use boundary fill value - float boundary_val = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_val; + output_value = boundary_val; + } else { + const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); + if (!is_boundary) { + CfChargePos pos(target_row, target_pad, target_time); + // one load + one multiply + output_value = chargeMap[pos].unpack() * inverse_central_charge; } else { - clustererNN.mInputData_32[write_idx] = boundary_val; + output_value = boundary_val; } - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position - continue; - } - - // Calculate target pad and time - int32_t target_pad = pad + p_local; - int32_t target_time = time + t_local; - - // Optimized boundary check - int8_t is_boundary = GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, clustererNN.mNnClusterizerSizeInputRow) || (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - - float output_value; - if (is_boundary) { - output_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); - } else { - // Coalesced memory access - create position and read charge - CfChargePos tmp_pos(target_row, target_pad, target_time); - output_value = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; // Normalize by central charge } - - // Write output with reduced 
branching if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { clustererNN.mInputData_32[write_idx] = output_value; } - // if (write_idx >= clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode) { - // printf("Error: Write index out of bounds (central array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)(clustererNN.mNnClusterizerElementSize * clustererNN.mNnClusterizerBatchedMode), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if ((clusterer.mPmemory->counters.nClusters - batchStart) < clustererNN.mNnClusterizerBatchedMode) { - // if (write_idx >= ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize)) { - // printf("Error: Write index out of bounds (end of array)! %d >= %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // if (write_idx > ((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 5)) { - // printf("Sanity check (should appear only once) %d == %d (write_idx: %d, base_idx: %d, transient_index: %d, row_idx: %d, time_idx: %d, r_local: %d, t_local: %d)\n", - // write_idx, (int)((clusterer.mPmemory->counters.nClusters - batchStart) * clustererNN.mNnClusterizerElementSize - 4), write_idx, base_idx, transient_index, row_idx, time_idx, r_local, t_local); - // } - // } - - write_idx += clustererNN.mNnClusterizerFullTimeSize; // Move to next pad position + write_idx += clustererNN.mNnClusterizerFullTimeSize; } } } @@ -275,6 +273,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= 
clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } + if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { + printf("Error: Class output index out of bounds! %d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", + glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); + } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -364,6 +366,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { + printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", + full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); + } + tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; @@ -737,16 +744,16 @@ GPUd() int32_t GPUTPCNNClusterizerKernels::rowOffset(int32_t row, int32_t offset return (row > 62 ? 
offset : 0); } -GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t offset) +GPUd() bool GPUTPCNNClusterizerKernels::isBoundary(int32_t row, int32_t pad, int32_t maxrow, int32_t iroc_row, int32_t npads_row, int32_t npads_reference) { - if (pad < 0 || row < 0) { // Faster short-circuit + if (pad < 0) { // Faster short-circuit return true; } else if (row < 63) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row))); - } else if (row < (63 + offset)) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network + return (pad >= npads_row); + } else if (row < iroc_row) { // to account for the gap between IROC and OROC. Charge will be set to the boundary fill value in order to signal boundaries to the neural network return true; - } else if (row < (o2::tpc::constants::MAXGLOBALPADROW + offset)) { - return (pad >= static_cast(GPUTPCGeometry::NPads(row - offset))); + } else if (row < maxrow) { + return (pad >= npads_reference); } else { return true; } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index cd3d7783771fe..9353722568b1f 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -57,7 +57,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, - publishDeconvolutionFlags = 7, + publishDeconvolutionFlags = 7 }; template @@ -66,7 +66,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate private: static GPUd() int32_t padOffset(int32_t, int32_t); static GPUd() int32_t rowOffset(int32_t, int32_t); - static GPUd() bool isBoundary(int32_t, int32_t, int32_t); + static GPUd() bool isBoundary(int32_t, int32_t, int32_t, int32_t, int32_t, int32_t); static GPUd() bool 
isBoundaryPublish(int32_t, int32_t, float&, float&); }; From a075c43cd3da71e56da1cd96769100c68c6c6c11 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 18:33:48 +0200 Subject: [PATCH 02/16] Adjusting parameter bounds and additional GPU kernel optimizations --- .../Definitions/GPUDefParametersDefaults.h | 2 +- .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../GPUTPCNNClusterizerKernels.cxx | 104 +++++++----------- 3 files changed, 40 insertions(+), 73 deletions(-) diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 29aa3808506dc..648482304aca2 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -482,7 +482,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels - #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU 1024 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index c4b7b2e47cbd9..96ee665bda4eb 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -642,7 +642,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - int32_t countLoops = 0; if (GetProcessingSettings().nn.applyNNclusterizer) { int32_t 
deviceId = -1; @@ -1036,10 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - for(int throughput_counter = 0; throughput_counter < 16; throughput_counter++) { // Loop to increase throughput on GPU, at least for large batch sizes - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); - countLoops++; - } + runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); @@ -1142,7 +1138,6 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) LOG(info) << "(NNCLUS, GPUChainTrackingClusterizer, this=" << this << ") Done with CF regression. (clustererNN=" << &clustererNN << ", clustererNNShadow=" << &clustererNNShadow << ")"; } } - LOG(info) << "countLoops: " << countLoops; #else GPUFatal("Project not compiled with neural network clusterization. 
Aborting."); #endif diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 1c76fc3f1fa57..3a27b8740e46d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -72,8 +72,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.time()); float central_charge = static_cast(chargeMap[peak].unpack()); int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; + const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; + const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { @@ -169,45 +169,18 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(peak.pad()); int32_t time = static_cast(peak.time()); - const int32_t npads_row = GPUTPCGeometry::NPads(row); - // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { - // int32_t data_idx = transient_index - clustererNN.mNnClusterizerRowTimeSize; - // uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize + data_idx; -// - // float index_values[3] = { - // static_cast(sector) / o2::tpc::constants::MAXSECTOR, - // static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW, - // static_cast(pad) / GPUTPCGeometry::NPads(row)}; -// - // if (dtype == 0) { - // clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)index_values[data_idx]; - // } else { - // clustererNN.mInputData_32[write_idx] = 
index_values[data_idx]; - // } -// - // // Handle deconvolution flags only once per cluster (last thread in element) - // if (!clustererNN.mNnClusterizerSetDeconvolutionFlags && data_idx == 2) { - // uint8_t cluster_flags = 0; - // for (uint16_t i = 0; i < 8; i++) { - // Delta2 d = cfconsts::InnerNeighbors[i]; - // CfChargePos tmp_pos = peak.delta(d); - // cluster_flags += CfUtils::isPeak(isPeakMap[tmp_pos]); - // } - // clustererNN.mClusterFlags[2 * base_idx] = cluster_flags; - // clustererNN.mClusterFlags[2 * base_idx + 1] = cluster_flags; - // } - // return; uint32_t write_idx = base_idx * clustererNN.mNnClusterizerElementSize + clustererNN.mNnClusterizerChargeArraySize; + const int32_t npads = GPUTPCGeometry::NPads(row); if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)(static_cast(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads; } } @@ -220,46 +193,45 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)); - const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; - const uint8_t is_time_boundary = (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); - const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide - - // Calculate offsets - // int32_t 
row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - // int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); - const int32_t iroc_row = 63 + row_offset; - const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + row_offset; - const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); - const int32_t boundary_row = target_row + row_offset; - const int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(boundary_row - clustererNN.mNnClusterizerSizeInputRow); - const float boundary_val = clustererNN.mNnClusterizerBoundaryFillValue; - - float output_value = boundary_val; - - const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; - const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; - - for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { - if (is_row_boundary) { - output_value = boundary_val; - } else { - const uint8_t is_boundary = is_time_boundary || GPUTPCNNClusterizerKernels::isBoundary(boundary_row, target_pad, maxrow, iroc_row, npads_row, npads_reference); - if (!is_boundary) { + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; + + if ((row < 63 && target_row > 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { + for (int32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; + } else { + clustererNN.mInputData_32[write_idx] = output_value; + } + write_idx += clustererNN.mNnClusterizerFullTimeSize; + } + return; + } else { + // Calculate offsets + const int32_t target_time = time + time_idx - clustererNN.mNnClusterizerSizeInputTime; + const uint8_t is_time_boundary 
= (target_time < 0) || (target_time >= clustererNN.maxAllowedTimebin); + const float inverse_central_charge = 1.f / central_charge; // multiply by inverse is cheaper than divide + const int32_t p_local = pad + GPUTPCNNClusterizerKernels::padOffset(row, target_row); + const int32_t npads = GPUTPCGeometry::NPads(target_row); + + const int32_t start_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; + const int32_t end_pad = clustererNN.mNnClusterizerSizeInputPad + p_local; + + for (int32_t target_pad = start_pad; target_pad <= end_pad; ++target_pad) { + if (target_pad >= npads || target_pad < 0 || is_time_boundary) { + output_value = clustererNN.mNnClusterizerBoundaryFillValue; + } else { CfChargePos pos(target_row, target_pad, target_time); // one load + one multiply output_value = chargeMap[pos].unpack() * inverse_central_charge; + } + if (dtype == 0) { + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - output_value = boundary_val; + clustererNN.mInputData_32[write_idx] = output_value; } + write_idx += clustererNN.mNnClusterizerFullTimeSize; } - if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; - } else { - clustererNN.mInputData_32[write_idx] = output_value; - } - write_idx += clustererNN.mNnClusterizerFullTimeSize; + return; } } } From 587c3e6fadd812b0489891c28759dc3d6865e036 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 21:12:47 +0200 Subject: [PATCH 03/16] Adding back if statement for early exit --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 3a27b8740e46d..0f000451794ec 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -155,9 +155,9 
@@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { - // return; - // } + if (base_idx + batchStart >= clusterer.mPmemory->counters.nClusters) { + return; + } CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); From 6e432576d34b871974fdb236bc9c6bc75619fcb8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sat, 18 Oct 2025 22:19:01 +0200 Subject: [PATCH 04/16] const'ing + fixing CPU kernel --- .../Global/GPUChainTrackingClusterizer.cxx | 2 +- .../GPUTPCNNClusterizerKernels.cxx | 63 +++++++++---------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 96ee665bda4eb..1740e525937f3 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -1035,7 +1035,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Filling the data if (mRec->IsGPU() || GetProcessingSettings().nn.nnClusterizerForceGpuInputFill) { // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations - runKernel({GetGrid(clustererNNShadow.mNnClusterizerBatchedMode * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); + runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerRowTimeSizeThreads , lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); } else { // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); diff --git 
a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 0f000451794ec..7927dcf572acc 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -57,7 +57,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } @@ -67,43 +67,42 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); CfChargePos peak = clusterer.mPfilteredPeakPositions[CAMath::Min(glo_idx + batchStart, (uint32_t)(clusterer.mPmemory->counters.nClusters - 1))]; - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); - float central_charge = static_cast(chargeMap[peak].unpack()); - int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); + const float central_charge = static_cast(chargeMap[peak].unpack()); + const float inverse_charge = 1.f / central_charge; + + const int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); const int32_t iroc_row = 63 + clustererNN.mNnClusterizerSizeInputRow; const int32_t maxrow = o2::tpc::constants::MAXGLOBALPADROW + clustererNN.mNnClusterizerSizeInputRow; const int32_t npads_row = GPUTPCGeometry::NPads(row); + float output_value = clustererNN.mNnClusterizerBoundaryFillValue; - for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; ++r) { - int32_t target_row = row + r; - bool is_row_boundary = (target_row < 0) || (target_row >= 
o2::tpc::constants::MAXGLOBALPADROW); - int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row); - int32_t npads_reference = is_row_boundary ? 0 : GPUTPCGeometry::NPads(target_row + row_offset); + for (int32_t target_row = -clustererNN.mNnClusterizerSizeInputRow + row; target_row <= clustererNN.mNnClusterizerSizeInputRow + row; ++target_row) { + uint8_t is_boundary = (target_row < 0) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW); + const int32_t p_local = pad + (is_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, target_row)); + const int32_t npads_reference = is_boundary ? 0 : GPUTPCGeometry::NPads(target_row - row_offset); - for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; ++p) { - int32_t target_pad = pad + p; - bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); - - for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; ++t) { - int32_t target_time = time + t; + for (int32_t target_pad = -clustererNN.mNnClusterizerSizeInputPad + p_local; target_pad <= clustererNN.mNnClusterizerSizeInputPad + p_local; ++target_pad) { + is_boundary = is_boundary || GPUTPCNNClusterizerKernels::isBoundary(target_row + row_offset, target_pad, maxrow, iroc_row, npads_row, npads_reference); + for (int32_t target_time = -clustererNN.mNnClusterizerSizeInputTime + time; target_time <= clustererNN.mNnClusterizerSizeInputTime + time; ++target_time) { if (is_boundary || target_time < 0 || target_time >= clustererNN.maxAllowedTimebin) { // Fill boundary value - float boundary_value = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + output_value = clustererNN.mNnClusterizerBoundaryFillValue; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)boundary_value; 
+ clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = boundary_value; + clustererNN.mInputData_32[write_idx] = output_value; } } else { CfChargePos tmp_pos(target_row, target_pad, target_time); - float normalized_charge = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + output_value = chargeMap[tmp_pos].unpack() * inverse_charge; if (dtype == 0) { - clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)normalized_charge; + clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { - clustererNN.mInputData_32[write_idx] = normalized_charge; + clustererNN.mInputData_32[write_idx] = output_value; } } // if((CAMath::Abs(static_cast(clustererNN.mInputData_16_Test[write_idx]) - static_cast(clustererNN.mInputData_16[write_idx])) > 1e-4) && ((glo_idx + batchStart) < clusterer.mPmemory->counters.nClusters)) { @@ -119,11 +118,11 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread(sector) / o2::tpc::constants::MAXSECTOR); clustererNN.mInputData_16[write_idx + 1] = (OrtDataType::Float16_t)(static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW); - clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / GPUTPCGeometry::NPads(row)); + clustererNN.mInputData_16[write_idx + 2] = (OrtDataType::Float16_t)(static_cast(pad) / npads_row); } else { clustererNN.mInputData_32[write_idx] = static_cast(sector) / o2::tpc::constants::MAXSECTOR; clustererNN.mInputData_32[write_idx + 1] = static_cast(row) / o2::tpc::constants::MAXGLOBALPADROW; - clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / GPUTPCGeometry::NPads(row); + clustererNN.mInputData_32[write_idx + 2] = static_cast(pad) / npads_row; } } @@ -143,7 +142,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, 
uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { - uint32_t glo_idx = get_global_id(0); + const uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; auto& clustererNN = processors.tpcNNClusterer[sector]; @@ -151,8 +150,8 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters) { @@ -164,10 +163,10 @@ GPUdii() void GPUTPCNNClusterizerKernels::Threadcounters.nClusters - 1))]; - float central_charge = chargeMap[peak].unpack(); - int32_t row = static_cast(peak.row()); - int32_t pad = static_cast(peak.pad()); - int32_t time = static_cast(peak.time()); + const float central_charge = chargeMap[peak].unpack(); + const int32_t row = static_cast(peak.row()); + const int32_t pad = static_cast(peak.pad()); + const int32_t time = static_cast(peak.time()); // Handle index data with fewer branches if (clustererNN.mNnClusterizerAddIndexData && transient_index >= clustererNN.mNnClusterizerRowTimeSize) { From bb795c4170289bae93b19223264dc09aacaebea0 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 19 Oct 2025 09:43:08 +0200 Subject: [PATCH 05/16] Removing print statements --- .../TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 7927dcf572acc..b214a8f549a56 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -244,10 +244,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clusterer.mPmemory->counters.nClusters || glo_idx >= (uint32_t)clustererNN.mNnClusterizerBatchedMode) { return; } - if (glo_idx + batchStart >= clustererNN.mNnClusterizerTotalClusters) { - printf("Error: Class output index out of bounds! 
%d >= %d (glo_idx: %d, batchStart: %d, mNnClusterizerBatchedMode: %d, mNnClusterizerModelClassNumOutputNodes: %d, clusterer.mPmemory->counters.nClusters %d)\n", - glo_idx + batchStart, clustererNN.mNnClusterizerTotalClusters, glo_idx, batchStart, clustererNN.mNnClusterizerBatchedMode, clustererNN.mNnClusterizerModelClassNumOutputNodes, clusterer.mPmemory->counters.nClusters); - } if (clustererNN.mNnClusterizerUseClassification) { if (dtype == 0) { clustererNN.mOutputDataClass[glo_idx + batchStart] = (int32_t)((clustererNN.mModelProbabilities_16[glo_idx]).ToFloat() > clustererNN.mNnClassThreshold); @@ -337,11 +333,6 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread= clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes) { - printf("Error: Global index out of bounds! %d >= %d (full_glo_idx: %d, maxClusterNum: %d, batchStart: %d)\n", - full_glo_idx, clustererNN.mNnClusterizerBatchedMode * clustererNN.mNnClusterizerModelReg1NumOutputNodes, full_glo_idx, maxClusterNum, batchStart); - } - tpc::ClusterNative* clusterOut = clusterer.mPclusterByRow; ClusterAccumulator pc; From f7cdc0be08798e9b10a490a8b08697d0826450bc Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 27 Oct 2025 13:08:55 +0100 Subject: [PATCH 06/16] Fixing CI build issue --- GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index b214a8f549a56..ee0fa217b8095 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -195,7 +195,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread 62) || (target_row < 0) || (row > 62 && target_row < 63) || (target_row >= o2::tpc::constants::MAXGLOBALPADROW)) { - for (int32_t target_pad = 0; target_pad < 
clustererNN.mNnClusterizerFullPadSize; ++target_pad) { + for (uint32_t target_pad = 0; target_pad < clustererNN.mNnClusterizerFullPadSize; ++target_pad) { if (dtype == 0) { clustererNN.mInputData_16[write_idx] = (OrtDataType::Float16_t)output_value; } else { From 3775044da0baef9edade6b9f2ac01041871b4166 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 21 Nov 2025 13:02:54 +0100 Subject: [PATCH 07/16] Working version of NN CCDB fetching and loading to file --- Common/ML/include/ML/OrtInterface.h | 1 + Common/ML/src/OrtInterface.cxx | 15 ++ .../src/NeuralNetworkClusterizer.cxx | 2 + GPU/GPUTracking/Definitions/GPUSettingsList.h | 10 +- .../include/GPUWorkflow/GPUWorkflowSpec.h | 5 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 168 +++++++++++++----- 6 files changed, 151 insertions(+), 50 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 04a5e0ba5c9fc..1893cf5a90179 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -51,6 +51,7 @@ class OrtModel void initOptions(std::unordered_map optionsMap); void initEnvironment(); void initSession(); + void initSessionFromBuffer(const void* buffer, size_t bufferSize); void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } void resetSession(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index d30d05d1d1a00..8d22d0075b887 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -138,6 +138,21 @@ void OrtModel::initEnvironment() (mPImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events } +void OrtModel::initSessionFromBuffer(const void* buffer, size_t bufferSize) +{ + mPImplOrt->session = std::make_unique(*mPImplOrt->env, + static_cast(buffer), + bufferSize, + mPImplOrt->sessionOptions); + mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); + + setIO(); + + if (mLoggingLevel < 2) { + LOG(info) << "(ORT) Model loaded 
successfully from buffer! (inputs: " << printShape(mInputShapes, mInputNames) << ", outputs: " << printShape(mOutputShapes, mInputNames) << ")"; + } +} + void OrtModel::initSession() { if (mAllocateDeviceMemory) { diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx index bfbb7afc946f8..5b0a06086e50d 100644 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx @@ -37,6 +37,8 @@ void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map const& tpcsectors, uint64_t tpcSectorMask, std::shared_ptr& ggr, std::function** gPolicyOrder = nullptr); @@ -230,7 +233,7 @@ class GPURecoWorkflowSpec : public o2::framework::Task uint32_t mNextThreadIndex = 0; bool mUpdateGainMapCCDB = true; std::unique_ptr mTFSettings; - std::unique_ptr mNNClusterizerSettings; + std::map nnCCDBSettings; Config mSpecConfig; std::shared_ptr mGGR; diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index d3d3eb14869e0..32a2a572523ea 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -133,50 +133,6 @@ void GPURecoWorkflowSpec::init(InitContext& ic) { GRPGeomHelper::instance().setRequest(mGGR); GPUO2InterfaceConfiguration& config = *mConfig.get(); - GPUSettingsProcessingNNclusterizer& mNNClusterizerSettings = mConfig->configProcessing.nn; - - if (mNNClusterizerSettings.nnLoadFromCCDB) { - LOG(info) << "Loading neural networks from CCDB"; - o2::tpc::NeuralNetworkClusterizer nnClusterizerFetcher; - nnClusterizerFetcher.initCcdbApi(mNNClusterizerSettings.nnCCDBURL); - std::map ccdbSettings = { - {"nnCCDBURL", mNNClusterizerSettings.nnCCDBURL}, - {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, - {"inputDType", mNNClusterizerSettings.nnInferenceInputDType}, - {"outputDType", mNNClusterizerSettings.nnInferenceOutputDType}, - {"outputFolder", 
mNNClusterizerSettings.nnLocalFolder}, - {"nnCCDBPath", mNNClusterizerSettings.nnCCDBPath}, - {"nnCCDBWithMomentum", std::to_string(mNNClusterizerSettings.nnCCDBWithMomentum)}, - {"nnCCDBBeamType", mNNClusterizerSettings.nnCCDBBeamType}, - {"nnCCDBInteractionRate", std::to_string(mNNClusterizerSettings.nnCCDBInteractionRate)}}; - - std::string nnFetchFolder = mNNClusterizerSettings.nnLocalFolder; - std::vector evalMode = o2::utils::Str::tokenize(mNNClusterizerSettings.nnEvalMode, ':'); - - if (evalMode[0] == "c1") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; - ccdbSettings["nnCCDBEvalType"] = "classification_c1"; - ccdbSettings["outputFile"] = "net_classification_c1.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } else if (evalMode[0] == "c2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBClassificationLayerType; - ccdbSettings["nnCCDBEvalType"] = "classification_c2"; - ccdbSettings["outputFile"] = "net_classification_c2.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } - - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; - ccdbSettings["nnCCDBEvalType"] = "regression_c1"; - ccdbSettings["outputFile"] = "net_regression_c1.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - if (evalMode[1] == "r2") { - ccdbSettings["nnCCDBLayerType"] = mNNClusterizerSettings.nnCCDBRegressionLayerType; - ccdbSettings["nnCCDBEvalType"] = "regression_c2"; - ccdbSettings["outputFile"] = "net_regression_c2.onnx"; - nnClusterizerFetcher.loadIndividualFromCCDB(ccdbSettings); - } - LOG(info) << "Neural network loading done!"; - } // Create configuration object and fill settings mConfig->configGRP.solenoidBzNominalGPU = 0; @@ -185,6 +141,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic) mTFSettings->simStartOrbit = hbfu.getFirstIRofTF(o2::InteractionRecord(0, hbfu.orbitFirstSampled)).orbit; *mConfParam = 
mConfig->ReadConfigurableParam(); + if (mConfParam->display) { mDisplayFrontend.reset(GPUDisplayFrontendInterface::getFrontend(mConfig->configDisplay.displayFrontend.c_str())); mConfig->configProcessing.eventDisplay = mDisplayFrontend.get(); @@ -814,6 +771,68 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // ------------------------------ Actual processing ------------------------------ + if (mSpecConfig.nnLoadFromCCDB) { + LOG(info) << "(NN CLUS) Fetching CCDB calib objects"; + + auto dumpOnnxToFile = [](const char* buffer, std::size_t size, const std::string& path) { + const char* marker = "Accept-Ranges"; + const char* pos = std::search(buffer, buffer + size, marker, marker + std::strlen(marker)); + + // Compute the actual number of bytes to write + std::size_t writeSize = (pos != buffer + size) + ? static_cast(pos - buffer) + : size; + + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open ONNX output file: " + path); + } + + out.write(buffer, static_cast(writeSize)); + if (!out) { + throw std::runtime_error("Failed while writing ONNX data to: " + path); + } + }; + + GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; + std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + + DataRef m; + if (evalMode[0] == "c1") { + m = pc.inputs().get("nn_classification_c1"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_classification_c1 from CCDB to net_classification_c1.onnx"; + } + } else if (evalMode[0] == "c2") { + m = pc.inputs().get("nn_classification_c2"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + 
dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_classification_c2 from CCDB to net_classification_c2.onnx"; + } + } + + m = pc.inputs().get("nn_regression_c1"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_regression_c1 from CCDB to net_regression_c1.onnx"; + } + if (evalMode[1] == "r2") { + m = pc.inputs().get("nn_regression_c2"); + const char* buffer = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { + dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); + LOG(info) << "(NN CLUS) Dumped nn_regression_c2 from CCDB to net_regression_c2.onnx"; + } + } + } if ((int32_t)(ptrs.tpcZS != nullptr) + (int32_t)(ptrs.tpcPackedDigits != nullptr && (ptrs.tpcZS == nullptr || ptrs.tpcPackedDigits->tpcDigitsMC == nullptr)) + (int32_t)(ptrs.clustersNative != nullptr) + (int32_t)(ptrs.tpcCompressedClusters != nullptr) != 1) { throw std::runtime_error("Invalid input for gpu tracking"); } @@ -1262,6 +1281,67 @@ Inputs GPURecoWorkflowSpec::inputs() } } + // NN clusterizer + *mConfParam = mConfig->ReadConfigurableParam(); + if (mConfig->configProcessing.nn.nnLoadFromCCDB) { + + LOG(info) << "(NN CLUS) Enabling fetching of TPC NN clusterizer from CCDB"; + mSpecConfig.nnLoadFromCCDB = true; + GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; + + std::map metadata; + metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 + metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = 
nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + + auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { + for (const auto& [key, value] : inputMap) { + if (value != "") { + outputMetadata.push_back({key, value}); + } + } + }; + + std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + std::vector ccdb_metadata; + + auto printSettings = [](const std::map& settings) { + LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; + for (const auto& [key, value] : settings) { + LOG(info) << " " << key << " : " << value; + } + }; + printSettings(metadata); + + if (evalMode[0] == "c1") { + metadata["nnCCDBEvalType"] = "classification_c1"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c1) from CCDB"; + } else if (evalMode[0] == "c2") { + metadata["nnCCDBEvalType"] = "classification_c2"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c2) from CCDB"; + } + + metadata["nnCCDBEvalType"] = "regression_c1"; + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer regression 
(r1) from CCDB"; + + if (evalMode[1] == "r2") { + metadata["nnCCDBEvalType"] = "regression_c2"; + convert_map_to_metadata(metadata, ccdb_metadata); + inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r2) from CCDB"; + } + } + return inputs; }; From a963c0191e3b4e37c81e09497512281907e58add Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 21 Nov 2025 16:28:19 +0100 Subject: [PATCH 08/16] Cleanup --- Detectors/TPC/calibration/CMakeLists.txt | 2 - .../TPCCalibration/NeuralNetworkClusterizer.h | 38 -------------- .../src/NeuralNetworkClusterizer.cxx | 50 ------------------- .../GPUTPCNNClusterizerHost.cxx | 2 +- GPU/Workflow/src/GPUWorkflowSpec.cxx | 25 ++++------ 5 files changed, 12 insertions(+), 105 deletions(-) delete mode 100644 Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h delete mode 100644 Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx diff --git a/Detectors/TPC/calibration/CMakeLists.txt b/Detectors/TPC/calibration/CMakeLists.txt index 8bcb3254edb32..e5cc25230d2fc 100644 --- a/Detectors/TPC/calibration/CMakeLists.txt +++ b/Detectors/TPC/calibration/CMakeLists.txt @@ -25,7 +25,6 @@ o2_add_library(TPCCalibration src/CalibPadGainTracksBase.cxx src/CalibLaserTracks.cxx src/LaserTracksCalibrator.cxx - src/NeuralNetworkClusterizer.cxx src/SACDecoder.cxx src/IDCAverageGroup.cxx src/IDCAverageGroupBase.cxx @@ -84,7 +83,6 @@ o2_target_root_dictionary(TPCCalibration include/TPCCalibration/FastHisto.h include/TPCCalibration/CalibLaserTracks.h include/TPCCalibration/LaserTracksCalibrator.h - include/TPCCalibration/NeuralNetworkClusterizer.h include/TPCCalibration/SACDecoder.h include/TPCCalibration/IDCAverageGroup.h include/TPCCalibration/IDCAverageGroupBase.h diff --git a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h 
b/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h deleted file mode 100644 index 196bba644714c..0000000000000 --- a/Detectors/TPC/calibration/include/TPCCalibration/NeuralNetworkClusterizer.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file NeuralNetworkClusterizer.h -/// \brief Fetching neural networks for clusterization from CCDB -/// \author Christian Sonnabend - -#ifndef AliceO2_TPC_NeuralNetworkClusterizer_h -#define AliceO2_TPC_NeuralNetworkClusterizer_h - -#include "CCDB/CcdbApi.h" - -namespace o2::tpc -{ - -class NeuralNetworkClusterizer -{ - public: - NeuralNetworkClusterizer() = default; - void initCcdbApi(std::string url); - void loadIndividualFromCCDB(std::map settings); - - private: - o2::ccdb::CcdbApi ccdbApi; - std::map metadata; - std::map headers; -}; - -} // namespace o2::tpc -#endif diff --git a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx b/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx deleted file mode 100644 index 5b0a06086e50d..0000000000000 --- a/Detectors/TPC/calibration/src/NeuralNetworkClusterizer.cxx +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. 
-// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file NeuralNetworkClusterizer.cxx -/// \brief Fetching neural networks for clusterization from CCDB -/// \author Christian Sonnabend - -#include -#include "TPCCalibration/NeuralNetworkClusterizer.h" - -using namespace o2::tpc; - -void NeuralNetworkClusterizer::initCcdbApi(std::string url) -{ - ccdbApi.init(url); -} - -void NeuralNetworkClusterizer::loadIndividualFromCCDB(std::map settings) -{ - metadata["inputDType"] = settings["inputDType"]; - metadata["outputDType"] = settings["outputDType"]; - metadata["nnCCDBEvalType"] = settings["nnCCDBEvalType"]; // classification_1C, classification_2C, regression_1C, regression_2C - metadata["nnCCDBWithMomentum"] = settings["nnCCDBWithMomentum"]; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = settings["nnCCDBLayerType"]; // FC, CNN - if (settings["nnCCDBInteractionRate"] != "" && std::stoi(settings["nnCCDBInteractionRate"]) > 0) { - metadata["nnCCDBInteractionRate"] = settings["nnCCDBInteractionRate"]; - } - if (settings["nnCCDBBeamType"] != "") { - metadata["nnCCDBBeamType"] = settings["nnCCDBBeamType"]; - } - - LOG(info) << "(NN CLUS) Retrieving network " << settings["nnCCDBPath"] << " from CCDB (NeuralNetworkClusterizer.cxx)"; - - bool retrieveSuccess = ccdbApi.retrieveBlob(settings["nnCCDBPath"], settings["outputFolder"], metadata, 1, false, settings["outputFile"]); - // headers = ccdbApi.retrieveHeaders(settings["nnPathCCDB"], metadata, 1); // potentially needed to init some local variables - - if (retrieveSuccess) { - LOG(info) << "Network " << settings["nnCCDBPath"] << " retrieved from CCDB, stored at " << settings["outputFile"]; - } else { 
- LOG(error) << "Failed to retrieve network from CCDB"; - } -} diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 582a0c6d7435a..77d5ee13f85fb 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -36,7 +36,7 @@ void GPUTPCNNClusterizerHost::init(const GPUSettingsProcessingNNclusterizer& set std::vector evalMode = o2::utils::Str::tokenize(settings.nnEvalMode, ':'); if (settings.nnLoadFromCCDB) { - reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to NeuralNetworkClusterizer.cxx, otherwise the networks might be loaded from the wrong place + reg_model_path = settings.nnLocalFolder + "/net_regression_c1.onnx"; // Needs to be set identical to GPUWorkflowSpec.cxx, otherwise the networks might be loaded from the wrong place if (evalMode[0] == "c1") { class_model_path = settings.nnLocalFolder + "/net_classification_c1.onnx"; } else if (evalMode[0] == "c2") { diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 32a2a572523ea..c5a75dcd762df 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -78,7 +78,6 @@ #include "DetectorsRaw/RDHUtils.h" #include "ITStracking/TrackingInterface.h" #include "GPUWorkflowInternal.h" -#include "TPCCalibration/NeuralNetworkClusterizer.h" // #include "Framework/ThreadPool.h" #include @@ -804,7 +803,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_classification_c1 from CCDB to net_classification_c1.onnx"; } } else if (evalMode[0] == "c2") { m = pc.inputs().get("nn_classification_c2"); @@ -812,7 +810,6 @@ void 
GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_classification_c2 from CCDB to net_classification_c2.onnx"; } } @@ -821,7 +818,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_regression_c1 from CCDB to net_regression_c1.onnx"; } if (evalMode[1] == "r2") { m = pc.inputs().get("nn_regression_c2"); @@ -829,7 +825,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) size_t size = DataRefUtils::getPayloadSize(m); if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); - LOG(info) << "(NN CLUS) Dumped nn_regression_c2 from CCDB to net_regression_c2.onnx"; } } } @@ -1292,9 +1287,9 @@ Inputs GPURecoWorkflowSpec::inputs() std::map metadata; metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 - metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN - metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { @@ -1308,13 +1303,15 @@ Inputs 
GPURecoWorkflowSpec::inputs() std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); std::vector ccdb_metadata; - auto printSettings = [](const std::map& settings) { - LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; - for (const auto& [key, value] : settings) { - LOG(info) << " " << key << " : " << value; - } - }; - printSettings(metadata); + if (mConfParam->printSettings) { + auto printSettings = [](const std::map& settings) { + LOG(info) << "(NN CLUS) NN Clusterizer CCDB settings:"; + for (const auto& [key, value] : settings) { + LOG(info) << " " << key << " : " << value; + } + }; + printSettings(metadata); + } if (evalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; From caf20fc089be9e1dc9a6fab02ab90814b2935627 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 21 Nov 2025 15:35:02 +0000 Subject: [PATCH 09/16] Please consider the following formatting changes --- GPU/Workflow/src/GPUWorkflowSpec.cxx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index c5a75dcd762df..a5bdc53911e58 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -779,17 +779,17 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // Compute the actual number of bytes to write std::size_t writeSize = (pos != buffer + size) - ? static_cast(pos - buffer) - : size; + ? 
static_cast(pos - buffer) + : size; std::ofstream out(path, std::ios::binary | std::ios::trunc); if (!out.is_open()) { - throw std::runtime_error("Failed to open ONNX output file: " + path); + throw std::runtime_error("Failed to open ONNX output file: " + path); } out.write(buffer, static_cast(writeSize)); if (!out) { - throw std::runtime_error("Failed while writing ONNX data to: " + path); + throw std::runtime_error("Failed while writing ONNX data to: " + path); } }; @@ -1285,12 +1285,12 @@ Inputs GPURecoWorkflowSpec::inputs() GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; std::map metadata; - metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 - metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 - metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model - metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN - metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz - metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb + metadata["inputDType"] = nnClusterizerSettings.nnInferenceInputDType; // FP16 or FP32 + metadata["outputDType"] = nnClusterizerSettings.nnInferenceOutputDType; // FP16 or FP32 + metadata["nnCCDBWithMomentum"] = nnClusterizerSettings.nnCCDBWithMomentum; // 0, 1 -> Only for regression model + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBClassificationLayerType; // FC, CNN + metadata["nnCCDBInteractionRate"] = nnClusterizerSettings.nnCCDBInteractionRate; // in kHz + metadata["nnCCDBBeamType"] = nnClusterizerSettings.nnCCDBBeamType; // pp, pPb, PbPb auto convert_map_to_metadata = [](const std::map& inputMap, std::vector& outputMetadata) { for (const auto& [key, value] : inputMap) { From 5284b01db9b48d2cd97ad569e378c25eabb267a9 Mon Sep 17 00:00:00 2001 From: Christian 
Sonnabend Date: Sun, 23 Nov 2025 11:52:54 +0100 Subject: [PATCH 10/16] Using char* buffer for model loading --- Common/ML/include/ML/OrtInterface.h | 2 +- Common/ML/src/OrtInterface.cxx | 7 +- GPU/GPUTracking/DataTypes/GPUDataTypes.h | 4 + .../Global/GPUChainTrackingClusterizer.cxx | 20 ++++- .../include/GPUWorkflow/GPUWorkflowSpec.h | 2 + GPU/Workflow/src/GPUWorkflowSpec.cxx | 77 +++---------------- GPU/Workflow/src/GPUWorkflowTPC.cxx | 70 +++++++++++++++++ 7 files changed, 109 insertions(+), 73 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 1893cf5a90179..987ce8fb4d6dd 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -51,7 +51,7 @@ class OrtModel void initOptions(std::unordered_map optionsMap); void initEnvironment(); void initSession(); - void initSessionFromBuffer(const void* buffer, size_t bufferSize); + void initSessionFromBuffer(const char* buffer, size_t bufferSize); void memoryOnDevice(int32_t = 0); bool isInitialized() { return mInitialized; } void resetSession(); diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 8d22d0075b887..8f88ab18dacbd 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -138,10 +138,13 @@ void OrtModel::initEnvironment() (mPImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events } -void OrtModel::initSessionFromBuffer(const void* buffer, size_t bufferSize) +void OrtModel::initSessionFromBuffer(const char* buffer, size_t bufferSize) { + mPImplOrt->sessionOptions.AddConfigEntry("session.load_model_format", "ONNX"); + mPImplOrt->sessionOptions.AddConfigEntry("session.use_ort_model_bytes_directly", "1"); + mPImplOrt->session = std::make_unique(*mPImplOrt->env, - static_cast(buffer), + buffer, bufferSize, mPImplOrt->sessionOptions); mPImplOrt->ioBinding = std::make_unique(*mPImplOrt->session); diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h 
b/GPU/GPUTracking/DataTypes/GPUDataTypes.h index 967d6a73914dd..e84ebe7231b34 100644 --- a/GPU/GPUTracking/DataTypes/GPUDataTypes.h +++ b/GPU/GPUTracking/DataTypes/GPUDataTypes.h @@ -182,6 +182,10 @@ struct GPUCalibObjectsTemplate { // use only pointers on PODs or flat objects he typename S::type* dEdxCalibContainer = nullptr; typename S>::type* o2Propagator = nullptr; typename S::type* itsPatternDict = nullptr; + + // NN clusterizer objects + char* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; // [c, r1, r2] networks as char arrays from CCDB + uint32_t nnClusterizerNetworkSizes[3] = {0, 0, 0}; }; typedef GPUCalibObjectsTemplate GPUCalibObjects; // NOTE: These 2 must have identical layout since they are memcopied typedef GPUCalibObjectsTemplate GPUCalibObjectsConst; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bfb0457744ce5..2b4003396e4f7 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -639,7 +639,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // Maximum of 4 lanes supported HighResTimer* nnTimers[12]; - if (GetProcessingSettings().nn.applyNNclusterizer) { + if (nn_settings.applyNNclusterizer) { int32_t deviceId = -1; int32_t numLanes = GetProcessingSettings().nTPCClustererLanes; int32_t maxThreads = mRec->getNKernelHostThreads(true); @@ -677,7 +677,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); // } // recreateMemoryAllocator = true; - (nnApplications[lane].mModelClass).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelClass).initSession(); // loads from file + } else { + 
(nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB + } } if (nnApplications[lane].mModelsUsed[1]) { SetONNXGPUStream(*(nnApplications[lane].mModelReg1).getSessionOptions(), lane, &deviceId); @@ -688,7 +692,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].mModelReg1).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelReg1).initSession(); // loads from file + } else { + (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB + } } if (nnApplications[lane].mModelsUsed[2]) { SetONNXGPUStream(*(nnApplications[lane].mModelReg2).getSessionOptions(), lane, &deviceId); @@ -699,7 +707,11 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); - (nnApplications[lane].mModelReg2).initSession(); + if (!nn_settings.nnLoadFromCCDB){ + (nnApplications[lane].mModelReg2).initSession(); // loads from file + } else { + (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB + } } if 
(nn_settings.nnClusterizerVerbosity > 0) { LOG(info) << "(ORT) Allocated ONNX stream for lane " << lane << " and device " << deviceId; diff --git a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h index 3655ffd2dd0dc..d610269abca81 100644 --- a/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h +++ b/GPU/Workflow/include/GPUWorkflow/GPUWorkflowSpec.h @@ -138,6 +138,8 @@ class GPURecoWorkflowSpec : public o2::framework::Task // NN clusterizer bool nnLoadFromCCDB = false; + bool nnDumpToFile = false; + std::vector nnEvalMode; }; GPURecoWorkflowSpec(CompletionPolicyData* policyData, Config const& specconfig, std::vector const& tpcsectors, uint64_t tpcSectorMask, std::shared_ptr& ggr, std::function** gPolicyOrder = nullptr); diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index a5bdc53911e58..11c26c40f3077 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -770,64 +770,6 @@ void GPURecoWorkflowSpec::run(ProcessingContext& pc) // ------------------------------ Actual processing ------------------------------ - if (mSpecConfig.nnLoadFromCCDB) { - LOG(info) << "(NN CLUS) Fetching CCDB calib objects"; - - auto dumpOnnxToFile = [](const char* buffer, std::size_t size, const std::string& path) { - const char* marker = "Accept-Ranges"; - const char* pos = std::search(buffer, buffer + size, marker, marker + std::strlen(marker)); - - // Compute the actual number of bytes to write - std::size_t writeSize = (pos != buffer + size) - ? 
static_cast(pos - buffer) - : size; - - std::ofstream out(path, std::ios::binary | std::ios::trunc); - if (!out.is_open()) { - throw std::runtime_error("Failed to open ONNX output file: " + path); - } - - out.write(buffer, static_cast(writeSize)); - if (!out) { - throw std::runtime_error("Failed while writing ONNX data to: " + path); - } - }; - - GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; - std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); - - DataRef m; - if (evalMode[0] == "c1") { - m = pc.inputs().get("nn_classification_c1"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_classification_c1.onnx"); - } - } else if (evalMode[0] == "c2") { - m = pc.inputs().get("nn_classification_c2"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_classification_c2.onnx"); - } - } - - m = pc.inputs().get("nn_regression_c1"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_regression_c1.onnx"); - } - if (evalMode[1] == "r2") { - m = pc.inputs().get("nn_regression_c2"); - const char* buffer = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - if (nnClusterizerSettings.nnCCDBDumpToFile == 1) { - dumpOnnxToFile(buffer, size, "net_regression_c2.onnx"); - } - } - } if ((int32_t)(ptrs.tpcZS != nullptr) + (int32_t)(ptrs.tpcPackedDigits != nullptr && (ptrs.tpcZS == nullptr || ptrs.tpcPackedDigits->tpcDigitsMC == nullptr)) + (int32_t)(ptrs.clustersNative != nullptr) + (int32_t)(ptrs.tpcCompressedClusters != nullptr) != 1) { throw std::runtime_error("Invalid input for gpu tracking"); 
} @@ -1138,6 +1080,12 @@ void GPURecoWorkflowSpec::doCalibUpdates(o2::framework::ProcessingContext& pc, c newCalibValues.tpcTimeBinCut = mConfig->configGRP.tpcCutTimeBin = mTPCCutAtTimeBin; needCalibUpdate = true; } + if (mSpecConfig.nnLoadFromCCDB) { + for (int i = 0; i < 3; i++) { + newCalibObjects.nnClusterizerNetworks[i] = mConfig->configCalib.nnClusterizerNetworks[i]; + newCalibObjects.nnClusterizerNetworkSizes[i] = mConfig->configCalib.nnClusterizerNetworkSizes[i]; + } + } if (needCalibUpdate) { LOG(info) << "Updating GPUReconstruction calibration objects"; mGPUReco->UpdateCalibration(newCalibObjects, newCalibValues); @@ -1282,6 +1230,7 @@ Inputs GPURecoWorkflowSpec::inputs() LOG(info) << "(NN CLUS) Enabling fetching of TPC NN clusterizer from CCDB"; mSpecConfig.nnLoadFromCCDB = true; + mSpecConfig.nnDumpToFile = mConfig->configProcessing.nn.nnCCDBDumpToFile; GPUSettingsProcessingNNclusterizer& nnClusterizerSettings = mConfig->configProcessing.nn; std::map metadata; @@ -1300,7 +1249,7 @@ Inputs GPURecoWorkflowSpec::inputs() } }; - std::vector evalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); + mSpecConfig.nnEvalMode = o2::utils::Str::tokenize(nnClusterizerSettings.nnEvalMode, ':'); std::vector ccdb_metadata; if (mConfParam->printSettings) { @@ -1313,29 +1262,25 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if (evalMode[0] == "c1") { + if (mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c1) from CCDB"; - } else if (evalMode[0] == "c2") { + } else if (mSpecConfig.nnEvalMode[0] == "c2") { metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); 
inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer classification (c2) from CCDB"; } metadata["nnCCDBEvalType"] = "regression_c1"; metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r1) from CCDB"; - if (evalMode[1] == "r2") { + if (mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - LOG(info) << "(NN CLUS) Loading NN clusterizer regression (r2) from CCDB"; } } diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 6606386819b64..2d7d63e1cf25b 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -405,6 +405,76 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon newCalibObjects.tpcPadGain = mCalibObjects.mTPCPadGainCalib.get(); mustUpdate = true; } + + // NN clusterizer networks + if (mSpecConfig.nnLoadFromCCDB) { + + auto findValidObjectEnd = [](const char* buffer, std::size_t size) { + const char* marker = "Accept-Ranges"; + std::size_t markerLen = std::strlen(marker); + + auto rpos = std::search( + std::make_reverse_iterator(buffer + size), + std::make_reverse_iterator(buffer), + std::make_reverse_iterator(marker + markerLen), + std::make_reverse_iterator(marker)); + + if (rpos == std::make_reverse_iterator(buffer)) { + return size; // Marker not found: keep full buffer + } + + const 
char* pos = rpos.base() - markerLen; // Convert reverse iterator back + return static_cast(pos - buffer); + }; + + auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open output file: " + path); + } + + out.write(buffer, static_cast(validSize)); + if (!out) { + throw std::runtime_error("Failed while writing data to: " + path); + } + }; + + DataRef m; + if (mSpecConfig.nnEvalMode[0] == "c1") { + m = pc.inputs().get("nn_classification_c1"); + mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c1.onnx"); + } + } else if (mSpecConfig.nnEvalMode[0] == "c2") { + m = pc.inputs().get("nn_classification_c2"); + mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c2.onnx"); + } + } + + m = pc.inputs().get("nn_regression_c1"); + mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], 
mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); + } + if (mSpecConfig.nnEvalMode[1] == "r2") { + m = pc.inputs().get("nn_regression_c2"); + mConfig->configCalib.nnClusterizerNetworks[3] = const_cast(m.payload); + size_t size = DataRefUtils::getPayloadSize(m); + mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[3], size); + if (mSpecConfig.nnDumpToFile) { + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[3], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); + } + } + } } return mustUpdate; } From ab19782deff6d71ba65fd774b1388991e6ead679 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Sun, 23 Nov 2025 10:53:45 +0000 Subject: [PATCH 11/16] Please consider the following formatting changes --- GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 2b4003396e4f7..5daf3043f6c47 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -677,7 +677,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); // } // recreateMemoryAllocator = true; - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelClass).initSession(); // loads from file } else { (nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB @@ -692,7 +692,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // 
(nnApplications[lane].mModelReg1).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg1).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelReg1).getEnv(), (nnApplications[lane].mModelReg1).getMemoryInfo(), mRec, recreateMemoryAllocator); - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg1).initSession(); // loads from file } else { (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB @@ -707,7 +707,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // (nnApplications[lane].mModelReg2).setEnv((nnApplications[lane].mModelClass).getEnv()); (nnApplications[lane].mModelReg2).initEnvironment(); // nnApplications[lane].directOrtAllocator((nnApplications[lane].mModelClass).getEnv(), (nnApplications[lane].mModelClass).getMemoryInfo(), mRec, recreateMemoryAllocator); - if (!nn_settings.nnLoadFromCCDB){ + if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg2).initSession(); // loads from file } else { (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB From 4fed621b94cc7583649b479bee3672c8acbfdb4c Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Sun, 23 Nov 2025 12:37:09 +0100 Subject: [PATCH 12/16] Bug-fix --- GPU/Workflow/src/GPUWorkflowTPC.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 2d7d63e1cf25b..35103a6da6842 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -459,19 +459,19 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon } m = pc.inputs().get("nn_regression_c1"); - 
mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); + mConfig->configCalib.nnClusterizerNetworks[1] = const_cast(m.payload); size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); + mConfig->configCalib.nnClusterizerNetworkSizes[1] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[1], size); if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[1], mConfig->configCalib.nnClusterizerNetworkSizes[1], "net_regression_c1.onnx"); } if (mSpecConfig.nnEvalMode[1] == "r2") { m = pc.inputs().get("nn_regression_c2"); - mConfig->configCalib.nnClusterizerNetworks[3] = const_cast(m.payload); + mConfig->configCalib.nnClusterizerNetworks[2] = const_cast(m.payload); size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[3], size); + mConfig->configCalib.nnClusterizerNetworkSizes[2] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[3], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); + dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c2.onnx"); } } } From e7cd6fa5d7c5a6cbebdb2c0ef31b689fa981ac76 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 11:26:01 +0100 Subject: [PATCH 13/16] Working version of CCDB fetching and loading into ROOT class of std::vector --- .../TPC/base/test/testTPCCDBInterface.cxx | 1 - GPU/GPUTracking/CMakeLists.txt | 1 + GPU/GPUTracking/DataTypes/GPUDataTypes.h | 4 +-
.../DataTypes/ORTRootSerializer.cxx | 25 ++ GPU/GPUTracking/DataTypes/ORTRootSerializer.h | 43 ++++ GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- .../GPUTrackingLinkDef_O2_DataTypes.h | 1 + .../Global/GPUChainTrackingClusterizer.cxx | 7 +- .../utils/convert_onnx_to_root_serialized.C | 220 ++++++++++++++++++ GPU/Workflow/src/GPUWorkflowSpec.cxx | 33 ++- GPU/Workflow/src/GPUWorkflowTPC.cxx | 76 ++---- 11 files changed, 339 insertions(+), 74 deletions(-) create mode 100644 GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx create mode 100644 GPU/GPUTracking/DataTypes/ORTRootSerializer.h create mode 100644 GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C diff --git a/Detectors/TPC/base/test/testTPCCDBInterface.cxx b/Detectors/TPC/base/test/testTPCCDBInterface.cxx index 3074c5e90a00c..5a5384a4134ed 100644 --- a/Detectors/TPC/base/test/testTPCCDBInterface.cxx +++ b/Detectors/TPC/base/test/testTPCCDBInterface.cxx @@ -22,7 +22,6 @@ // o2 includes #include "TPCBase/CDBInterface.h" -#include "TPCBase/CDBInterface.h" #include "TPCBase/CalArray.h" #include "TPCBase/CalDet.h" #include "TPCBase/Mapper.h" diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index 2a0b9b9edfa09..6dd718f07a9f1 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -209,6 +209,7 @@ set(SRCS_DATATYPES DataTypes/TPCPadBitMap.cxx DataTypes/TPCZSLinkMapping.cxx DataTypes/CalibdEdxContainer.cxx + DataTypes/ORTRootSerializer.cxx DataTypes/CalibdEdxTrackTopologyPol.cxx DataTypes/CalibdEdxTrackTopologySpline.cxx DataTypes/GPUTRDTrackO2.cxx) diff --git a/GPU/GPUTracking/DataTypes/GPUDataTypes.h b/GPU/GPUTracking/DataTypes/GPUDataTypes.h index e84ebe7231b34..8bf8084e048fd 100644 --- a/GPU/GPUTracking/DataTypes/GPUDataTypes.h +++ b/GPU/GPUTracking/DataTypes/GPUDataTypes.h @@ -85,6 +85,7 @@ class Cluster; namespace tpc { class CalibdEdxContainer; +class ORTRootSerializer; } // namespace tpc } // namespace o2 @@ -184,8 +185,7 @@ struct 
GPUCalibObjectsTemplate { // use only pointers on PODs or flat objects he typename S::type* itsPatternDict = nullptr; // NN clusterizer objects - char* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; // [c, r1, r2] networks as char arrays from CCDB - uint32_t nnClusterizerNetworkSizes[3] = {0, 0, 0}; + typename S::type* nnClusterizerNetworks[3] = {nullptr, nullptr, nullptr}; }; typedef GPUCalibObjectsTemplate GPUCalibObjects; // NOTE: These 2 must have identical layout since they are memcopied typedef GPUCalibObjectsTemplate GPUCalibObjectsConst; diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx b/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx new file mode 100644 index 0000000000000..82a8be1fdfec8 --- /dev/null +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.cxx @@ -0,0 +1,25 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file ORTRootSerializer.cxx +/// \author Christian Sonnabend + +#include "ORTRootSerializer.h" +#include + +using namespace o2::tpc; + +/// Initialize the serialization from a char* buffer containing the model +void ORTRootSerializer::setOnnxModel(const char* onnxModel, uint32_t size) +{ + mModelBuffer.resize(size); + std::memcpy(mModelBuffer.data(), onnxModel, size); +} diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h new file mode 100644 index 0000000000000..9952abfd55997 --- /dev/null +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h @@ -0,0 +1,43 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. 
+ +/// \file ORTRootSerializer.h +/// \brief Class to serialize ONNX objects for ROOT snapshots of CCDB objects at runtime +/// \author Christian Sonnabend + +#ifndef ALICEO2_TPC_ORTROOTSERIALIZER_H_ +#define ALICEO2_TPC_ORTROOTSERIALIZER_H_ + +#include "GPUCommonRtypes.h" +#include +#include + +namespace o2::tpc +{ + +class ORTRootSerializer +{ + public: + ORTRootSerializer() = default; + ~ORTRootSerializer() = default; + + void setOnnxModel(const char* onnxModel, uint32_t size); + const char* getONNXModel() const { return mModelBuffer.data(); } + uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } + + private: + std::vector mModelBuffer; ///< buffer for serialization + ClassDefNV(ORTRootSerializer, 1); +}; + +} // namespace o2::tpc + +#endif // ALICEO2_TPC_ORTROOTSERIALIZER_H_ diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index aa9e8a78c4484..06f681dea4c1a 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -277,8 +277,8 @@ AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the bo AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") -AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") +AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnSigmoidTrafoClassThreshold, int, 1, "", 0, "If 
true (default), then the classification threshold is transformed by an inverse sigmoid function. This depends on how the network was trained (with a sigmoid as acitvation function in the last layer or not).") AddOption(nnEvalMode, std::string, "c1:r1", "", 0, "Concatention of modes, e.g. c1:r1 (classification class 1, regression class 1)") AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classification output of the network is used to select clusters, else only the regression output is used and no clusters are rejected by classification") diff --git a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h index 46fd50464c69b..7bd2c689c5354 100644 --- a/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h +++ b/GPU/GPUTracking/GPUTrackingLinkDef_O2_DataTypes.h @@ -43,5 +43,6 @@ #pragma link C++ class o2::tpc::CalibdEdxTrackTopologyPol + ; #pragma link C++ class o2::tpc::CalibdEdxTrackTopologySpline + ; #pragma link C++ struct o2::tpc::CalibdEdxTrackTopologyPolContainer + ; +#pragma link C++ struct o2::tpc::ORTRootSerializer + ; #endif diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 5daf3043f6c47..5426f0eafdad6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -47,6 +47,7 @@ #ifdef GPUCA_HAS_ONNX #include "GPUTPCNNClusterizerKernels.h" #include "GPUTPCNNClusterizerHost.h" +#include "ORTRootSerializer.h" #endif #ifdef GPUCA_O2_LIB @@ -680,7 +681,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelClass).initSession(); // loads from file } else { - (nnApplications[lane].mModelClass).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[0], processors()->calibObjects.nnClusterizerNetworkSizes[0]); // loads from CCDB + 
(nnApplications[lane].mModelClass).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[0])->getONNXModelSize()); // loads from CCDB } } if (nnApplications[lane].mModelsUsed[1]) { @@ -695,7 +696,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg1).initSession(); // loads from file } else { - (nnApplications[lane].mModelReg1).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[1], processors()->calibObjects.nnClusterizerNetworkSizes[1]); // loads from CCDB + (nnApplications[lane].mModelReg1).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[1])->getONNXModelSize()); // loads from CCDB } } if (nnApplications[lane].mModelsUsed[2]) { @@ -710,7 +711,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (!nn_settings.nnLoadFromCCDB) { (nnApplications[lane].mModelReg2).initSession(); // loads from file } else { - (nnApplications[lane].mModelReg2).initSessionFromBuffer(processors()->calibObjects.nnClusterizerNetworks[2], processors()->calibObjects.nnClusterizerNetworkSizes[2]); // loads from CCDB + (nnApplications[lane].mModelReg2).initSessionFromBuffer((processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModel(), (processors()->calibObjects.nnClusterizerNetworks[2])->getONNXModelSize()); // loads from CCDB } } if (nn_settings.nnClusterizerVerbosity > 0) { diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C new file mode 100644 index 0000000000000..1970c57353e54 --- /dev/null +++ b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C @@ -0,0 +1,220 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. 
+// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file convert_onnx_to_root_serialized.C +/// \brief Utility functions to be executed as a ROOT macro for uploading ONNX models to CCDB as ROOT serialized objects and vice versa +/// \author Christian Sonnabend + +// Example execution: root -l -b -q '/scratch/csonnabe/MyO2/O2/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C("/scratch/csonnabe/PhD/jobs/clusterization/NN/output/21082025_smallWindow_clean/SC/training_data_21082025_reco_noise_supressed_p3t6_CoGselected/SC/PbPb_24arp2/0_5/class1/regression/399_noMom/network/net_fp16.onnx", "", 1, 1, "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", 1, 4108971600000, "Users/c/csonnabe/TPC/Clusterization", "model.root")' + +#include "ORTRootSerializer.h" +#include "CCDB/CcdbApi.h" +#include "CCDB/CcdbObjectInfo.h" +#include "TFile.h" +#include +#include + +o2::tpc::ORTRootSerializer serializer; + +/// Dumps the char* to a .onnx file -> Directly readable by ONNX runtime or Netron +void dumpOnnxToFile(const char* modelBuffer, uint32_t size, const std::string outputPath) +{ + std::ofstream outFile(outputPath, std::ios::binary | std::ios::trunc); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open output ONNX file: " + outputPath); + } + outFile.write(modelBuffer, static_cast(size)); + if (!outFile) { + throw std::runtime_error("Failed while writing data to: " + outputPath); + } + 
outFile.close(); +} + +/// Initialize the serialization from an ONNX file +void readOnnxModelFromFile(const std::string modelPath) +{ + std::ifstream inFile(modelPath, std::ios::binary | std::ios::ate); + if (!inFile.is_open()) { + throw std::runtime_error("Could not open input ONNX file " + modelPath); + } + std::streamsize size = inFile.tellg(); + std::vector mModelBuffer(size); + inFile.seekg(0, std::ios::beg); + if (!inFile.read(mModelBuffer.data(), size)) { + throw std::runtime_error("Could not read input ONNX file " + modelPath); + } + inFile.close(); + serializer.setOnnxModel(mModelBuffer.data(), static_cast(size)); +} + +/// Initialize the serialization from a ROOT file +void readRootModelFromFile(const std::string rootFilePath, std::string key) +{ + TFile inRootFile(rootFilePath.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + rootFilePath); + } + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + throw std::runtime_error("Could not find " + key + " in ROOT file " + rootFilePath); + } + serializer = *serPtr; + inRootFile.Close(); +} + +/// Serialize the ONNX model to a ROOT object and store to file +void onnxToRoot(std::string infile, std::string outfile, std::string key) +{ + readOnnxModelFromFile(infile); + TFile outRootFile(outfile.c_str(), "RECREATE"); + if (outRootFile.IsZombie()) { + throw std::runtime_error("Could not create output ROOT file " + outfile); + } + outRootFile.WriteObject(&serializer, key.c_str()); + outRootFile.Close(); +} + +/// Deserialize the ONNX model from a ROOT object and store to a .onnx file +void rootToOnnx(std::string infile, std::string outfile, std::string key) +{ + TFile inRootFile(infile.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + infile); + } + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + throw std::runtime_error("Could not find " + key + " in ROOT file " + infile); + } + 
serializer = *serPtr; + + std::ofstream outFile(outfile, std::ios::binary | std::ios::trunc); + if (!outFile.is_open()) { + throw std::runtime_error("Failed to open output ONNX file: " + outfile); + } + outFile.write(serializer.getONNXModel(), static_cast(serializer.getONNXModelSize())); + if (!outFile) { + throw std::runtime_error("Failed while writing data to: " + outfile); + } + outFile.close(); + + inRootFile.Close(); +} + +/// Upload the ONNX model to CCDB from an ONNX file +/// !!! Adjust the metadata, path and validity !!! +void uploadToCCDBFromONNX(std::string onnxFile, + const std::map& metadata, + // { // some example metadata entries + // "nnCCDBLayerType": "FC", + // "nnCCDBWithMomentum": "0", + // "inputDType": "FP16", + // "nnCCDBInteractionRate": "500", + // "outputDType": "FP16", + // "nnCCDBEvalType": "regression_c1", + // "nnCCDBBeamType": "pp", + // "partName": "blob", + // "quality": "3" + // } + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) +{ + readOnnxModelFromFile(onnxFile); + + o2::ccdb::CcdbApi api; + api.init(ccdbUrl); + + // build full CCDB path including filename + const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + + api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); +} + +/// Upload the ONNX model to CCDB from a ROOT file +/// !!! Adjust the metadata, path and validity !!! 
+void uploadToCCDBFromROOT(std::string rootFile, + const std::map& metadata, + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) +{ + // read ROOT file, extract ORTRootSerializer object and upload via storeAsTFileAny + TFile inRootFile(rootFile.c_str()); + if (inRootFile.IsZombie()) { + throw std::runtime_error("Could not open input ROOT file " + rootFile); + } + + // if objname is empty, fall back to default CCDB object key + const std::string key = objname.empty() ? o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY : objname; + + auto* serPtr = inRootFile.Get(key.c_str()); + if (!serPtr) { + inRootFile.Close(); + throw std::runtime_error("Could not find " + key + " in ROOT file " + rootFile); + } + serializer = *serPtr; + + o2::ccdb::CcdbApi api; + api.init(ccdbUrl); + + // build full CCDB path including filename + const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + + api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); + + inRootFile.Close(); +} + +void convert_onnx_to_root_serialized(const std::string& onnxFile, + const std::string& rootFile, + int mode = 0, + int ccdbUpload = 0, + const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", + long tsMin = 1, + long tsMax = 4108971600000, + std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", + std::string objname = "net_regression_r1.root", + std::string ccdbUrl = "http://alice-ccdb.cern.ch") +{ + // parse metadataStr of the form key=value/key2=value2/... 
+ std::map metadata; + std::size_t start = 0; + while (start < metadataStr.size()) { + auto sep = metadataStr.find('/', start); + auto token = metadataStr.substr(start, sep == std::string::npos ? std::string::npos : sep - start); + if (!token.empty()) { + auto eq = token.find('='); + if (eq != std::string::npos && eq > 0 && eq + 1 < token.size()) { + metadata.emplace(token.substr(0, eq), token.substr(eq + 1)); + } + } + if (sep == std::string::npos) { + break; + } + start = sep + 1; + } + + if (ccdbUpload == 0){ + if (mode == 0) + onnxToRoot(onnxFile, rootFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); + else if (mode == 1) + rootToOnnx(rootFile, onnxFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); + } else if (ccdbUpload == 1){ + if (mode == 0) + uploadToCCDBFromROOT(rootFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); + else if (mode == 1) + uploadToCCDBFromONNX(onnxFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); + } +} diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 11c26c40f3077..c0e46cc7b75f3 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -54,6 +54,7 @@ #include "GPUO2Interface.h" #include "GPUO2InterfaceUtils.h" #include "CalibdEdxContainer.h" +#include "ORTRootSerializer.h" #include "GPUNewCalibValues.h" #include "TPCPadGainCalib.h" #include "TPCZSLinkMapping.h" @@ -1081,9 +1082,24 @@ void GPURecoWorkflowSpec::doCalibUpdates(o2::framework::ProcessingContext& pc, c needCalibUpdate = true; } if (mSpecConfig.nnLoadFromCCDB) { + auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { + std::ofstream out(path, std::ios::binary | std::ios::trunc); + if (!out.is_open()) { + throw std::runtime_error("Failed to open output file: " + path); + } + + out.write(buffer, static_cast(validSize)); + if (!out) { + throw std::runtime_error("Failed while writing data to: " + path); + } + }; for (int i = 0; i < 3; i++) { 
newCalibObjects.nnClusterizerNetworks[i] = mConfig->configCalib.nnClusterizerNetworks[i]; - newCalibObjects.nnClusterizerNetworkSizes[i] = mConfig->configCalib.nnClusterizerNetworkSizes[i]; + if (mSpecConfig.nnDumpToFile && newCalibObjects.nnClusterizerNetworks[i]) { + std::string path = "tpc_nn_clusterizer_" + std::to_string(i) + ".onnx"; + dumpToFile(newCalibObjects.nnClusterizerNetworks[i]->getONNXModel(), newCalibObjects.nnClusterizerNetworks[i]->getONNXModelSize(), path); + LOG(info) << "Dumped TPC clusterizer NN " << i << " to file " << path; + } } } if (needCalibUpdate) { @@ -1262,25 +1278,26 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if (mSpecConfig.nnEvalMode[0] == "c1") { + if(mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_classification_c1", "TPC", "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); - } else if (mSpecConfig.nnEvalMode[0] == "c2") { + inputs.emplace_back("nn_classification_c1", gDataOriginTPC, "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); + } else if(mSpecConfig.nnEvalMode[0] == "c2") { + metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_classification_c2", "TPC", "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_classification_c2", gDataOriginTPC, "NNCLUSTERIZER_C2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); } metadata["nnCCDBEvalType"] = "regression_c1"; metadata["nnCCDBLayerType"] = 
nnClusterizerSettings.nnCCDBRegressionLayerType; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_regression_c1", "TPC", "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_regression_c1", gDataOriginTPC, "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - if (mSpecConfig.nnEvalMode[1] == "r2") { + if(mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); - inputs.emplace_back("nn_regression_c2", "TPC", "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath, ccdb_metadata, 0)); + inputs.emplace_back("nn_regression_c2", gDataOriginTPC, "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); } } diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index 35103a6da6842..ae9aabda1fd74 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -49,6 +49,7 @@ #include "GPUO2Interface.h" #include "GPUO2InterfaceUtils.h" #include "CalibdEdxContainer.h" +#include "ORTRootSerializer.h" #include "GPUNewCalibValues.h" #include "TPCPadGainCalib.h" #include "TPCZSLinkMapping.h" @@ -293,6 +294,18 @@ void GPURecoWorkflowSpec::finaliseCCDBTPC(ConcreteDataMatcher& matcher, void* ob mTPCDeadChannelMapCreator->getDeadChannelMapFEE().getSum(), mTPCDeadChannelMapCreator->getDeadChannelMap().getSum()); } else if (mTPCVDriftHelper->accountCCDBInputs(matcher, obj)) { } else if (mCalibObjects.mFastTransformHelper->accountCCDBInputs(matcher, obj)) { + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C1", 0)) { + mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); + LOG(info) << 
"(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c1"; + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C2", 0)) { + mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c2"; + } else if(matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)){ + mConfig->configCalib.nnClusterizerNetworks[1] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[1])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c1"; + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R2", 0)) { + mConfig->configCalib.nnClusterizerNetworks[2] = static_cast(obj); + LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[2])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c2"; } } @@ -409,70 +422,15 @@ bool GPURecoWorkflowSpec::fetchCalibsCCDBTPC(ProcessingCon // NN clusterizer networks if (mSpecConfig.nnLoadFromCCDB) { - auto findValidObjectEnd = [](const char* buffer, std::size_t size) { - const char* marker = "Accept-Ranges"; - std::size_t markerLen = std::strlen(marker); - - auto rpos = std::search( - std::make_reverse_iterator(buffer + size), - std::make_reverse_iterator(buffer), - std::make_reverse_iterator(marker + markerLen), - std::make_reverse_iterator(marker)); - - if (rpos == std::make_reverse_iterator(buffer)) { - return size; // Marker not found: keep full buffer - } - - const char* pos = rpos.base() - markerLen; // Convert reverse iterator back - return static_cast(pos - buffer); - }; - - auto dumpToFile = [](const char* buffer, std::size_t validSize, const std::string& path) { - std::ofstream out(path, std::ios::binary | std::ios::trunc); - if (!out.is_open()) { - throw 
std::runtime_error("Failed to open output file: " + path); - } - - out.write(buffer, static_cast(validSize)); - if (!out) { - throw std::runtime_error("Failed while writing data to: " + path); - } - }; - - DataRef m; if (mSpecConfig.nnEvalMode[0] == "c1") { - m = pc.inputs().get("nn_classification_c1"); - mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c1.onnx"); - } + pc.inputs().get("nn_classification_c1"); } else if (mSpecConfig.nnEvalMode[0] == "c2") { - m = pc.inputs().get("nn_classification_c2"); - mConfig->configCalib.nnClusterizerNetworks[0] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[0] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[0], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[0], mConfig->configCalib.nnClusterizerNetworkSizes[0], "net_classification_c2.onnx"); - } + pc.inputs().get("nn_classification_c2"); } - m = pc.inputs().get("nn_regression_c1"); - mConfig->configCalib.nnClusterizerNetworks[1] = const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[1] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[1], mConfig->configCalib.nnClusterizerNetworkSizes[2], "net_regression_c1.onnx"); - } + pc.inputs().get("nn_regression_c1"); if (mSpecConfig.nnEvalMode[1] == "r2") { - m = pc.inputs().get("nn_regression_c2"); - mConfig->configCalib.nnClusterizerNetworks[2] = 
const_cast(m.payload); - size_t size = DataRefUtils::getPayloadSize(m); - mConfig->configCalib.nnClusterizerNetworkSizes[3] = findValidObjectEnd(mConfig->configCalib.nnClusterizerNetworks[2], size); - if (mSpecConfig.nnDumpToFile) { - dumpToFile(mConfig->configCalib.nnClusterizerNetworks[2], mConfig->configCalib.nnClusterizerNetworkSizes[3], "net_regression_c2.onnx"); - } + pc.inputs().get("nn_regression_c2"); } } } From 9ed60e981652ff968c7a563246141e20f8404609 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Tue, 25 Nov 2025 10:26:45 +0000 Subject: [PATCH 14/16] Please consider the following formatting changes --- GPU/GPUTracking/DataTypes/ORTRootSerializer.h | 18 ++--- .../utils/convert_onnx_to_root_serialized.C | 72 +++++++++---------- GPU/Workflow/src/GPUWorkflowSpec.cxx | 6 +- GPU/Workflow/src/GPUWorkflowTPC.cxx | 2 +- 4 files changed, 49 insertions(+), 49 deletions(-) diff --git a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h index 9952abfd55997..24009d4435a96 100644 --- a/GPU/GPUTracking/DataTypes/ORTRootSerializer.h +++ b/GPU/GPUTracking/DataTypes/ORTRootSerializer.h @@ -25,17 +25,17 @@ namespace o2::tpc class ORTRootSerializer { - public: - ORTRootSerializer() = default; - ~ORTRootSerializer() = default; + public: + ORTRootSerializer() = default; + ~ORTRootSerializer() = default; - void setOnnxModel(const char* onnxModel, uint32_t size); - const char* getONNXModel() const { return mModelBuffer.data(); } - uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } + void setOnnxModel(const char* onnxModel, uint32_t size); + const char* getONNXModel() const { return mModelBuffer.data(); } + uint32_t getONNXModelSize() const { return static_cast(mModelBuffer.size()); } - private: - std::vector mModelBuffer; ///< buffer for serialization - ClassDefNV(ORTRootSerializer, 1); + private: + std::vector mModelBuffer; ///< buffer for serialization + ClassDefNV(ORTRootSerializer, 1); }; } // 
namespace o2::tpc diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C index 1970c57353e54..b1b8b981393a1 100644 --- a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C +++ b/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C @@ -111,23 +111,23 @@ void rootToOnnx(std::string infile, std::string outfile, std::string key) /// Upload the ONNX model to CCDB from an ONNX file /// !!! Adjust the metadata, path and validity !!! void uploadToCCDBFromONNX(std::string onnxFile, - const std::map& metadata, - // { // some example metadata entries - // "nnCCDBLayerType": "FC", - // "nnCCDBWithMomentum": "0", - // "inputDType": "FP16", - // "nnCCDBInteractionRate": "500", - // "outputDType": "FP16", - // "nnCCDBEvalType": "regression_c1", - // "nnCCDBBeamType": "pp", - // "partName": "blob", - // "quality": "3" - // } - long tsMin /* = 1 */, - long tsMax /* = 4108971600000 */, - std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, - std::string objname /* = "net_regression_r1.root" */, - std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) + const std::map& metadata, + // { // some example metadata entries + // "nnCCDBLayerType": "FC", + // "nnCCDBWithMomentum": "0", + // "inputDType": "FP16", + // "nnCCDBInteractionRate": "500", + // "outputDType": "FP16", + // "nnCCDBEvalType": "regression_c1", + // "nnCCDBBeamType": "pp", + // "partName": "blob", + // "quality": "3" + // } + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) { readOnnxModelFromFile(onnxFile); @@ -135,7 +135,7 @@ void uploadToCCDBFromONNX(std::string onnxFile, api.init(ccdbUrl); // build full CCDB path including filename - const std::string fullPath = ccdbPath;//.back() == '/' ? 
(ccdbPath + objname) : (ccdbPath + "/" + objname); + const std::string fullPath = ccdbPath; //.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); } @@ -143,12 +143,12 @@ void uploadToCCDBFromONNX(std::string onnxFile, /// Upload the ONNX model to CCDB from a ROOT file /// !!! Adjust the metadata, path and validity !!! void uploadToCCDBFromROOT(std::string rootFile, - const std::map& metadata, - long tsMin /* = 1 */, - long tsMax /* = 4108971600000 */, - std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, - std::string objname /* = "net_regression_r1.root" */, - std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) + const std::map& metadata, + long tsMin /* = 1 */, + long tsMax /* = 4108971600000 */, + std::string ccdbPath /* = "Users/c/csonnabe/TPC/Clusterization" */, + std::string objname /* = "net_regression_r1.root" */, + std::string ccdbUrl /* = "http://alice-ccdb.cern.ch" */) { // read ROOT file, extract ORTRootSerializer object and upload via storeAsTFileAny TFile inRootFile(rootFile.c_str()); @@ -170,7 +170,7 @@ void uploadToCCDBFromROOT(std::string rootFile, api.init(ccdbUrl); // build full CCDB path including filename - const std::string fullPath = ccdbPath;//.back() == '/' ? (ccdbPath + objname) : (ccdbPath + "/" + objname); + const std::string fullPath = ccdbPath; //.back() == '/' ? 
(ccdbPath + objname) : (ccdbPath + "/" + objname); api.storeAsTFileAny(&serializer, fullPath, metadata, tsMin, tsMax); @@ -178,15 +178,15 @@ void uploadToCCDBFromROOT(std::string rootFile, } void convert_onnx_to_root_serialized(const std::string& onnxFile, - const std::string& rootFile, - int mode = 0, - int ccdbUpload = 0, - const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", - long tsMin = 1, - long tsMax = 4108971600000, - std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", - std::string objname = "net_regression_r1.root", - std::string ccdbUrl = "http://alice-ccdb.cern.ch") + const std::string& rootFile, + int mode = 0, + int ccdbUpload = 0, + const std::string& metadataStr = "nnCCDBLayerType=FC/nnCCDBWithMomentum=0/inputDType=FP16/nnCCDBInteractionRate=500/outputDType=FP16/nnCCDBEvalType=regression_c1/nnCCDBBeamType=pp/partName=blob/quality=3", + long tsMin = 1, + long tsMax = 4108971600000, + std::string ccdbPath = "Users/c/csonnabe/TPC/Clusterization", + std::string objname = "net_regression_r1.root", + std::string ccdbUrl = "http://alice-ccdb.cern.ch") { // parse metadataStr of the form key=value/key2=value2/... 
std::map metadata; @@ -206,12 +206,12 @@ void convert_onnx_to_root_serialized(const std::string& onnxFile, start = sep + 1; } - if (ccdbUpload == 0){ + if (ccdbUpload == 0) { if (mode == 0) onnxToRoot(onnxFile, rootFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); else if (mode == 1) rootToOnnx(rootFile, onnxFile, o2::ccdb::CcdbApi::CCDBOBJECT_ENTRY); - } else if (ccdbUpload == 1){ + } else if (ccdbUpload == 1) { if (mode == 0) uploadToCCDBFromROOT(rootFile, metadata, tsMin, tsMax, ccdbPath, objname, ccdbUrl); else if (mode == 1) diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index c0e46cc7b75f3..d7ea772c31653 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -1278,11 +1278,11 @@ Inputs GPURecoWorkflowSpec::inputs() printSettings(metadata); } - if(mSpecConfig.nnEvalMode[0] == "c1") { + if (mSpecConfig.nnEvalMode[0] == "c1") { metadata["nnCCDBEvalType"] = "classification_c1"; convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_classification_c1", gDataOriginTPC, "NNCLUSTERIZER_C1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - } else if(mSpecConfig.nnEvalMode[0] == "c2") { + } else if (mSpecConfig.nnEvalMode[0] == "c2") { metadata["nnCCDBLayerType"] = nnClusterizerSettings.nnCCDBRegressionLayerType; metadata["nnCCDBEvalType"] = "classification_c2"; convert_map_to_metadata(metadata, ccdb_metadata); @@ -1294,7 +1294,7 @@ Inputs GPURecoWorkflowSpec::inputs() convert_map_to_metadata(metadata, ccdb_metadata); inputs.emplace_back("nn_regression_c1", gDataOriginTPC, "NNCLUSTERIZER_R1", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); - if(mSpecConfig.nnEvalMode[1] == "r2") { + if (mSpecConfig.nnEvalMode[1] == "r2") { metadata["nnCCDBEvalType"] = "regression_c2"; convert_map_to_metadata(metadata, ccdb_metadata); 
inputs.emplace_back("nn_regression_c2", gDataOriginTPC, "NNCLUSTERIZER_R2", 0, Lifetime::Condition, ccdbParamSpec(nnClusterizerSettings.nnCCDBPath + "/" + metadata["nnCCDBEvalType"], ccdb_metadata, 0)); diff --git a/GPU/Workflow/src/GPUWorkflowTPC.cxx b/GPU/Workflow/src/GPUWorkflowTPC.cxx index ae9aabda1fd74..13a3c4b6162b8 100644 --- a/GPU/Workflow/src/GPUWorkflowTPC.cxx +++ b/GPU/Workflow/src/GPUWorkflowTPC.cxx @@ -300,7 +300,7 @@ void GPURecoWorkflowSpec::finaliseCCDBTPC(ConcreteDataMatcher& matcher, void* ob } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_C2", 0)) { mConfig->configCalib.nnClusterizerNetworks[0] = static_cast(obj); LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[0])->getONNXModelSize() << " bytes loaded for NN clusterizer: classification_c2"; - } else if(matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)){ + } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R1", 0)) { mConfig->configCalib.nnClusterizerNetworks[1] = static_cast(obj); LOG(info) << "(NN CLUS) " << (mConfig->configCalib.nnClusterizerNetworks[1])->getONNXModelSize() << " bytes loaded for NN clusterizer: regression_c1"; } else if (matcher == ConcreteDataMatcher(gDataOriginTPC, "NNCLUSTERIZER_R2", 0)) { From 6cba1f3caa943f252177372caaee05d7922c226f Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 11:29:51 +0100 Subject: [PATCH 15/16] Disable dumpToFile by default --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 06f681dea4c1a..28cb2dd6dfbc3 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -285,7 +285,7 @@ AddOption(nnClusterizerUseClassification, int, 1, "", 0, "If 1, the classificati AddOption(nnClusterizerForceGpuInputFill, int, 0, "", 0, 
"Forces to use the fillInputNNGPU function") // CCDB AddOption(nnLoadFromCCDB, int, 0, "", 0, "If 1 networks are fetched from ccdb, else locally") -AddOption(nnCCDBDumpToFile, int, 1, "", 0, "If 1, additionally dump fetched CCDB networks to nnLocalFolder") +AddOption(nnCCDBDumpToFile, int, 0, "", 0, "If 1, additionally dump fetched CCDB networks to nnLocalFolder") AddOption(nnLocalFolder, std::string, ".", "", 0, "Local folder in which the networks will be fetched") AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, "Folder path containing the networks") AddOption(nnCCDBWithMomentum, std::string, "", "", 0, "Distinguishes between the network with and without momentum output for the regression") From 5c6d214522dd0b6a1a0ff4d72039880cdbbe23d8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 25 Nov 2025 21:05:11 +0100 Subject: [PATCH 16/16] Moving macro, adding o2-test --- GPU/GPUTracking/Definitions/GPUSettingsList.h | 2 +- macro/CMakeLists.txt | 4 ++++ .../utils => macro}/convert_onnx_to_root_serialized.C | 0 3 files changed, 5 insertions(+), 1 deletion(-) rename {GPU/GPUTracking/utils => macro}/convert_onnx_to_root_serialized.C (100%) diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 28cb2dd6dfbc3..dc1742453ef39 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -291,7 +291,7 @@ AddOption(nnCCDBPath, std::string, "Users/c/csonnabe/TPC/Clusterization", "", 0, AddOption(nnCCDBWithMomentum, std::string, "", "", 0, "Distinguishes between the network with and without momentum output for the regression") AddOption(nnCCDBClassificationLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. Options: FC, CNN") AddOption(nnCCDBRegressionLayerType, std::string, "FC", "", 0, "Distinguishes between network with different layer types. 
Options: FC, CNN") -AddOption(nnCCDBBeamType, std::string, "PbPb", "", 0, "Distinguishes between networks trained for different beam types. Options: PbPb, pp, pPb") +AddOption(nnCCDBBeamType, std::string, "pp", "", 0, "Distinguishes between networks trained for different beam types. Options: pp, pPb, PbPb") AddOption(nnCCDBInteractionRate, std::string, "500", "", 0, "Distinguishes between networks for different interaction rates [kHz].") AddHelp("help", 'h') EndConfig() diff --git a/macro/CMakeLists.txt b/macro/CMakeLists.txt index 843ad4a3be0ab..b5c51e50d3ffb 100644 --- a/macro/CMakeLists.txt +++ b/macro/CMakeLists.txt @@ -58,6 +58,7 @@ install(FILES CheckDigits_mft.C CreateGRPLHCIFObject.C getTimeStamp.C CreateSampleIRFrames.C + convert_onnx_to_root_serialized.C DESTINATION share/macro/) # FIXME: a lot of macros that are here should really be elsewhere. Those which @@ -149,6 +150,9 @@ o2_add_test_root_macro(checkTOFMatching.C O2::SimulationDataFormat O2::DataFormatsTOF) +o2_add_test_root_macro(convert_onnx_to_root_serialized.C + PUBLIC_LINK_LIBRARIES O2::GlobalTracking) + # FIXME: move to subsystem dir o2_add_test_root_macro(compareTopologyDistributions.C PUBLIC_LINK_LIBRARIES O2::DataFormatsITSMFT diff --git a/GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C b/macro/convert_onnx_to_root_serialized.C similarity index 100% rename from GPU/GPUTracking/utils/convert_onnx_to_root_serialized.C rename to macro/convert_onnx_to_root_serialized.C