diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h index 7879789bf91c8..8fe8e8ca68e44 100644 --- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h +++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h @@ -497,6 +497,7 @@ #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass2Regression GPUCA_LB_GPUTPCNNClusterizerKernels + #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishDeconvolutionFlags GPUCA_LB_GPUTPCNNClusterizerKernels #define GPUCA_LB_GPUTPCCFStreamCompaction_scanStart GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE #define GPUCA_LB_GPUTPCCFStreamCompaction_scanUp GPUCA_PAR_CF_SCAN_WORKGROUP_SIZE diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index d8173a5b62a35..e75d5a5890e4b 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -269,6 +269,7 @@ AddOption(nnClusterizerBatchedMode, unsigned int, 1, "", 0, "(int, default = 1) AddOption(nnClusterizerVerbosity, int, -1, "", 0, "(int, default = -1) If >0, logging messages of the clusterizer will be displayed") AddOption(nnClusterizerBoundaryFillValue, int, -1, "", 0, "Fill value for the boundary of the input to the NN") AddOption(nnClusterizerApplyNoiseSuppression, int, 1, "", 0, "Applies the NoiseSuppression kernel before the digits to the network are filled") +AddOption(nnClusterizerSetDeconvolutionFlags, int, 1, "", 0, "Runs the deconvolution kernel without overwriting the charge in order to make cluster-to-track attachment identical to heuristic CF") AddOption(nnClassificationPath, std::string, "network_class.onnx", "", 0, "The classification network path") AddOption(nnClassThreshold, float, 0.5, "", 0, "The cutoff at which clusters will be accepted / rejected.") AddOption(nnRegressionPath, std::string, "network_reg.onnx", "", 0, "The regression network path") diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index 64e6f5a31aaa7..816ee43d50b15 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -989,8 +989,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) int withMC = (doGPU && propagateMCLabels); if (clustererNNShadow.mNnClusterizerUseCfRegression || (int)(nn_settings.nnClusterizerApplyCfDeconvolution)) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + } else if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, false); } // float time_clusterizer = 0, time_fill = 0, time_networks = 0; @@ -1001,6 +1003,10 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) // auto start0 = std::chrono::high_resolution_clock::now(); runKernel({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the data + if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) { + runKernel({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, withMC, batchStart); // Filling the regression data + } + // auto stop0 = std::chrono::high_resolution_clock::now(); // auto start1 = std::chrono::high_resolution_clock::now(); @@ -1102,7 +1108,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) GPUFatal("Project not compiled with neural network clusterization. Aborting."); #endif } else { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true); DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSector}}, 0); } diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx index 429d51685e504..d6b8703a9b35d 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.cxx @@ -22,18 +22,19 @@ using namespace o2::gpu; using namespace o2::gpu::tpccf; template <> -GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer) +GPUdii() void GPUTPCCFDeconvolution::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t overwriteCharge) { CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); CfArray2D isPeakMap(clusterer.mPpeakMap); - GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions); + GPUTPCCFDeconvolution::deconvolutionImpl(get_num_groups(0), get_local_size(0), get_group_id(0), get_local_id(0), smem, isPeakMap, chargeMap, clusterer.mPpositions, clusterer.mPmemory->counters.nPositions, overwriteCharge); } GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, const CfArray2D& peakMap, CfArray2D& chargeMap, const CfChargePos* positions, - const uint32_t digitnum) + const uint32_t digitnum, + uint8_t overwriteCharge) { SizeT idx = get_global_id(0); @@ -111,9 +112,14 @@ GPUdii() void GPUTPCCFDeconvolution::deconvolutionImpl(int32_t nBlocks, int32_t peakCount = (peakCount == 0) ? 1 : peakCount; PackedCharge charge = chargeMap[pos]; - PackedCharge p(charge.unpack() / peakCount, has3x3, split); - chargeMap[pos] = p; + if (overwriteCharge) { + PackedCharge p(charge.unpack() / peakCount, has3x3, split); + chargeMap[pos] = p; + } else { + PackedCharge p(charge.unpack(), has3x3, split); + chargeMap[pos] = p; + } } GPUdi() uint8_t GPUTPCCFDeconvolution::countPeaksInner( diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h index e971a042e95a4..902e3a28fd21b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFDeconvolution.h @@ -51,7 +51,7 @@ class GPUTPCCFDeconvolution : public GPUKernelTemplate GPUd() static void Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, Args... args); private: - static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t); + static GPUd() void deconvolutionImpl(int32_t, int32_t, int32_t, int32_t, GPUSharedMemory&, const CfArray2D&, CfArray2D&, const CfChargePos*, const uint32_t, uint8_t); static GPUdi() uint8_t countPeaksInner(uint16_t, const uint8_t*, uint8_t*); static GPUdi() uint8_t countPeaksOuter(uint16_t, uint8_t, const uint8_t*); diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h index f7c2d13407b0e..980c0977aca65 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizer.h @@ -44,6 +44,7 @@ class GPUTPCNNClusterizer : public GPUProcessor bool mNnClusterizerAddIndexData = true; float mNnClassThreshold = 0.01; bool mNnSigmoidTrafoClassThreshold = 1; + bool mNnClusterizerSetDeconvolutionFlags = true; int mNnClusterizerUseCfRegression = 0; int mNnClusterizerBatchedMode = 1; int mNnClusterizerTotalClusters = 1; diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx index 90f1d6e27246f..124320396d0d4 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerHost.cxx @@ -92,6 +92,7 @@ void GPUTPCNNClusterizerHost::initClusterizer(const GPUSettingsProcessingNNclust clustererNN.mNnClusterizerBatchedMode = settings.nnClusterizerBatchedMode; clustererNN.mNnClusterizerBoundaryFillValue = settings.nnClusterizerBoundaryFillValue; clustererNN.mNnSigmoidTrafoClassThreshold = settings.nnSigmoidTrafoClassThreshold; + clustererNN.mNnClusterizerSetDeconvolutionFlags = (bool)settings.nnClusterizerSetDeconvolutionFlags; if (clustererNN.mNnSigmoidTrafoClassThreshold) { clustererNN.mNnClassThreshold = (float)std::log(settings.nnClassThreshold / (1.f - settings.nnClassThreshold)); } else { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx index 8ca61602ab4e9..58dd49630d8e6 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx @@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -111,7 +111,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); auto& clusterer = processors.tpcClusterer[sector]; @@ -126,11 +126,13 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0); if (is_row_boundary) { if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } else { int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow); int32_t pad_offset = GPUTPCNNClusterizerKernels::padOffset(row, row + r); int32_t rest_1 = transient_index % ((2 * clustererNN.mNnClusterizerSizeInputPad + 1) * (2 * clustererNN.mNnClusterizerSizeInputTime + 1)); int32_t p = CAMath::Floor(rest_1 / (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputPad + pad_offset; - int32_t t = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime; + int32_t time_pos = (rest_1 % (2 * clustererNN.mNnClusterizerSizeInputTime + 1)) - clustererNN.mNnClusterizerSizeInputTime + time; - bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (t < 0 || t >= TPC_MAX_FRAGMENT_LEN_GPU); + bool is_boundary = GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow) && (time_pos < 0 || time_pos >= TPC_MAX_FRAGMENT_LEN_GPU); if (!is_boundary) { float central_charge = static_cast(chargeMap[peak].unpack()); - CfChargePos tmp_pos(row + r, pad + p, time + t); + CfChargePos tmp_pos(row + r, pad + p, time_pos); if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(chargeMap[tmp_pos].unpack()) / central_charge); } else if (dtype == 1) { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; + clustererNN.mInputData_32[glo_idx] = static_cast(chargeMap[tmp_pos].unpack()) / central_charge; } } else { if (dtype == 0) { - clustererNN.mInputData_16[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); + clustererNN.mInputData_16[glo_idx] = (OrtDataType::Float16_t)(static_cast(clustererNN.mNnClusterizerBoundaryFillValue)); } else { - clustererNN.mInputData_32[base_idx * clustererNN.mNnClusterizerElementSize + transient_index] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); + clustererNN.mInputData_32[glo_idx] = static_cast(clustererNN.mNnClusterizerBoundaryFillValue); } } } @@ -180,7 +182,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { uint32_t glo_idx = get_global_id(0); if (dtype == 0) { @@ -191,7 +193,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread -GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t onlyMC, uint32_t batchStart) +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart) { auto& clustererNN = processors.tpcNNClusterer[sector]; uint32_t glo_idx = get_global_id(0); @@ -457,6 +459,33 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread +GPUdii() void GPUTPCNNClusterizerKernels::Thread(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint batchStart) +{ + // Implements identical publishing logic as the heuristic clusterizer and deconvolution kernel + uint32_t idx = get_global_id(0); + auto& clusterer = processors.tpcClusterer[sector]; + auto& clustererNN = processors.tpcNNClusterer[sector]; + CfArray2D chargeMap(reinterpret_cast(clusterer.mPchargeMap)); + CfChargePos peak = clusterer.mPfilteredPeakPositions[idx + batchStart]; + + for (int i = 0; i < 8; i++) { + Delta2 d = cfconsts::InnerNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + PackedCharge charge = chargeMap[tmp_pos]; + clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit()); + clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit()); + } + for (int i = 0; i < 16; i++) { + Delta2 d = cfconsts::OuterNeighbors[i]; + CfChargePos tmp_pos = peak.delta(d); + PackedCharge charge = chargeMap[tmp_pos]; + clustererNN.mClusterFlags[2 * idx] += (d.y != 0 && charge.isSplit() && !charge.has3x3Peak()); + clustererNN.mClusterFlags[2 * idx + 1] += (d.x != 0 && charge.isSplit() && !charge.has3x3Peak()); + } +} + // THe following arithmetic is done because the network is trained with a split between IROC and OROC boundary GPUd() int32_t GPUTPCNNClusterizerKernels::padOffset(int32_t row_ref, int32_t row_current) { diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h index dac2bf9554849..179eb483cdf6b 100644 --- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h +++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h @@ -65,6 +65,7 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate determineClass2Labels = 4, publishClass1Regression = 5, publishClass2Regression = 6, + publishDeconvolutionFlags = 7 }; template diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake index 08d879fbb8e9a..2a59f98a6d5b4 100644 --- a/GPU/GPUTracking/kernels.cmake +++ b/GPU/GPUTracking/kernels.cmake @@ -111,7 +111,7 @@ o2_gpu_add_kernel("GPUTPCCFChargeMapFiller, findFragmentStart" "= TPC o2_gpu_add_kernel("GPUTPCCFPeakFinder" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, noiseSuppression" "= TPCCLUSTERFINDER" LB) o2_gpu_add_kernel("GPUTPCCFNoiseSuppression, updatePeaks" "= TPCCLUSTERFINDER" LB) -o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB) +o2_gpu_add_kernel("GPUTPCCFDeconvolution" "= TPCCLUSTERFINDER" LB uint8_t overwriteCharge) o2_gpu_add_kernel("GPUTPCCFClusterizer" "= TPCCLUSTERFINDER" LB int8_t onlyMC) o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, setRowOffsets" "= TPCCLUSTERFINDER") o2_gpu_add_kernel("GPUTPCCFMCLabelFlattener, flatten" "= TPCCLUSTERFINDER" NO GPUTPCLinearLabels* out) @@ -127,13 +127,14 @@ o2_gpu_add_kernel("GPUTPCCFGather" "=" o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT " LB) o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT " LB) if(onnxruntime_FOUND) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) -o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t onlyMC uint batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass2Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) +o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishDeconvolutionFlags" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart) endif() o2_gpu_kernel_add_parameter(NEIGHBOURS_FINDER_MAX_NNEIGHUP