diff --git a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
index 0591ac8c58630..48d00b274dc9c 100644
--- a/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
+++ b/GPU/GPUTracking/Definitions/GPUDefParametersDefaults.h
@@ -488,8 +488,8 @@
 #define GPUCA_LB_GPUTPCCFNoiseSuppression_updatePeaks GPUCA_LB_GPUTPCCFNoiseSuppression
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer GPUCA_LB_GPUTPCNNClusterizerKernels
-#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN GPUCA_LB_GPUTPCNNClusterizerKernels
-#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement GPUCA_LB_GPUTPCNNClusterizerKernels
+#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels
+#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels GPUCA_LB_GPUTPCNNClusterizerKernels
 #define GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression GPUCA_LB_GPUTPCNNClusterizerKernels
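The hunk above only renames the per-kernel launch-bounds aliases; the pattern behind it is worth spelling out. A minimal sketch, assuming a hypothetical class-wide value (only the macro names are taken from the diff):

```cpp
// Hedged sketch of the launch-bounds fallback pattern in this header; the
// numeric value is hypothetical, the macro names mirror the hunk above.
#define GPUCA_LB_GPUTPCNNClusterizerKernels 128 // class-wide default (hypothetical value)
// Each kernel variant aliases the class-wide value by default, so one variant
// can later be retuned individually without touching the others.
#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU GPUCA_LB_GPUTPCNNClusterizerKernels
#define GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU GPUCA_LB_GPUTPCNNClusterizerKernels
```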
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index dd4cd6ef0be96..25bfe37f0db30 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -990,11 +990,17 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       uint batchStart = batch * clustererNNShadow.mNnClusterizerBatchedMode;
       size_t iSize = CAMath::Min((uint)clustererNNShadow.mNnClusterizerBatchedMode, (uint)(clusterer.mPmemory->counters.nClusters - batchStart));
-      // auto start0 = std::chrono::high_resolution_clock::now();
-      runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNSingleElement>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Filling the data
+      // Filling the data
+      if (mRec->IsGPU()) {
+        // Fills element by element of each input matrix -> better parallelizability, but worse on CPU due to unnecessary computations
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNGPU>({GetGrid(iSize * clustererNNShadow.mNnClusterizerElementSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
+      } else {
+        // Fills the whole input matrix at once -> better performance on CPU, but worse parallelizability
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::fillInputNNCPU>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart);
+      }
       if (clustererNNShadow.mNnClusterizerSetDeconvolutionFlags) {
-        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Filling the regression data
+        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::publishDeconvolutionFlags>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceInputDType, propagateMCLabels, batchStart); // Publishing the deconvolution flags
       }

       // NN evaluations
@@ -1042,7 +1048,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         }
       }

-      // Publishing kernels
+      // Publishing kernels for class labels and regression results
      if (nnApplication.mModelClass.getNumOutputNodes()[0][1] == 1) {
        runKernel<GPUTPCNNClusterizerKernels, GPUTPCNNClusterizerKernels::determineClass1Labels>({GetGrid(iSize, lane), krnlRunRangeNone}, iSector, clustererNNShadow.mNnInferenceOutputDType, propagateMCLabels, batchStart); // Assigning class labels
      } else {
@@ -1057,7 +1063,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       }
       if (clustererNNShadow.mNnClusterizerUseCfRegression) {
-        if(!nn_settings.nnClusterizerApplyCfDeconvolution) {
+        if(!nn_settings.nnClusterizerApplyCfDeconvolution) { // If it is already applied, don't do it twice; otherwise apply now
           runKernel<GPUTPCCFDeconvolution>({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSector}}, true);
         }
         DoDebugAndDump(RecoStep::TPCClusterFinding, GPUChainTrackingDebugFlags::TPCClustererChargeMap, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges");
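The new IsGPU() branch trades parallelism against redundant work, as the inline comments state: the GPU path launches iSize * mNnClusterizerElementSize threads, the CPU path only iSize. A hedged sketch of the two partitioning schemes; fillOneElement, fillWholeMatrix and elementSize are illustrative names, not the O2 API:

```cpp
#include <cstdint>

// GPU variant (fillInputNNGPU): the grid has iSize * elementSize threads, so
// each thread writes exactly one element of one cluster's input matrix.
void fillOneElement(uint32_t gloIdx, uint32_t elementSize)
{
  uint32_t cluster = gloIdx / elementSize; // which cluster this thread serves
  uint32_t element = gloIdx % elementSize; // which matrix element it writes
  (void)cluster;
  (void)element;
  // ... compute and store a single input value for (cluster, element)
}

// CPU variant (fillInputNNCPU): the grid has iSize threads, one per cluster;
// nested row/pad/time loops fill the whole matrix in one pass, avoiding the
// per-element index arithmetic that is wasted work on a CPU.
void fillWholeMatrix(uint32_t gloIdx)
{
  uint32_t cluster = gloIdx; // one thread == one cluster
  (void)cluster;
  // ... loop over the full row/pad/time window and store every input value
}
```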
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
index bc8d26954b5dc..8cdc0684ad588 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.cxx
@@ -51,7 +51,7 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::runCfClusterizer>
 }

 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNCPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
@@ -65,16 +65,14 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>
   float central_charge = static_cast<float>(chargeMap[peak].unpack());
   int32_t row_offset = GPUTPCNNClusterizerKernels::rowOffset(row, clustererNN.mNnClusterizerSizeInputRow);
-#ifndef GPUCA_GPUCODE
-  GPUCA_UNROLL(U(), U());
-#endif
   for (int32_t r = -clustererNN.mNnClusterizerSizeInputRow; r <= clustererNN.mNnClusterizerSizeInputRow; r++) {
     bool is_row_boundary = ((row + r) > (o2::tpc::constants::MAXGLOBALPADROW - 1)) || ((row + r) < 0);
     int32_t pad_offset = is_row_boundary ? 0 : GPUTPCNNClusterizerKernels::padOffset(row, row + r);
     for (int32_t p = -clustererNN.mNnClusterizerSizeInputPad + pad_offset; p <= clustererNN.mNnClusterizerSizeInputPad + pad_offset; p++) {
       bool is_boundary = is_row_boundary || GPUTPCNNClusterizerKernels::isBoundary(row + r + row_offset, pad + p, clustererNN.mNnClusterizerSizeInputRow);
       for (int32_t t = -clustererNN.mNnClusterizerSizeInputTime; t <= clustererNN.mNnClusterizerSizeInputTime; t++) {
-        if (!is_boundary) {
+        int32_t time_pos = time + t;
+        if (!is_boundary && (time_pos >= 0) && (time_pos < TPC_MAX_FRAGMENT_LEN_GPU)) {
           CfChargePos tmp_pos(row + r, pad + p, time + t);
           if (r == 0 && !clustererNN.mClusterFlags[2 * glo_idx] && CAMath::Abs(p) < 3 && CAMath::Abs(t) < 3 && p != 0 && t != 0) { // ordering is done for short circuit optimization
             clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
@@ -108,10 +106,20 @@ GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNN>
       ... = static_cast<float>(pad) / GPUTPCGeometry::NPads(row);
     }
   }
+  if (!clustererNN.mNnClusterizerSetDeconvolutionFlags) {
+    clustererNN.mClusterFlags[2 * glo_idx] = 0;
+    clustererNN.mClusterFlags[2 * glo_idx + 1] = 0;
+    for (uint16_t i = 0; i < 8; i++) {
+      Delta2 d = cfconsts::InnerNeighbors[i];
+      CfChargePos tmp_pos = peak.delta(d);
+      clustererNN.mClusterFlags[2 * glo_idx] += CfUtils::isPeak(isPeakMap[tmp_pos]);
+    }
+    clustererNN.mClusterFlags[2 * glo_idx + 1] = clustererNN.mClusterFlags[2 * glo_idx];
+  }
 }

 template <>
-GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNSingleElement>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
+GPUdii() void GPUTPCNNClusterizerKernels::Thread<GPUTPCNNClusterizerKernels::fillInputNNGPU>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& processors, uint8_t sector, int8_t dtype, int8_t withMC, uint32_t batchStart)
 {
   uint32_t glo_idx = get_global_id(0);
   auto& clusterer = processors.tpcClusterer[sector];
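Besides the rename, this file gains two behavioural fixes: the time loop is now clamped to [0, TPC_MAX_FRAGMENT_LEN_GPU) so clusters at a fragment edge no longer index the charge map out of bounds, and the cluster flags get a fallback when the deconvolution-flag kernel is not run. A self-contained sketch of that fallback logic, with stand-in types (the real ones are CfChargePos, Delta2, cfconsts::InnerNeighbors and CfUtils::isPeak):

```cpp
#include <cstdint>

// Stand-in position type; the cluster finder works per row in the (pad, time) plane.
struct Pos { int row, pad, time; };

// Stand-in for cfconsts::InnerNeighbors: (pad, time) offsets of the 8 bins
// directly surrounding a peak.
static const int kNeighbors[8][2] = {{-1, -1}, {-1, 0}, {-1, 1}, {0, -1},
                                     {0, 1},   {1, -1}, {1, 0},  {1, 1}};

// Fallback flag computation from the hunk above: count how many of the 8
// direct neighbours of the peak are themselves peaks; both per-cluster flag
// slots then receive the same count.
uint8_t neighbourPeakCount(const Pos& peak, bool (*isPeakAt)(Pos))
{
  uint8_t count = 0;
  for (int i = 0; i < 8; i++) {
    Pos p{peak.row, peak.pad + kNeighbors[i][0], peak.time + kNeighbors[i][1]};
    count += isPeakAt(p); // bool converts to 0/1
  }
  return count;
}
```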
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
index 179eb483cdf6b..a3858d47eb99b 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCNNClusterizerKernels.h
@@ -38,8 +38,8 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
 {
  public:
   // Must all have same number of threads, since they use a common SCRATCH_PAD_WORK_GROUP_SIZE below
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNN) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
-  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNSingleElement) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNCPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
+  static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_fillInputNNGPU) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
   static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass1Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
   static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_determineClass2Labels) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
   static_assert(GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_publishClass1Regression) == GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCNNClusterizerKernels_runCfClusterizer));
@@ -59,8 +59,8 @@ class GPUTPCNNClusterizerKernels : public GPUKernelTemplate
   enum K : int32_t {
     runCfClusterizer = 0,
-    fillInputNN = 1,
-    fillInputNNSingleElement = 2,
+    fillInputNNCPU = 1,
+    fillInputNNGPU = 2,
     determineClass1Labels = 3,
     determineClass2Labels = 4,
     publishClass1Regression = 5,
diff --git a/GPU/GPUTracking/kernels.cmake b/GPU/GPUTracking/kernels.cmake
index 52cda3e8ff416..7ebe631d86e92 100644
--- a/GPU/GPUTracking/kernels.cmake
+++ b/GPU/GPUTracking/kernels.cmake
@@ -127,8 +127,8 @@
 o2_gpu_add_kernel("GPUTrackingRefitKernel, mode0asGPU" "= GLOBALREFIT" LB)
 o2_gpu_add_kernel("GPUTrackingRefitKernel, mode1asTrackParCov" "= GLOBALREFIT" LB)
 if(onnxruntime_FOUND)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, runCfClusterizer" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNN" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
-o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNSingleElement" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNCPU" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
+o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, fillInputNNGPU" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass1Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, determineClass2Labels" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
 o2_gpu_add_kernel("GPUTPCNNClusterizerKernels, publishClass1Regression" "= TPCNNCLUSTERFINDER" LB uint8_t sector int8_t dtype int8_t withMC uint32_t batchStart)
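The rename has to stay consistent in four places: the enum K entries, the GPUCA_LB_* macros in GPUDefParametersDefaults.h, the static_asserts above, and the o2_gpu_add_kernel registrations. A minimal stand-in sketch of what the static_asserts enforce; names and the numeric value are hypothetical, only the intent (a shared scratch pad requires one common work-group size) comes from the header comment:

```cpp
// Stand-in sketch: the real check applies GPUCA_GET_THREAD_COUNT to the
// per-kernel LB macros. Renaming a kernel without updating its LB macro is
// caught at compile time, not at runtime.
#define LB_CLASS 128                 // hypothetical class-wide thread count
#define LB_runCfClusterizer LB_CLASS
#define LB_fillInputNNGPU LB_CLASS   // renamed entry must keep the same count

static_assert(LB_fillInputNNGPU == LB_runCfClusterizer,
              "all NN clusterizer kernels must share one work-group size");
```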