From 8ae3dccf464f27241c8ac6331b41839aafa1119b Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Thu, 20 Feb 2025 10:27:08 +0100
Subject: [PATCH] GPU: Implement parallel memset for host code

---
Review notes (not part of the commit message):
- std::min / std::max in the added code carry explicit template arguments,
  since their operands mix uint64_t and int and cannot be deduced.
- The per-thread size computation is guarded against unsigned wrap-around.
- The commit id and the post-image blob id on the index line were not
  regenerated after these edits.

 GPU/GPUTracking/Base/GPUReconstruction.cxx    |  2 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.cxx | 26 ++++++++++++++++---
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index 270f092a1fd29..e3522d2d7242d 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -246,7 +246,7 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
   }
   if (mProcessingSettings.deterministicGPUReconstruction) {
 #ifndef GPUCA_NO_FAST_MATH
-    GPUError("Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH, otherwise results will never be deterministic!");
+    GPUError("Warning, deterministicGPUReconstruction needs GPUCA_NO_FAST_MATH for being fully deterministic, without only most indeterminism by concurrency is removed, but floating point effects remain!");
 #endif
     mProcessingSettings.overrideClusterizerFragmentLen = TPC_MAX_FRAGMENT_LEN_GPU;
     param().rec.tpc.nWaysOuter = true;
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 1365429245fdc..187792b3ba2e7 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -111,10 +111,28 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS
 template <>
 inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size)
 {
-  int32_t ompThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNOMPThreads()));
-  if (ompThreads > 1) {
-    memset(ptr, 0, size);
-  } else {
+#ifdef WITH_OPENMP
+  // Request one thread per 16 MiB of buffer, capped by getNOMPThreads() and never fewer than one.
+  // Explicit template arguments: the operands mix uint64_t (size) and int, which std::min / std::max cannot deduce.
+  int32_t nOMPThreads = std::max<int32_t>(1, std::min<int32_t>(size / (16 * 1024 * 1024), getNOMPThreads()));
+  if (nOMPThreads > 1) {
+    GPUCA_OPENMP(parallel num_threads(nOMPThreads))
+    {
+      // Each thread clears one contiguous slice whose length is rounded up to a multiple of 4096 bytes.
+      size_t threadSize = size / omp_get_num_threads();
+      if (threadSize % 4096) {
+        threadSize += 4096 - threadSize % 4096;
+      }
+      size_t offset = threadSize * omp_get_thread_num();
+      // Rounding up can move a slice start to or past the end of the buffer; guard so that size - offset cannot wrap around.
+      size_t mySize = offset < size ? std::min<size_t>(threadSize, size - offset) : 0;
+      if (mySize) {
+        memset((char*)ptr + offset, 0, mySize);
+      }
+    }
+  } else
+#endif
+  {
     memset(ptr, 0, size);
   }
   return 0;