From b21fbd87899ba4a2571a462772e97647d210c736 Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Fri, 21 Feb 2025 15:06:02 +0100
Subject: [PATCH] GPU: Replace OpenMP parallelization with TBB

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx    |  79 ++--
 GPU/GPUTracking/Base/GPUReconstruction.h      |  19 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.cxx |  93 ++---
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  18 +-
 .../Base/GPUReconstructionConvert.cxx         | 122 +++---
 .../Base/GPUReconstructionLibrary.cxx         |   4 -
 .../Base/GPUReconstructionThreading.h         |  60 +++
 GPU/GPUTracking/Base/cuda/CMakeLists.txt      |   7 +-
 .../Base/cuda/GPUReconstructionCUDA.cu        |   4 +-
 .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx |  13 +-
 GPU/GPUTracking/Base/hip/CMakeLists.txt       |   6 +-
 .../Base/opencl/GPUReconstructionOCL.cxx      |   4 +-
 GPU/GPUTracking/CMakeLists.txt                |   8 +-
 .../GPUTPCClusterStatistics.cxx               |   4 +-
 .../TPCClusterDecompressor.cxx                |  34 +-
 ...andalone-cluster-dump-entropy-analysed.cxx |   4 +-
 GPU/GPUTracking/Definitions/GPUDefMacros.h    |   6 -
 GPU/GPUTracking/Definitions/GPUSettingsList.h |   7 +-
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |  12 +-
 .../Global/GPUChainTrackingClusterizer.cxx    | 395 +++++++++---------
 .../Global/GPUChainTrackingCompression.cxx    |   2 +-
 .../Global/GPUChainTrackingSliceTracker.cxx   | 194 ++++-----
 GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx  |  14 +-
 GPU/GPUTracking/SliceTracker/GPUTPCDef.h      |   2 +-
 .../SliceTracker/GPUTPCSliceData.cxx          |   4 +-
 .../SliceTracker/GPUTPCTracker.cxx            |   2 +-
 .../Standalone/Benchmark/standalone.cxx       |   8 +-
 GPU/GPUTracking/Standalone/CMakeLists.txt     |  17 +-
 GPU/GPUTracking/Standalone/cmake/config.cmake |   1 -
 GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx |  11 +-
 GPU/GPUTracking/TRDTracking/GPUTRDTracker.h   |   2 +-
 .../TRDTracking/GPUTRDTrackerKernels.cxx      |  10 +-
 GPU/GPUTracking/display/CMakeLists.txt        |   5 +-
 GPU/GPUTracking/display/GPUDisplay.cxx        |   3 -
 GPU/GPUTracking/display/GPUDisplay.h          |   2 +-
 .../display/helpers/GPUDisplayHelpers.cxx     |  13 +-
 .../display/render/GPUDisplayDraw.cxx         | 110 ++---
 .../display/render/GPUDisplayImportEvent.cxx  | 178 ++++----
 .../display/shaders/GPUDisplayShaders.h       |   2 +-
 GPU/GPUTracking/qa/GPUQA.cxx                  |  80 ++--
 40 files changed, 789 insertions(+), 770 deletions(-)
 create mode 100644 GPU/GPUTracking/Base/GPUReconstructionThreading.h

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index e3522d2d7242d..481494f268494 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -23,12 +23,9 @@
 #include 
 #include 
 
-#ifdef WITH_OPENMP
-#include <omp.h>
-#endif
-
 #include "GPUReconstruction.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUROOTDumpCore.h"
 #include "GPUConfigDump.h"
 #include "GPUChainTracking.h"
@@ -121,17 +118,18 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr* tr
   }
 }
 
-int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
+void GPUReconstruction::SetNActiveThreads(int32_t n)
 {
-#ifdef WITH_OPENMP
-  omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
+  mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
+  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
+    GPUInfo("Set number of active parallel kernel threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
   }
-  return n > mMaxOMPThreads;
-#else
-  return 1;
-#endif
+}
+
+int32_t GPUReconstruction::getHostThreadIndex()
+{
+  return std::max(0, tbb::this_task_arena::current_thread_index());
+}
 
 int32_t GPUReconstruction::Init()
@@ -197,6 +195,24 @@ int32_t GPUReconstruction::Init()
   return 0;
 }
 
+namespace o2::gpu::internal
+{
+static uint32_t getDefaultNThreads()
+{
+  const char* tbbEnv = getenv("TBB_NUM_THREADS");
+  uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
+  if (tbbNum) {
+    return tbbNum;
+  }
+  const char* ompEnv = getenv("OMP_NUM_THREADS");
+  uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
+  if (ompNum) {
+    return ompNum;
+  }
+  return tbb::info::default_concurrency();
+}
+} // namespace o2::gpu::internal
+
 int32_t GPUReconstruction::InitPhaseBeforeDevice()
 {
   if (mProcessingSettings.printSettings) {
@@ -299,32 +315,37 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
     mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
   }
 
-#ifdef WITH_OPENMP
-  if (mProcessingSettings.ompThreads <= 0) {
-    mProcessingSettings.ompThreads = omp_get_max_threads();
-  } else {
-    mProcessingSettings.ompAutoNThreads = false;
-    omp_set_num_threads(mProcessingSettings.ompThreads);
+  if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) {
+    GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
+  } else if (mProcessingSettings.ompThreads != -1) {
+    mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads;
+    GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
   }
-  if (mProcessingSettings.ompKernels) {
-    if (omp_get_max_active_levels() < 2) {
-      omp_set_max_active_levels(2);
-    }
+
+  if (mProcessingSettings.nHostThreads <= 0) {
+    mProcessingSettings.nHostThreads = internal::getDefaultNThreads();
+  } else {
+    mProcessingSettings.autoAdjustHostThreads = false;
+  }
+  mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
+  if (mMaster == nullptr) {
+    mThreading = std::make_shared<GPUReconstructionThreading>();
+    mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
+    mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
+    mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
+  } else {
+    mThreading = mMaster->mThreading;
   }
-#else
-  mProcessingSettings.ompThreads = 1;
-#endif
-  mMaxOMPThreads = mProcessingSettings.ompThreads;
-  mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
+  mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads);
   if (IsGPU()) {
     mNStreams = std::max(mProcessingSettings.nStreams, 3);
   }
 
   if (mProcessingSettings.nTPCClustererLanes == -1) {
-    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSLICES, mProcessingSettings.ompKernels ? (mProcessingSettings.ompThreads >= 4 ? std::min(mProcessingSettings.ompThreads / 2, mProcessingSettings.ompThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mProcessingSettings.ompThreads));
+    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSLICES, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mMaxHostThreads));
   }
   if (mProcessingSettings.overrideClusterizerFragmentLen == -1) {
-    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mProcessingSettings.ompThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
+    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
   }
   if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSLICES) {
    GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes);
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 6fd00e1fda207..1fdfabb11211a 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -51,6 +51,7 @@ namespace gpu
 class GPUChain;
 struct GPUMemorySizeScalers;
 struct GPUReconstructionPipelineContext;
+struct GPUReconstructionThreading;
 class GPUROOTDumpCore;
 
 namespace gpu_reconstruction_kernels
@@ -206,8 +207,8 @@ class GPUReconstruction
   void SetOutputControl(void* ptr, size_t size);
   void SetInputControl(void* ptr, size_t size);
   GPUOutputControl& OutputControl() { return mOutputControl; }
-  int32_t GetMaxThreads() const { return mMaxThreads; }
-  int32_t SetNOMPThreads(int32_t n);
+  int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
+  void SetNActiveThreads(int32_t n);
   int32_t NStreams() const { return mNStreams; }
 
   const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
@@ -234,6 +235,9 @@ class GPUReconstruction
   double GetStatKernelTime() { return mStatKernelTime; }
   double GetStatWallTime() { return mStatWallTime; }
 
+  std::shared_ptr<GPUReconstructionThreading> mThreading;
+  static int32_t getHostThreadIndex();
+
 protected:
   void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
   void FreeRegisteredMemory(GPUMemoryResource* res);
@@ -343,11 +347,12 @@ class GPUReconstruction
   std::shared_ptr mROOTDump;
   std::vector>* mOutputErrorCodes = nullptr;
 
-  int32_t mMaxThreads = 0;    // Maximum number of threads that may be running, on CPU or GPU
-  int32_t mThreadId = -1;     // Thread ID that is valid for the local CUDA context
-  int32_t mGPUStuck = 0;      // Marks that the GPU is stuck, skip future events
-  int32_t mNStreams = 1;      // Number of parallel GPU streams
-  int32_t mMaxOMPThreads = 0; // Maximum number of OMP threads
+  int32_t mMaxBackendThreads = 0;       // Maximum number of threads that may be running, on CPU or GPU
+  int32_t mThreadId = -1;               // Thread ID that is valid for the local CUDA context
+  int32_t mGPUStuck = 0;                // Marks that the GPU is stuck, skip future events
+  int32_t mNStreams = 1;                // Number of parallel GPU streams
+  int32_t mMaxHostThreads = 0;          // Maximum number of host threads
+  int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
 
   // Management for GPUProcessors
   struct ProcessorData {
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 187792b3ba2e7..b5f9d591fd9a6 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -14,6 +14,7 @@
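For reference, the threading model set up above replaces omp_set_num_threads() with three TBB objects: a tbb::global_control capping the total concurrency of the process, a task_arena spanning all host threads, and a smaller task_arena for the currently active kernel threads. The following minimal standalone sketch shows this pattern; it is not part of the patch, and maxThreads/activeThreads merely stand in for mMaxHostThreads/mActiveHostKernelThreads:

#include <tbb/global_control.h>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>
#include <cstdio>

int main()
{
  const int maxThreads = 8;    // placeholder for mMaxHostThreads
  const int activeThreads = 4; // placeholder for mActiveHostKernelThreads
  // Process-wide cap: no TBB algorithm may use more than maxThreads threads.
  tbb::global_control control(tbb::global_control::max_allowed_parallelism, maxThreads);
  // Arena restricting one particular parallel region to the active subset.
  tbb::task_arena activeArena(activeThreads);
  activeArena.execute([&] {
    tbb::parallel_for(0, 16, [](int i) { std::printf("item %d\n", i); });
  });
  return 0;
}

Unlike omp_set_num_threads(), which mutates global state, shrinking or growing the active set here only requires recreating the activeThreads arena, while the global cap stays fixed.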
#include "GPUReconstructionCPU.h" #include "GPUReconstructionIncludes.h" +#include "GPUReconstructionThreading.h" #include "GPUChain.h" #include "GPUTPCClusterData.h" @@ -40,13 +41,6 @@ #include #endif -#if defined(WITH_OPENMP) || defined(_OPENMP) -#include -#else -static inline int32_t omp_get_thread_num() { return 0; } -static inline int32_t omp_get_max_threads() { return 1; } -#endif - using namespace o2::gpu; using namespace o2::gpu::gpu_reconstruction_kernels; @@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU() Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor } -int32_t GPUReconstructionCPUBackend::getNOMPThreads() +int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores) { - int32_t ompThreads = 0; - if (mProcessingSettings.ompKernels == 2) { - ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor; - if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) { - ompThreads++; + int32_t nThreads = 0; + if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) { + if (splitCores) { + nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop; + nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop; + } else { + nThreads = mMaxHostThreads; } - ompThreads = std::max(1, ompThreads); + nThreads = std::max(1, nThreads); } else { - ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1; + nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1; } - return ompThreads; + return nThreads; } template @@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS } uint32_t num = y.num == 0 || y.num == -1 ? 
1 : y.num; for (uint32_t k = 0; k < num; k++) { - int32_t ompThreads = getNOMPThreads(); - if (ompThreads > 1) { + int32_t nThreads = getNKernelHostThreads(false); + if (nThreads > 1) { if (mProcessingSettings.debugLevel >= 5) { - printf("Running %d ompThreads\n", ompThreads); - } - GPUCA_OPENMP(parallel for num_threads(ompThreads)) - for (uint32_t iB = 0; iB < x.nBlocks; iB++) { - typename T::GPUSharedMemory smem; - T::template Thread(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...); + printf("Running %d Threads\n", nThreads); } + mThreading->activeThreads->execute([&] { + tbb::parallel_for(tbb::blocked_range(0, x.nBlocks, 1), [&](const tbb::blocked_range& r) { + typename T::GPUSharedMemory smem; + for (uint32_t iB = r.begin(); iB < r.end(); iB++) { + T::template Thread(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...); + } + }); + }); } else { for (uint32_t iB = 0; iB < x.nBlocks; iB++) { typename T::GPUSharedMemory smem; @@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS template <> inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { -#ifdef WITH_OPENMP - int32_t nOMPThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNOMPThreads())); - if (nOMPThreads > 1) { - GPUCA_OPENMP(parallel num_threads(nOMPThreads)) - { - size_t threadSize = size / omp_get_num_threads(); + int32_t nnThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); + if (nnThreads > 1) { + tbb::parallel_for(0, nnThreads, [&](int iThread) { + size_t threadSize = size / nnThreads; if (threadSize % 4096) { threadSize += 4096 - threadSize % 4096; } - size_t offset = threadSize * omp_get_thread_num(); + size_t offset = threadSize * iThread; size_t mySize = std::min(threadSize, size - offset); if (mySize) { memset((char*)ptr + offset, 0, mySize); - } - } - } else -#endif - { + } // clang-format off + }, tbb::static_partitioner()); // clang-format on + } else { memset(ptr, 0, size); } return 0; @@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice() mHostMemoryPermanent = mHostMemoryBase; ClearAllocatedMemory(); } - if (mProcessingSettings.ompKernels) { - mBlockCount = getOMPMaxThreads(); + if (mProcessingSettings.inKernelParallel) { + mBlockCount = mMaxHostThreads; } mThreadId = GetThread(); mProcShadow.mProcessorsProc = processors(); @@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes() } } -int32_t GPUReconstructionCPUBackend::getOMPThreadNum() -{ - return omp_get_thread_num(); -} - -int32_t GPUReconstructionCPUBackend::getOMPMaxThreads() -{ - return omp_get_max_threads(); -} - static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step) @@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId() return id.fetch_add(1); } -uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max) +uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) { - if (condition && mProcessingSettings.ompKernels != 1) { - mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? 
std::min(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads; + if (condition && mProcessingSettings.inKernelParallel != 1) { + mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; } else { - mNestedLoopOmpFactor = 1; + mNActiveThreadsOuterLoop = 1; } if (mProcessingSettings.debugLevel >= 5) { - printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor); + printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); } - return mNestedLoopOmpFactor; + return mNActiveThreadsOuterLoop; } void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 7903be44907df..f82f481df6a63 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -43,10 +43,8 @@ class GPUReconstructionCPUBackend : public GPUReconstruction int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args); template gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); - uint32_t mNestedLoopOmpFactor = 1; - static int32_t getOMPThreadNum(); - static int32_t getOMPMaxThreads(); - int32_t getNOMPThreads(); + uint32_t mNActiveThreadsOuterLoop = 1; + int32_t getNKernelHostThreads(bool splitCores); }; class GPUReconstructionCPU : public GPUReconstructionKernels @@ -81,8 +79,8 @@ class GPUReconstructionCPU : public GPUReconstructionKernels= 1) { - t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getOMPThreadNum() : stream); - if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNestedLoopOmpFactor < 2 || getOMPThreadNum() == 0)) { + t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? 
getHostThreadIndex() : stream); + if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { t->Start(); } } @@ -287,11 +285,11 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num) static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nStreams}); + int32_t max = std::max({mMaxHostThreads, mProcessingSettings.nStreams}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { - num = getOMPThreadNum(); + num = getHostThreadIndex(); } if (num < 0 || num >= timer->num) { throw std::runtime_error("Invalid timer requested"); diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index ca1c46766b9da..629d23075d9bc 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -40,6 +40,8 @@ #include "TPCBase/CRU.h" #include "DetectorsRaw/RDHUtils.h" +#include + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -1306,6 +1308,17 @@ size_t zsEncoderRun::compare(std::vector* buffer, std::vector void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr* outBuffer, uint32_t* outSizes, o2::raw::RawFileWriter* raw, const o2::InteractionRecord* ir, const GPUParam& param, int32_t version, bool verify, float threshold, bool padding, std::function&)> digitsFilter) { @@ -1316,67 +1329,68 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS]; - uint32_t totalPages = 0; - size_t totalSize = 0; - size_t nErrors = 0; - size_t digitsInput = 0; - size_t digitsEncoded = 0; - // clang-format off - GPUCA_OPENMP(parallel for reduction(+ : totalPages, nErrors, totalSize, digitsInput, digitsEncoded)) - // clang-format on - for (uint32_t i = 0; i < NSLICES; i++) { - std::vector tmpBuffer; - digitsInput += ZSEncoderGetNDigits(in, i); - tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); - if (threshold > 0.f && !digitsFilter) { - auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); - tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); - } else { - std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); - } - - if (digitsFilter) { - digitsFilter(tmpBuffer); - if (threshold > 0.f) { - std::vector tmpBuffer2 = std::move(tmpBuffer); - tmpBuffer = std::vector(tmpBuffer2.size()); - auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + auto reduced = tbb::parallel_reduce(tbb::blocked_range(0, NSLICES), o2::gpu::internal::tmpReductionResult(), [&](const auto range, auto red) { + for (uint32_t i = range.begin(); i < range.end(); i++) { + std::vector tmpBuffer; + red.digitsInput += ZSEncoderGetNDigits(in, i); + tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); + if (threshold > 0.f && !digitsFilter) { + auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } else { + 
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); } - } - digitsEncoded += tmpBuffer.size(); - - auto runZS = [&](auto& encoder) { - encoder.zsVersion = version; - encoder.init(); - totalPages += encoder.run(buffer[i], tmpBuffer, &totalSize); - if (verify) { - nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + + if (digitsFilter) { + digitsFilter(tmpBuffer); + if (threshold > 0.f) { + std::vector tmpBuffer2 = std::move(tmpBuffer); + tmpBuffer = std::vector(tmpBuffer2.size()); + auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } } - }; + red.digitsEncoded += tmpBuffer.size(); + + auto runZS = [&](auto& encoder) { + encoder.zsVersion = version; + encoder.init(); + red.totalPages += encoder.run(buffer[i], tmpBuffer, &red.totalSize); + if (verify) { + red.nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + } + }; - if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { - zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; - runZS(enc); - } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { -#ifdef GPUCA_O2_LIB - if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { + zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; runZS(enc); - } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; - runZS(enc); - } + } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { +#ifdef GPUCA_O2_LIB + if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } #else - throw std::runtime_error("Link based ZS encoding not supported in standalone build"); + throw std::runtime_error("Link based ZS encoding not supported in standalone build"); #endif - } else { - throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } else { + throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } } - } + return red; }, [&](const auto& red1, const auto& red2) { + auto red = red1; + red.totalPages += red2.totalPages; + red.totalSize += red2.totalSize; + red.nErrors += red2.nErrors; + red.digitsInput += red2.digitsInput; + red.digitsEncoded += red2.digitsEncoded; + return red; }); if (outBuffer) { - outBuffer->reset(new uint64_t[totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]); + outBuffer->reset(new uint64_t[reduced.totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]); uint64_t offset = 0; for (uint32_t i = 0; i < NSLICES; i++) { for (uint32_t j = 0; j < 
GPUTrackingInOutZS::NENDPOINTS; j++) { @@ -1386,12 +1400,12 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr #endif -#ifdef WITH_OPENMP -#include -#endif - #include "GPUReconstruction.h" #include "GPUReconstructionAvailableBackends.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionThreading.h b/GPU/GPUTracking/Base/GPUReconstructionThreading.h new file mode 100644 index 0000000000000..374c7545e65da --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionThreading.h @@ -0,0 +1,60 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionThreading.h +/// \author David Rohr + +#if !defined(GPURECONSTRUCTIONTHREADING_H) +#define GPURECONSTRUCTIONTHREADING_H + +#if !defined(GPUCA_GPUCODE) +#include "GPUReconstruction.h" + +#include +#include + +namespace o2::gpu +{ + +struct GPUReconstructionThreading { + std::unique_ptr control; + std::unique_ptr allThreads; + std::unique_ptr activeThreads; + std::unique_ptr outerThreads; +}; + +} // namespace o2::gpu + +#endif + +#define GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + for (vartype varname = get_global_id(0); varname < iEnd; varname += get_global_size(0)) { \ + code \ + } + +#ifdef GPUCA_GPUCODE +#define GPUCA_TBB_KERNEL_LOOP GPUCA_TBB_KERNEL_LOOP_HOST +#else +#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, code) \ + if (!rec.GetProcessingSettings().inKernelParallel) { \ + rec.mThreading->activeThreads->execute([&] { \ + tbb::parallel_for(tbb::blocked_range(get_global_id(0), iEnd, get_global_size(0)), [&](const tbb::blocked_range& _r_internal) { \ + for (vartype varname = _r_internal.begin(); varname < _r_internal.end(); varname += get_global_size(0)) { \ + code \ + } \ + }); \ + }); \ + } else { \ + GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + } +#endif + +#endif diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index bab5ff912c575..5bc1e6e4e6783 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -136,12 +136,7 @@ set_target_cuda_arch(${targetName}) #target_link_options(${targetName} PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld") #set_target_properties(${targetName} PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp") -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_CUDA_COMPILE_MODE) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 26cbc282b6fc2..20ce23b578d84 100644 --- 
a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -265,7 +265,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() throw std::runtime_error("Invalid warp size on GPU"); } mBlockCount = deviceProp.multiProcessorCount; - mMaxThreads = std::max(mMaxThreads, deviceProp.maxThreadsPerBlock * mBlockCount); + mMaxBackendThreads = std::max(mMaxBackendThreads, deviceProp.maxThreadsPerBlock * mBlockCount); #ifndef __HIPCC__ // CUDA mWarpSize = 32; #else // HIP @@ -409,7 +409,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() mDeviceId = master->mDeviceId; mBlockCount = master->mBlockCount; mWarpSize = master->mWarpSize; - mMaxThreads = master->mMaxThreads; + mMaxBackendThreads = master->mMaxBackendThreads; mDeviceName = master->mDeviceName; mDeviceConstantMem = master->mDeviceConstantMem; mDeviceConstantMemList.resize(master->mDeviceConstantMemList.size()); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 62ad57ae3497a..3bd3afc0ffc23 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -13,9 +13,6 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#ifdef WITH_OPENMP -#include -#endif #include "GPUReconstructionCUDA.h" #include "GPUParamRTC.h" #include "GPUDefMacros.h" @@ -25,6 +22,7 @@ #include #include +#include using namespace o2::gpu; #include "utils/qGetLdBinarySymbols.h" @@ -153,10 +151,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } HighResTimer rtcTimer; rtcTimer.ResetStart(); -#ifdef WITH_OPENMP -#pragma omp parallel for schedule(dynamic, 1) -#endif - for (uint32_t i = 0; i < nCompile; i++) { + tbb::parallel_for(0, nCompile, [&](auto i) { if (mProcessingSettings.debugLevel >= 3) { printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); } @@ -190,8 +185,8 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) printf("Source code file: %s", filename.c_str()); } throw std::runtime_error("Error during CUDA compilation"); - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (mProcessingSettings.debugLevel >= 0) { GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime()); } diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index d34777f0bef3e..10fbfa8d21ddf 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -189,11 +189,7 @@ endif() target_link_libraries(${targetName} PRIVATE hip::host hip::device hip::hipcub roc::rocthrust) set_target_hip_arch(${targetName}) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${MODULE}_CXX PRIVATE WITH_OPENMP) - target_link_libraries(${MODULE}_CXX PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${MODULE}_CXX PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_HIP_COMPILE_MODE) diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index ed985e31ab1b0..30a8fc193774b 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -267,7 +267,7 @@ int32_t 
GPUReconstructionOCLBackend::InitDevice_Runtime() mDeviceName += " (OpenCL)"; mBlockCount = shaders; mWarpSize = 32; - mMaxThreads = std::max(mMaxThreads, maxWorkGroup * mBlockCount); + mMaxBackendThreads = std::max(mMaxBackendThreads, maxWorkGroup * mBlockCount); mInternals->context = clCreateContext(nullptr, ContextForAllPlatforms() ? count : 1, ContextForAllPlatforms() ? mInternals->devices.get() : &mInternals->device, nullptr, nullptr, &ocl_error); if (GPUFailedMsgI(ocl_error)) { @@ -380,7 +380,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() GPUReconstructionOCL* master = dynamic_cast(mMaster); mBlockCount = master->mBlockCount; mWarpSize = master->mWarpSize; - mMaxThreads = master->mMaxThreads; + mMaxBackendThreads = master->mMaxBackendThreads; mDeviceName = master->mDeviceName; mDeviceConstantMem = master->mDeviceConstantMem; mInternals = master->mInternals; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index a5d335931af37..3e738fb6df5cb 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -105,6 +105,7 @@ set(HDRS_INSTALL Base/GPUParam.inc Base/GPUParamRTC.h Base/GPUReconstructionIncludes.h + Base/GPUReconstructionThreading.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h Base/GPUReconstructionKernels.h @@ -378,12 +379,7 @@ if(GPUCA_QA) target_compile_definitions(${targetName} PRIVATE GPUCA_BUILD_QA) endif() -if(OpenMP_CXX_FOUND) - message(STATUS "GPU: Using OpenMP: ${OpenMP_CXX_SPEC_DATE}") - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) target_compile_options(${targetName} PRIVATE -Wno-instantiation-after-specialization) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx index e8f8de7658b28..794f4cb485f14 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx @@ -69,7 +69,7 @@ INode* BuildTree(const double* frequencies, uint32_t UniqueSymbols) { std::priority_queue, NodeCmp> trees; - for (uint32_t i = 0; i < UniqueSymbols; ++i) { + for (uint32_t i = 0; i < UniqueSymbols; i++) { if (frequencies[i] != 0) { trees.push(new LeafNode(frequencies[i], i)); } @@ -256,7 +256,7 @@ float GPUTPCClusterStatistics::Analyze(std::vector& p, const char* name GenerateCodes(root, HuffCode(), codes); delete root; - for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); ++it) { + for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); it++) { huffmanSize += it->second.size() * prob[it->first]; } diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index 22641774cd9ee..e3b8965c3e27b 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -22,6 +22,8 @@ #include #include "TPCClusterDecompressionCore.inc" +#include + using namespace o2::gpu; using namespace o2::tpc; @@ -51,23 +53,24 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom for (uint32_t i = 0; i < NSLICES * GPUCA_ROW_COUNT; i++) { (&locks[0][0])[i].clear(); } - uint32_t offset = 0, lasti = 0; const uint32_t maxTime 
= param.continuousMaxTimeBin > 0 ? ((param.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1) : TPC_MAX_TIME_BIN_TRIGGERED; - GPUCA_OPENMP(parallel for firstprivate(offset, lasti)) - for (uint32_t i = 0; i < clustersCompressed->nTracks; i++) { - if (i < lasti) { - offset = lasti = 0; // dynamic OMP scheduling, need to reinitialize offset - } - while (lasti < i) { - offset += clustersCompressed->nTrackClusters[lasti++]; + tbb::parallel_for(tbb::blocked_range(0, clustersCompressed->nTracks), [&](const tbb::blocked_range& range) { + uint32_t offset = 0, lasti = 0; + for (uint32_t i = range.begin(); i < range.end(); i++) { + if (i < lasti) { + offset = lasti = 0; // dynamic scheduling order, need to reinitialize offset + } + while (lasti < i) { + offset += clustersCompressed->nTrackClusters[lasti++]; + } + lasti++; + TPCClusterDecompressionCore::decompressTrack(*clustersCompressed, param, maxTime, i, offset, clusters, locks); } - lasti++; - TPCClusterDecompressionCore::decompressTrack(*clustersCompressed, param, maxTime, i, offset, clusters, locks); - } + }); size_t nTotalClusters = clustersCompressed->nAttachedClusters + clustersCompressed->nUnattachedClusters; ClusterNative* clusterBuffer = allocator(nTotalClusters); uint32_t offsets[NSLICES][GPUCA_ROW_COUNT]; - offset = 0; + uint32_t offset = 0; uint32_t decodedAttachedClusters = 0; for (uint32_t i = 0; i < NSLICES; i++) { for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { @@ -82,8 +85,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom } clustersNative.clustersLinear = clusterBuffer; clustersNative.setOffsetPtrs(); - GPUCA_OPENMP(parallel for) - for (uint32_t i = 0; i < NSLICES; i++) { + tbb::parallel_for(0, NSLICES, [&](auto i) { for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { ClusterNative* buffer = &clusterBuffer[clustersNative.clusterOffset[i][j]]; if (clusters[i][j].size()) { @@ -108,7 +110,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom if (deterministicRec) { std::sort(buffer, buffer + clustersNative.nClusters[i][j]); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on return 0; } diff --git a/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx b/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx index 0d7ca5c6209a4..9cb49bf4c7ef5 100644 --- a/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx +++ b/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx @@ -166,7 +166,7 @@ INode* BuildTree(const double* frequencies, uint32_t UniqueSymbols) { std::priority_queue, NodeCmp> trees; - for (int32_t i = 0; i < UniqueSymbols; ++i) { + for (int32_t i = 0; i < UniqueSymbols; i++) { if (frequencies[i] != 0) { trees.push(new LeafNode(frequencies[i], i)); } @@ -621,7 +621,7 @@ int32_t main(int argc, char** argv) GenerateCodes(root, HuffCode(), codes); delete root; - for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); ++it) { + for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); it++) { huffmanSize += it->second.size() * probabilities[i][it->first]; } } diff --git a/GPU/GPUTracking/Definitions/GPUDefMacros.h b/GPU/GPUTracking/Definitions/GPUDefMacros.h index b47401c9f05aa..caf2d1670f84e 100644 --- a/GPU/GPUTracking/Definitions/GPUDefMacros.h +++ b/GPU/GPUTracking/Definitions/GPUDefMacros.h @@ -50,11 +50,5 @@ #define GPUCA_UNROLL(...) 
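A note on the partitioners used in this patch: tbb::parallel_for defaults to auto_partitioner, whereas the RTC compilation loop above passes tbb::simple_partitioner, which splits the range down to single iterations so that work stealing can balance very uneven items, and the memset helper passes tbb::static_partitioner, which hands each thread one contiguous slab up front and avoids stealing overhead for uniform work. A standalone illustration of the two choices (a sketch only, not code from the patch):

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <cmath>

void demo(float* data, int n)
{
  // Uneven per-item cost (like per-kernel RTC compilation): fine grains + work stealing.
  tbb::parallel_for(0, n, [&](int i) { data[i] = std::sqrt((float)i); }, tbb::simple_partitioner());

  // Uniform cost (like splitting a large memset): one contiguous chunk per thread.
  tbb::parallel_for(tbb::blocked_range<int>(0, n), [&](const tbb::blocked_range<int>& r) {
    for (int i = r.begin(); i < r.end(); i++) {
      data[i] += 1.0f;
    }
  }, tbb::static_partitioner());
}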
 #endif
 
-#if !defined(WITH_OPENMP) || defined(GPUCA_GPUCODE_DEVICE)
-#define GPUCA_OPENMP(...)
-#else
-#define GPUCA_OPENMP(...) _Pragma(GPUCA_M_STR(omp __VA_ARGS__))
-#endif
-
 #endif
 // clang-format on
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 10bb4797a1c15..905622de26ba9 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -250,9 +250,10 @@ AddOption(conservativeMemoryEstimate, bool, false, "", 0, "Use some more conserv
 AddOption(tpcInputWithClusterRejection, uint8_t, 0, "", 0, "Indicate whether the TPC input is CTF data with cluster rejection, to tune buffer estimations")
 AddOption(forceMaxMemScalers, uint64_t, 0, "", 0, "Force using the maximum values for all buffers, Set a value n > 1 to rescale all maximums to a memory size of n")
 AddOption(registerStandaloneInputMemory, bool, false, "registerInputMemory", 0, "Automatically register input memory buffers for the GPU")
-AddOption(ompThreads, int32_t, -1, "omp", 't', "Number of OMP threads to run (-1: all)", min(-1), message("Using %s OMP threads"))
-AddOption(ompKernels, uint8_t, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels")
-AddOption(ompAutoNThreads, bool, true, "", 0, "Auto-adjust number of OMP threads, decreasing the number for small input data")
+AddOption(nHostThreads, int32_t, -1, "nThreads", 't', "Number of host threads to run (-1: all)", min(-1), message("Using %s CPU threads"))
+AddOption(ompThreads, int32_t, -1, "", 0, "Deprecated synonym for nHostThreads")
+AddOption(inKernelParallel, uint8_t, 2, "", 0, "Parallelize with multi-threading inside kernels on the host instead of over TPC sectors, 2 for nested parallelization over TPC sectors and inside kernels")
+AddOption(autoAdjustHostThreads, bool, true, "", 0, "Auto-adjust number of host threads, decreasing the number for small input data")
 AddOption(nStreams, int8_t, 8, "", 0, "Number of GPU streams / command queues")
 AddOption(nTPCClustererLanes, int8_t, -1, "", 0, "Number of TPC clusterers that can run in parallel (-1 = autoset)")
 AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster max fragment len to a certain value (-1 = autodetect)")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 889e12c258cb4..66f37e1122832 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -673,8 +673,8 @@ int32_t GPUChainTracking::RunChain()
   if ((((GetRecoSteps() & RecoStep::TRDTracking) && !GetProcessingSettings().trdTrackModelO2 && !GetProcessingSettings().willProvideO2PropagatorLate) || ((GetRecoSteps() & RecoStep::Refit) && !param().rec.trackingRefitGPUModel)) && processors()->calibObjects.o2Propagator == nullptr) {
     GPUFatal("Cannot run TRD tracking or refit with o2 track model without o2 propagator"); // This check must happen during run, since o2::Propagator cannot be available during init
   }
-  if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) {
-    mRec->SetNOMPThreads(-1);
+  if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU()) {
+    mRec->SetNActiveThreads(-1);
   }
   const auto threadContext = GetThreadContext();
   if (GetProcessingSettings().runCompressionStatistics && mCompressionStatistics == nullptr) {
@@ -717,8 +717,8 @@ int32_t GPUChainTracking::RunChain()
     }
   }
 
-  if 
(GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU() && mIOPtrs.clustersNative) { - mRec->SetNOMPThreads(mIOPtrs.clustersNative->nClustersTotal / 5000); + if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU() && mIOPtrs.clustersNative) { + mRec->SetNActiveThreads(mIOPtrs.clustersNative->nClustersTotal / 5000); } if (mIOPtrs.clustersNative && runRecoStep(RecoStep::TPCConversion, &GPUChainTracking::ConvertNativeToClusterData)) { @@ -768,8 +768,8 @@ int32_t GPUChainTracking::RunChain() SynchronizeStream(OutputStream()); } - if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) { - mRec->SetNOMPThreads(-1); + if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU()) { + mRec->SetNActiveThreads(-1); } int32_t retVal = 0; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bec61d6b76f1e..6ca645808c5bd 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -35,6 +35,8 @@ #include "utils/strtag.h" +#include + #ifndef GPUCA_NO_VC #include #endif @@ -576,8 +578,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { return 1; } - if (GetProcessingSettings().ompAutoNThreads && !doGPU) { - mRec->SetNOMPThreads(mRec->MemoryScalers()->nTPCdigits / 20000); + if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { + mRec->SetNActiveThreads(mRec->MemoryScalers()->nTPCdigits / 20000); } mRec->MemoryScalers()->nTPCHits = mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits); @@ -674,229 +676,232 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSliceBase, iSliceBase + GetProcessingSettings().nTPCClustererLanes - 1); } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - if (doGPU && fragment.index != 0) { - SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished - } - - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; - clusterer.mPmemory->fragment = fragment; - - if (mIOPtrs.tpcPackedDigits) { - bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; - bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto* inDigits = mIOPtrs.tpcPackedDigits; - size_t numDigits = inDigits->nTPCDigits[iSlice]; - if (setDigitsOnGPU) { - GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSlice], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); - } - if (setDigitsOnHost) { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSlice]); // TODO: Needs fixing (invalid const cast) + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + if (doGPU && fragment.index != 0) { + SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished } - clusterer.mPmemory->counters.nDigits = numDigits; - } - if (mIOPtrs.tpcZS) { - if (mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { - clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSlice].first; - clusterer.mPmemory->counters.nPagesSubslice = mCFContext->nextPos[iSlice].second; - } else { - clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubslice = 0; + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; + clusterer.mPmemory->fragment = fragment; + + if (mIOPtrs.tpcPackedDigits) { + bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; + bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; + auto* inDigits = mIOPtrs.tpcPackedDigits; + size_t numDigits = inDigits->nTPCDigits[iSlice]; + if (setDigitsOnGPU) { + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSlice], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); + } + if (setDigitsOnHost) { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSlice]); // TODO: Needs fixing (invalid const cast) + } + clusterer.mPmemory->counters.nDigits = numDigits; } - } - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - - using ChargeMapType = decltype(*clustererShadow.mPchargeMap); - using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
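The clusterizer hunks in this file all apply one mechanical translation: each OpenMP lane loop becomes a tbb::parallel_for executed inside a tbb::task_arena sized by SetAndGetNActiveThreadsOuterLoop(), and each former "continue" becomes a "return" from the lambda, which skips only the current iteration. A condensed standalone sketch of that shape, with illustrative names (laneEmpty/processLane are not functions from the patch):

#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <tbb/task_arena.h>

static bool laneEmpty(int lane) { return lane % 2 == 0; } // stand-in for e.g. nPositions == 0
static void processLane(int lane) { (void)lane; }         // stand-in for the kernel calls

void runLanes(int nOuterThreads, int maxLane)
{
  tbb::task_arena(nOuterThreads).execute([&] {
    tbb::parallel_for(0, maxLane, [&](int lane) {
      if (laneEmpty(lane)) {
        return; // was: continue
      }
      processLane(lane);
    }, tbb::simple_partitioner());
  });
}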
- runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) { - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); - } - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (doGPU) { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); - SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + if (mIOPtrs.tpcZS) { + if (mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { + clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSlice].first; + clusterer.mPmemory->counters.nPagesSubslice = mCFContext->nextPos[iSlice].second; + } else { + clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubslice = 0; + } } - SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory - } + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + + using ChargeMapType = decltype(*clustererShadow.mPchargeMap); + using PeakMapType = decltype(*clustererShadow.mPpeakMap); + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); + if (fragment.index == 0) { + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); + } + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSlice] || mCFContext->zsVersion == -1)) { - clusterer.mPmemory->counters.nPositions = 0; - continue; - } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice] == 0) { - clusterer.mPmemory->counters.nPositions = 0; - continue; - } + if (doGPU) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); + SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + } + SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory + } - if (propagateMCLabels && fragment.index == 0) { - clusterer.PrepareMC(); - clusterer.mPinputLabels = digitsMC->v[iSlice]; - if (clusterer.mPinputLabels == nullptr) { - GPUFatal("MC label container missing, sector %d", iSlice); + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSlice] || mCFContext->zsVersion == -1)) { + clusterer.mPmemory->counters.nPositions = 0; + return; } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice]) { - GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", 
(int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice] == 0) { + clusterer.mPmemory->counters.nPositions = 0; + return; } - } - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSlice) { - if (not mIOPtrs.tpcZS) { - runKernel({GetGrid(1, lane), {iSlice}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } else if (propagateMCLabels) { - runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + if (propagateMCLabels && fragment.index == 0) { + clusterer.PrepareMC(); + clusterer.mPinputLabels = digitsMC->v[iSlice]; + if (clusterer.mPinputLabels == nullptr) { + GPUFatal("MC label container missing, sector %d", iSlice); + } + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice]) { + GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + } } - } - if (mIOPtrs.tpcZS) { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->slice[iSlice].count[0] && mIOPtrs.tpcZS->slice[iSlice].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->slice[iSlice].zsPtr[0][0]) - : 0; - uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubslice : GPUTrackingInOutZS::NENDPOINTS; + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSlice) { + if (not mIOPtrs.tpcZS) { + runKernel({GetGrid(1, lane), {iSlice}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } else if (propagateMCLabels) { + runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + } - (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; + if (mIOPtrs.tpcZS) { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->slice[iSlice].count[0] && mIOPtrs.tpcZS->slice[iSlice].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->slice[iSlice].zsPtr[0][0]) : 0); + uint32_t nBlocks = doGPU ? 
clusterer.mPmemory->counters.nPagesSubslice : GPUTrackingInOutZS::NENDPOINTS; + + (void)tpcTimeBinCut; // TODO: To be used in decoding kernels + switch (mCFContext->zsVersion) { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + } + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + if (doGPU) { + SynchronizeStream(lane); } - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - if (doGPU) { - SynchronizeStream(lane); - } - if (mIOPtrs.tpcZS) { - CfFragment f = fragment.next(); - int32_t nextSlice = iSlice; - if (f.isEnd()) { - nextSlice += GetProcessingSettings().nTPCClustererLanes; - f = mCFContext->fragmentFirst; + if (mIOPtrs.tpcZS) { + CfFragment f = fragment.next(); + int32_t nextSlice = iSlice; + if (f.isEnd()) { + nextSlice += GetProcessingSettings().nTPCClustererLanes; + f = mCFContext->fragmentFirst; + } + if (nextSlice < NSLICES && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSlice] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { + mCFContext->nextPos[nextSlice] = RunTPCClusterizer_transferZS(nextSlice, f, GetProcessingSettings().nTPCClustererLanes + lane); + } } - if (nextSlice < NSLICES && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSlice] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { - mCFContext->nextPos[nextSlice] = RunTPCClusterizer_transferZS(nextSlice, f, GetProcessingSettings().nTPCClustererLanes + lane); + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (clusterer.mPmemory->counters.nPositions == 0) { + return; + } + if (!mIOPtrs.tpcZS) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + } + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { + clusterer.DumpChargeMap(*mDebugFile, "Charges"); } - } - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (clusterer.mPmemory->counters.nPositions == 0) { - continue; - } - if (!mIOPtrs.tpcZS) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - } - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { - clusterer.DumpChargeMap(*mDebugFile, "Charges"); - } - if (propagateMCLabels) { - runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}); - } + if (propagateMCLabels) { + runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}); + } - bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); - checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); - checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; + bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); + checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); + checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; - if (checkForNoisyPads) { - int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline; + if (checkForNoisyPads) { + int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline; - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}); - } + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}); + } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { - clusterer.DumpPeakMap(*mDebugFile, "Peaks"); - } + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { + clusterer.DumpPeakMap(*mDebugFile, "Peaks"); + } - RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (doGPU) { - SynchronizeStream(lane); - } - if (clusterer.mPmemory->counters.nPeaks == 0) { - continue; - } - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) { - clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks"); - } + RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (doGPU) { + SynchronizeStream(lane); + } + if (clusterer.mPmemory->counters.nPeaks == 0) { + return; + } + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) { + clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks"); + } - RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (doGPU) { - SynchronizeStream(lane); - } + RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (doGPU) { + SynchronizeStream(lane); + } - if (fragment.index == 0) { - deviceEvent* waitEvent = nullptr; - if (transferRunning[lane] == 1) { - waitEvent = &mEvents->stream[lane]; - transferRunning[lane] = 2; + if (fragment.index == 0) { + deviceEvent* waitEvent = nullptr; + if (transferRunning[lane] == 1) { + waitEvent = &mEvents->stream[lane]; + transferRunning[lane] = 2; + } + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); } - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); - } - if (clusterer.mPmemory->counters.nClusters == 0) { - continue; - } + if (clusterer.mPmemory->counters.nClusters == 0) { + return; + } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); - if (doGPU && propagateMCLabels) { - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); - if (doGPU) { - SynchronizeStream(lane); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if (doGPU && propagateMCLabels) { + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); + if (doGPU) { + SynchronizeStream(lane); + } + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); - } - TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane); - laneHasData[lane] = true; - // Include clusters in default debug mask, exclude other debug output by default - DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); - } - mRec->SetNestedLoopOmpFactor(1); + TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane); + laneHasData[lane] = true; + // Include clusters in default debug mask, exclude other debug output by default + DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + 
}); + mRec->SetNActiveThreadsOuterLoop(1); } size_t nClsFirst = nClsTotal; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index f3f3627573339..4ea7094416d5e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -268,7 +268,7 @@ int32_t GPUChainTracking::RunTPCDecompression() int32_t nStreams = doGPU ? mRec->NStreams() - 1 : 1; if (cmprClsHost.nAttachedClusters != 0) { std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets, 0u); // computing clusters offsets for first kernel - for (int32_t iStream = 0; iStream < nStreams; ++iStream) { + for (int32_t iStream = 0; iStream < nStreams; iStream++) { uint32_t startTrack = cmprClsHost.nTracks / nStreams * iStream; uint32_t endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams); // index of last track (excluded from computation) uint32_t numTracks = endTrack - startTrack; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index b68f0797f425f..cab025b03e8b6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -22,6 +22,8 @@ #include "utils/strtag.h" #include +#include + using namespace o2::gpu; int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSlice, int32_t threadId, bool synchronizeOutput) @@ -154,110 +156,110 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() int32_t streamMap[NSLICES]; bool error = false; - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES))) - for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - GPUTPCTracker& trk = processors()->tpcTrackers[iSlice]; - GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSlice] : trk; - int32_t useStream = (iSlice % mRec->NStreams()); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, NSLICES)).execute([&] { + tbb::parallel_for(0, NSLICES, [&](auto iSlice) { + GPUTPCTracker& trk = processors()->tpcTrackers[iSlice]; + GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSlice] : trk; + int32_t useStream = (iSlice % mRec->NStreams()); - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Creating Slice Data (Slice %d)", iSlice); - } - if (doGPU) { - TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? nullptr : &mEvents->init}}); - streamInit[useStream] = true; - } else { - if (ReadEvent(iSlice, 0)) { - GPUError("Error reading event"); - error = 1; - continue; + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Creating Slice Data (Slice %d)", iSlice); + } + if (doGPU) { + TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); + streamInit[useStream] = true; + } else { + if (ReadEvent(iSlice, 0)) { + GPUError("Error reading event"); + error = 1; + return; + } + } + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); + } + if (!doGPU && trk.CheckEmptySlice() && GetProcessingSettings().debugLevel == 0) { + return; } - } - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); - } - if (!doGPU && trk.CheckEmptySlice() && GetProcessingSettings().debugLevel == 0) { - continue; - } - if (GetProcessingSettings().debugLevel >= 6) { - *mDebugFile << "\n\nReconstruction: Slice " << iSlice << "/" << NSLICES << std::endl; - if (GetProcessingSettings().debugMask & 1) { - if (doGPU) { - TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + if (GetProcessingSettings().debugLevel >= 6) { + *mDebugFile << "\n\nReconstruction: Slice " << iSlice << "/" << NSLICES << std::endl; + if (GetProcessingSettings().debugMask & 1) { + if (doGPU) { + TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + } + trk.DumpSliceData(*mDebugFile); } - trk.DumpSliceData(*mDebugFile); } - } - // Initialize temporary memory where needed - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Copying Slice Data to GPU and initializing temporary memory"); - } - runKernel(GetGridAutoStep(useStream, RecoStep::TPCSliceTracking), trkShadow.Data().HitWeights(), trkShadow.Data().NumberOfHitsPlusAlign() * sizeof(*trkShadow.Data().HitWeights())); + // Initialize temporary memory where needed + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Copying Slice Data to GPU and initializing temporary memory"); + } + runKernel(GetGridAutoStep(useStream, RecoStep::TPCSliceTracking), trkShadow.Data().HitWeights(), trkShadow.Data().NumberOfHitsPlusAlign() * sizeof(*trkShadow.Data().HitWeights())); - if (!doGPU) { - TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); // Copy Data to GPU Global Memory - } - if (GPUDebug("Initialization (3)", useStream)) { - throw std::runtime_error("memcpy failure"); - } + if (!doGPU) { + TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); // Copy Data to GPU Global Memory + } + if (GPUDebug("Initialization (3)", useStream)) { + throw std::runtime_error("memcpy failure"); + } - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? nullptr : &mEvents->init}}); - streamInit[useStream] = true; + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); + streamInit[useStream] = true; - if (GetProcessingSettings().keepDisplayMemory) { - TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); - memcpy(trk.LinkTmpMemory(), mRec->Res(trk.MemoryResLinks()).Ptr(), mRec->Res(trk.MemoryResLinks()).Size()); - if (GetProcessingSettings().debugMask & 2) { - trk.DumpLinks(*mDebugFile, 0); + if (GetProcessingSettings().keepDisplayMemory) { + TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + memcpy(trk.LinkTmpMemory(), mRec->Res(trk.MemoryResLinks()).Ptr(), mRec->Res(trk.MemoryResLinks()).Size()); + if (GetProcessingSettings().debugMask & 2) { + trk.DumpLinks(*mDebugFile, 0); + } } - } - runKernel({GetGridBlk(GPUCA_ROW_COUNT - 2, useStream), {iSlice}}); - DoDebugAndDump(RecoStep::TPCSliceTracking, 4, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1); + runKernel({GetGridBlk(GPUCA_ROW_COUNT - 2, useStream), {iSlice}}); + DoDebugAndDump(RecoStep::TPCSliceTracking, 4, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1); - runKernel({GetGridBlk(GPUCA_ROW_COUNT - 6, useStream), {iSlice}}); + runKernel({GetGridBlk(GPUCA_ROW_COUNT - 6, useStream), {iSlice}}); #ifdef GPUCA_SORT_STARTHITS_GPU - if (doGPU) { - runKernel({GetGridAuto(useStream), {iSlice}}); - } + if (doGPU) { + runKernel({GetGridAuto(useStream), {iSlice}}); + } #endif - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGrid(1, 1, useStream), {iSlice}}); - } - DoDebugAndDump(RecoStep::TPCSliceTracking, 32, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile); - - if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { - trk.UpdateMaxData(); - AllocateRegisteredMemory(trk.MemoryResTracklets()); - AllocateRegisteredMemory(trk.MemoryResOutput()); - } - - if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletConstructorInPipeline) { - runKernel({GetGridAuto(useStream), {iSlice}}); - DoDebugAndDump(RecoStep::TPCSliceTracking, 128, trk, &GPUTPCTracker::DumpTrackletHits, *mDebugFile); - if (GetProcessingSettings().debugMask & 256 && GetProcessingSettings().deterministicGPUReconstruction < 2) { - trk.DumpHitWeights(*mDebugFile); + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGrid(1, 1, useStream), {iSlice}}); } - } + DoDebugAndDump(RecoStep::TPCSliceTracking, 32, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile); - if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletSelectorInPipeline) { - runKernel({GetGridAuto(useStream), {iSlice}}); - runKernel({{1, -ThreadCount(), useStream}, {iSlice}}, 1); - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGrid(1, 1, useStream), {iSlice}}); + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + trk.UpdateMaxData(); + AllocateRegisteredMemory(trk.MemoryResTracklets()); + AllocateRegisteredMemory(trk.MemoryResOutput()); } - TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, trk.MemoryResCommon(), useStream, &mEvents->slice[iSlice]); - streamMap[iSlice] = useStream; - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Slice %u, Number of tracks: %d", iSlice, *trk.NTracks()); + + if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletConstructorInPipeline) { + runKernel({GetGridAuto(useStream), {iSlice}}); + DoDebugAndDump(RecoStep::TPCSliceTracking, 128, trk, &GPUTPCTracker::DumpTrackletHits, 
*mDebugFile); + if (GetProcessingSettings().debugMask & 256 && GetProcessingSettings().deterministicGPUReconstruction < 2) { + trk.DumpHitWeights(*mDebugFile); + } } - DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile); - } - } - mRec->SetNestedLoopOmpFactor(1); + + if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletSelectorInPipeline) { + runKernel({GetGridAuto(useStream), {iSlice}}); + runKernel({{1, -ThreadCount(), useStream}, {iSlice}}, 1); + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGrid(1, 1, useStream), {iSlice}}); + } + TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, trk.MemoryResCommon(), useStream, &mEvents->slice[iSlice]); + streamMap[iSlice] = useStream; + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Slice %u, Number of tracks: %d", iSlice, *trk.NTracks()); + } + DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile); + } }, tbb::simple_partitioner()); + }); + mRec->SetNActiveThreadsOuterLoop(1); if (error) { return (3); } @@ -419,16 +421,16 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } } else { mSliceSelectorReady = NSLICES; - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES))) - for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - if (param().rec.tpc.extrapolationTracking) { - ExtrapolationTracking(iSlice, 0); - } - if (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) { - WriteOutput(iSlice, 0); - } - } - mRec->SetNestedLoopOmpFactor(1); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, NSLICES)).execute([&] { + tbb::parallel_for(0, NSLICES, [&](auto iSlice) { + if (param().rec.tpc.extrapolationTracking) { + ExtrapolationTracking(iSlice, 0); + } + if (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) { + WriteOutput(iSlice, 0); + } }, tbb::simple_partitioner()); + }); + mRec->SetNActiveThreadsOuterLoop(1); } if (param().rec.tpc.extrapolationTracking && GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx index b6f11375328d0..4f654c0fa7beb 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx @@ -14,9 +14,7 @@ #include "GPUTPCGMMergerGPU.h" #include "GPUCommonAlgorithm.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -24,20 +22,18 @@ template <> GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger, int32_t mode) { const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NOutputTracks(); - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (int32_t ii = get_global_id(0); ii < iEnd; ii += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), int32_t, ii, iEnd, { const int32_t i = mode == -1 ? merger.RetryRefitIds()[ii] : mode ? 
merger.TrackOrderProcess()[ii] : ii; GPUTPCGMTrackParam::RefitTrack(merger.OutputTracks()[i], i, &merger, mode == -1); - } + }); } template <> GPUdii() void GPUTPCGMMergerFollowLoopers::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (uint32_t i = get_global_id(0); i < merger.Memory()->nLoopData; i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), uint32_t, i, merger.Memory()->nLoopData, { GPUTPCGMTrackParam::RefitLoop(&merger, i); - } + }); } template <> diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCDef.h b/GPU/GPUTracking/SliceTracker/GPUTPCDef.h index 4b4f130faed65..3b53c3e66875a 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCDef.h +++ b/GPU/GPUTracking/SliceTracker/GPUTPCDef.h @@ -47,4 +47,4 @@ struct cahit2 { cahit x, y; }; #endif #endif //GPUDTPCEF_H -// clang format on +// clang-format on diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx index 8a727dc2da930..3cc3e3805dce8 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx @@ -35,10 +35,10 @@ using namespace o2::gpu; void GPUTPCSliceData::InitializeRows(const GPUParam& p) { // initialisation of rows - for (int32_t i = 0; i < GPUCA_ROW_COUNT + 1; ++i) { + for (int32_t i = 0; i < GPUCA_ROW_COUNT + 1; i++) { new (&mRows[i]) GPUTPCRow; } - for (int32_t i = 0; i < GPUCA_ROW_COUNT; ++i) { + for (int32_t i = 0; i < GPUCA_ROW_COUNT; i++) { mRows[i].mX = p.tpcGeometry.Row2X(i); mRows[i].mMaxY = CAMath::Tan(p.par.dAlpha / 2.f) * mRows[i].mX; } diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx index df0c7813fa0db..cece49073f11b 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx @@ -98,7 +98,7 @@ void* GPUTPCTracker::SetPointersCommon(void* mem) void GPUTPCTracker::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels == 1 || mRec->GetProcessingSettings().ompThreads == 1); + bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1); GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (uint16_t)(mISlice % mRec->GetProcessingSettings().nStreams)}; mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSliceLinks", reLinks); mMemoryResSliceScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSliceScratch"); diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 
e6017788144e0..53ed77fe62d8c 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -165,7 +165,7 @@ int32_t ReadConfiguration(int argc, char** argv) #endif #ifndef GPUCA_TPC_GEOMETRY_O2 #error Why was configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0 needed? - configStandalone.proc.ompKernels = false; + configStandalone.proc.inKernelParallel = false; configStandalone.proc.createO2Output = 0; if (configStandalone.rundEdx == -1) { configStandalone.rundEdx = 0; @@ -216,10 +216,10 @@ int32_t ReadConfiguration(int argc, char** argv) configStandalone.noprompt = 1; } if (configStandalone.proc.debugLevel >= 4) { - if (configStandalone.proc.ompKernels) { - configStandalone.proc.ompKernels = 1; + if (configStandalone.proc.inKernelParallel) { + configStandalone.proc.inKernelParallel = 1; } else { - configStandalone.proc.ompThreads = 1; + configStandalone.proc.nHostThreads = 1; } } if (configStandalone.setO2Settings) { diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index 1f11f0bacffac..32cdb246cf417 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -70,13 +70,8 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic -Wl,--no-undefined") -# Check and set settings for optional packages -if(CONFIG_OPENMP) - find_package(OpenMP REQUIRED) -else() - set(OpenMP_FOUND OFF) - set(OpenMP_CXX_FOUND OFF) -endif() +# Find mandatory packages +find_package(TBB REQUIRED) if(GPUCA_CONFIG_VC) find_package(Vc REQUIRED) @@ -252,14 +247,6 @@ if(GPUCA_CONFIG_ROOT) endif() target_link_libraries(standalone_support PUBLIC Microsoft.GSL::GSL TPCFastTransformation) -if(OpenMP_CXX_FOUND) - target_link_libraries(ca PUBLIC OpenMP::OpenMP_CXX) - if (CMAKE_CXX_COMPILER STREQUAL "clang++") - target_link_libraries(ca PUBLIC -fopenmp) - target_link_libraries(GPUTracking PUBLIC -fopenmp) - endif() -endif() - # Installation install(TARGETS ca TPCFastTransformation standalone_support) install(FILES "cmake/makefile" DESTINATION "${CMAKE_INSTALL_PREFIX}") diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 97091d833efd8..87716d700abc8 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -15,7 +15,6 @@ set(ENABLE_CUDA AUTO) set(ENABLE_HIP AUTO) set(ENABLE_OPENCL AUTO) -set(CONFIG_OPENMP 1) set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx index c44b4c09a3d7a..4e8fcd13e0801 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx @@ -30,9 +30,6 @@ class GPUTPCGMPolynomialField; #ifndef GPUCA_GPUCODE #include "GPUMemoryResource.h" #include "GPUReconstruction.h" -#ifdef WITH_OPENMP -#include -#endif // WITH_OPENMP #include #include @@ -61,10 +58,10 @@ void* GPUTRDTracker_t::SetPointersBase(void* base) //-------------------------------------------------------------------- // Allocate memory for fixed size objects (needs to be done only once) //-------------------------------------------------------------------- - mMaxThreads = mRec->GetMaxThreads(); + 
mMaxBackendThreads = mRec->GetMaxBackendThreads(); computePointerWithAlignment(base, mR, kNChambers); - computePointerWithAlignment(base, mHypothesis, mNCandidates * mMaxThreads); - computePointerWithAlignment(base, mCandidates, mNCandidates * 2 * mMaxThreads); + computePointerWithAlignment(base, mHypothesis, mNCandidates * mMaxBackendThreads); + computePointerWithAlignment(base, mCandidates, mNCandidates * 2 * mMaxBackendThreads); return base; } @@ -94,7 +91,7 @@ void* GPUTRDTracker_t::SetPointersTracks(void* base) } template -GPUTRDTracker_t::GPUTRDTracker_t() : mR(nullptr), mIsInitialized(false), mGenerateSpacePoints(false), mProcessPerTimeFrame(false), mNAngleHistogramBins(25), mAngleHistogramRange(50), mMemoryPermanent(-1), mMemoryTracklets(-1), mMemoryTracks(-1), mNMaxCollisions(0), mNMaxTracks(0), mNMaxSpacePoints(0), mTracks(nullptr), mTrackAttribs(nullptr), mNCandidates(1), mNTracks(0), mNEvents(0), mMaxThreads(100), mTrackletIndexArray(nullptr), mHypothesis(nullptr), mCandidates(nullptr), mSpacePoints(nullptr), mGeo(nullptr), mRPhiA2(0), mRPhiB(0), mRPhiC2(0), mDyA2(0), mDyB(0), mDyC2(0), mAngleToDyA(0), mAngleToDyB(0), mAngleToDyC(0), mDebugOutput(false), mMaxEta(0.84f), mRoadZ(18.f), mZCorrCoefNRC(1.4f), mTPCVdrift(2.58f), mTPCTDriftOffset(0.f), mDebug(new GPUTRDTrackerDebug()) +GPUTRDTracker_t::GPUTRDTracker_t() : mR(nullptr), mIsInitialized(false), mGenerateSpacePoints(false), mProcessPerTimeFrame(false), mNAngleHistogramBins(25), mAngleHistogramRange(50), mMemoryPermanent(-1), mMemoryTracklets(-1), mMemoryTracks(-1), mNMaxCollisions(0), mNMaxTracks(0), mNMaxSpacePoints(0), mTracks(nullptr), mTrackAttribs(nullptr), mNCandidates(1), mNTracks(0), mNEvents(0), mMaxBackendThreads(100), mTrackletIndexArray(nullptr), mHypothesis(nullptr), mCandidates(nullptr), mSpacePoints(nullptr), mGeo(nullptr), mRPhiA2(0), mRPhiB(0), mRPhiC2(0), mDyA2(0), mDyB(0), mDyC2(0), mAngleToDyA(0), mAngleToDyB(0), mAngleToDyC(0), mDebugOutput(false), mMaxEta(0.84f), mRoadZ(18.f), mZCorrCoefNRC(1.4f), mTPCVdrift(2.58f), mTPCTDriftOffset(0.f), mDebug(new GPUTRDTrackerDebug()) { //-------------------------------------------------------------------- // Default constructor diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h index 59e753e239cf9..274dfd6668eaf 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h @@ -167,7 +167,7 @@ class GPUTRDTracker_t : public GPUProcessor int32_t mNCandidates; // max. 
track hypothesis per layer int32_t mNTracks; // number of TPC tracks to be matched int32_t mNEvents; // number of processed events - int32_t mMaxThreads; // maximum number of supported threads + int32_t mMaxBackendThreads; // maximum number of supported threads // index of first tracklet for each chamber within tracklets array, last entry is total number of tracklets for given collision // the array has (kNChambers + 1) * numberOfCollisions entries // note, that for collision iColl one has to add an offset corresponding to the index of the first tracklet of iColl to the index stored in mTrackletIndexArray diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx index eb9eecfe6e846..d18f04e554043 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx @@ -16,9 +16,8 @@ #include "GPUTRDGeometry.h" #include "GPUConstantMem.h" #include "GPUCommonTypeTraits.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif + +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -33,10 +32,9 @@ GPUdii() void GPUTRDTrackerKernels::Thread(int32_t nBlocks, int32_t nThreads, in } } #endif - GPUCA_OPENMP(parallel for if(!trdTracker->GetRec().GetProcessingSettings().ompKernels) num_threads(trdTracker->GetRec().GetProcessingSettings().ompThreads)) - for (int32_t i = get_global_id(0); i < trdTracker->NTracks(); i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(trdTracker->GetRec(), int32_t, i, trdTracker->NTracks(), { trdTracker->DoTrackingThread(i, get_global_id(0)); - } + }); } #if !defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_DEVICE) // FIXME: DR: WORKAROUND to avoid CUDA bug creating host symbols for device code. 
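For reference: the GPUCA_TBB_KERNEL_LOOP macro used in the merger and TRD tracker hunks above is provided by the new GPUReconstructionThreading.h header, whose contents are not part of this excerpt. A minimal sketch of what such a macro can expand to, hypothetical and simplified (the real header may additionally consult the inKernelParallel / nHostThreads settings through the rec argument, which this sketch ignores):

#if !defined(GPUCA_GPUCODE)
// Host build: run the kernel body as a TBB parallel_for over the full index range.
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, ...) \
  tbb::parallel_for<vartype>(0, (iEnd), [&](vartype varname) __VA_ARGS__)
#else
// Device build: keep the classic grid-stride loop over the GPU threads.
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, ...) \
  for (vartype varname = get_global_id(0); varname < (iEnd); varname += get_global_size(0)) __VA_ARGS__
#endif

The kernel body is taken as __VA_ARGS__ because braces do not protect commas during macro argument splitting; with this shape, the call sites above compile to an ordinary tbb::parallel_for on the host while remaining a grid-stride loop in device code.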
diff --git a/GPU/GPUTracking/display/CMakeLists.txt b/GPU/GPUTracking/display/CMakeLists.txt index 2c1814a1a26a0..68385d7916234 100644 --- a/GPU/GPUTracking/display/CMakeLists.txt +++ b/GPU/GPUTracking/display/CMakeLists.txt @@ -219,7 +219,4 @@ if(GPUCA_EVENT_DISPLAY_QT) target_link_libraries(${targetName} PRIVATE Qt5::Widgets) endif() -if(OpenMP_CXX_FOUND) - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx index 918011b85ea04..e42a4fa3e4bf1 100644 --- a/GPU/GPUTracking/display/GPUDisplay.cxx +++ b/GPU/GPUTracking/display/GPUDisplay.cxx @@ -24,9 +24,6 @@ #ifndef _WIN32 #include "../utils/linux_helpers.h" #endif -#ifdef WITH_OPENMP -#include -#endif #include "GPUChainTracking.h" #include "GPUQA.h" diff --git a/GPU/GPUTracking/display/GPUDisplay.h b/GPU/GPUTracking/display/GPUDisplay.h index d6a65f212ecf3..1c4b751bbf85b 100644 --- a/GPU/GPUTracking/display/GPUDisplay.h +++ b/GPU/GPUTracking/display/GPUDisplay.h @@ -207,7 +207,7 @@ class GPUDisplay : public GPUDisplayInterface void DrawTrackITS(int32_t trackId, int32_t iSlice); GPUDisplay::vboList DrawFinalITS(); template - void DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer); + void DrawFinal(int32_t iSlice, int32_t /*iCol*/, const GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer); vboList DrawGrid(const GPUTPCTracker& tracker); vboList DrawGridTRD(int32_t sector); void DoScreenshot(const char* filename, std::vector& pixels, float animateTime = -1.f); diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx index 764f659d07e64..ca9fd6be01703 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx @@ -14,25 +14,20 @@ #include "GPUDisplay.h" -#ifdef WITH_OPENMP -#include -#endif #ifndef _WIN32 #include "bitmapfile.h" #endif +#include "oneapi/tbb.h" + using namespace o2::gpu; int32_t GPUDisplay::getNumThreads() { if (mChain) { - return mChain->GetProcessingSettings().ompThreads; + return mChain->GetProcessingSettings().nHostThreads; } else { -#ifdef WITH_OPENMP - return omp_get_max_threads(); -#else - return 1; -#endif + return tbb::info::default_concurrency(); } } diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx index 5d4628cf0eb3f..8c42cfa46abb9 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx @@ -36,9 +36,7 @@ #include "SimulationDataFormat/ConstMCTruthContainer.h" #include "GPUTrackParamConvert.h" -#ifdef WITH_OPENMP -#include -#endif +#include using namespace o2::gpu; @@ -325,7 +323,7 @@ GPUDisplay::vboList GPUDisplay::DrawFinalITS() } template -void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer) +void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, const GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer) { auto& vBuf = threadBuffer.vBuf; auto& buffer = threadBuffer.buffer; @@ -698,15 +696,15 @@ GPUDisplay::vboList GPUDisplay::DrawGridTRD(int32_t sector) if (trdsector >= 9) { alpha -= 
2 * CAMath::Pi(); } - for (int32_t iLy = 0; iLy < GPUTRDTracker::EGPUTRDTracker::kNLayers; ++iLy) { - for (int32_t iStack = 0; iStack < GPUTRDTracker::EGPUTRDTracker::kNStacks; ++iStack) { + for (int32_t iLy = 0; iLy < GPUTRDTracker::EGPUTRDTracker::kNLayers; iLy++) { + for (int32_t iStack = 0; iStack < GPUTRDTracker::EGPUTRDTracker::kNStacks; iStack++) { int32_t iDet = geo->GetDetector(iLy, iStack, trdsector); auto matrix = geo->GetClusterMatrix(iDet); if (!matrix) { continue; } auto pp = geo->GetPadPlane(iDet); - for (int32_t i = 0; i < pp->GetNrows(); ++i) { + for (int32_t i = 0; i < pp->GetNrows(); i++) { float xyzLoc1[3]; float xyzLoc2[3]; float xyzGlb1[3]; @@ -776,26 +774,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLFinal[iSlice].resize(mNCollissions); } } - GPUCA_OPENMP(parallel num_threads(getNumThreads())) - { -#ifdef WITH_OPENMP - int32_t numThread = omp_get_thread_num(); - int32_t numThreads = omp_get_num_threads(); -#else - int32_t numThread = 0, numThreads = 1; -#endif + int32_t numThreads = getNumThreads(); + tbb::task_arena(numThreads).execute([&] { if (mChain && (mChain->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking)) { - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { GPUTPCTracker& tracker = (GPUTPCTracker&)sliceTracker(iSlice); tracker.SetPointersDataLinks(tracker.LinkTmpMemory()); mGlDLLines[iSlice][tINITLINK] = DrawLinks(tracker, tINITLINK, true); - tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); - } - GPUCA_OPENMP(barrier) + tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); mGlDLLines[iSlice][tLINK] = DrawLinks(tracker, tLINK); @@ -805,30 +794,28 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLGrid[iSlice] = DrawGrid(tracker); if (iSlice < NSLICES / 2) { mGlDLGridTRD[iSlice] = DrawGridTRD(iSlice); - } - } - GPUCA_OPENMP(barrier) + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); - mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); - } - GPUCA_OPENMP(barrier) + mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); // clang-format off + }, tbb::simple_partitioner()); // clang-format on } - mThreadTracks[numThread].resize(mNCollissions); - for (int32_t i = 0; i < mNCollissions; i++) { - for (int32_t j = 0; j < NSLICES; j++) { - for (int32_t k = 0; k < 2; k++) { - mThreadTracks[numThread][i][j][k].clear(); + tbb::parallel_for(0, numThreads, [&](int32_t iThread) { + mThreadTracks[iThread].resize(mNCollissions); + for (int32_t i = 0; i < mNCollissions; i++) { + for (int32_t j = 0; j < NSLICES; j++) { + for (int32_t k = 0; k < 2; k++) { + mThreadTracks[iThread][i][j][k].clear(); + } } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (mConfig.showTPCTracksFromO2Format) { #ifdef GPUCA_TPC_GEOMETRY_O2 uint32_t col = 0; - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nOutputTracksTPCO2; i++) { + tbb::parallel_for(0, mIOPtrs->nOutputTracksTPCO2, [&](auto i) { uint8_t 
sector, row; if (mIOPtrs->clustersNative) { mIOPtrs->outputTracksTPCO2[i].getCluster(mIOPtrs->outputClusRefsTPCO2, 0, *mIOPtrs->clustersNative, sector, row); @@ -838,18 +825,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() if (mQA && mIOPtrs->outputTracksTPCO2MC) { col = mQA->GetMCLabelCol(mIOPtrs->outputTracksTPCO2MC[i]); } - mThreadTracks[numThread][col][sector][0].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][sector][0].emplace_back(i); + }); #endif } else { - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nMergedTracks; i++) { + tbb::parallel_for(0, mIOPtrs->nMergedTracks, [&](auto i) { const GPUTPCGMMergedTrack* track = &mIOPtrs->mergedTracks[i]; if (track->NClusters() == 0) { - continue; + return; } if (mCfgH.hideRejectedTracks && !track->OK()) { - continue; + return; } int32_t slice = mIOPtrs->mergedTrackHits[track->FirstClusterRef() + track->NClusters() - 1].slice; uint32_t col = 0; @@ -863,18 +849,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() } #endif } - mThreadTracks[numThread][col][slice][0].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][slice][0].emplace_back(i); + }); } for (uint32_t col = 0; col < mIOPtrs->nMCInfosTPCCol; col++) { - GPUCA_OPENMP(for) - for (uint32_t i = mIOPtrs->mcInfosTPCCol[col].first; i < mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num; i++) { + tbb::parallel_for(mIOPtrs->mcInfosTPCCol[col].first, mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num, [&](uint32_t i) { const GPUTPCMCInfo& mc = mIOPtrs->mcInfosTPC[i]; if (mc.charge == 0.f) { - continue; + return; } if (mc.pid < 0) { - continue; + return; } float alpha = atan2f(mc.y, mc.x); @@ -885,18 +870,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() if (mc.z < 0) { slice += 18; } - mThreadTracks[numThread][col][slice][1].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][slice][1].emplace_back(i); + }); } - GPUCA_OPENMP(barrier) GPUTPCGMPropagator prop; prop.SetMaxSinPhi(.999); prop.SetMaterialTPC(); prop.SetPolynomialField(&mParam->polynomialField); - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { + int32_t numThread = GPUReconstruction::getHostThreadIndex(); for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mThreadBuffers[numThread].clear(); for (int32_t iSet = 0; iSet < numThreads; iSet++) { @@ -915,19 +899,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() } list[i] = vboList(startCount, mVertexBufferStart[iSlice].size() - startCount, iSlice); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(barrier) - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { for (int32_t i = 0; i < N_POINTS_TYPE_TPC; i++) { for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mGlDLPoints[iSlice][i][iCol] = DrawClusters(iSlice, i, iCol); } - } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); // End of parallel section mGlDLFinalITS = DrawFinalITS(); diff --git a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx index aaa03b8a24d18..f53fa185029f8 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx @@ -33,6 +33,8 @@ #include "ITSMFTBase/DPLAlpideParam.h" 
#endif +#include <oneapi/tbb.h> + using namespace o2::gpu; void GPUDisplay::DrawGLScene_updateEventData() { @@ -126,103 +128,107 @@ void GPUDisplay::DrawGLScene_updateEventData() } mUpdateTrackFilter = false; - mMaxClusterZ = 0; - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ)) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - int32_t row = 0; - uint32_t nCls = mParam->par.earlyTpcTransform ? mIOPtrs->nClusterData[iSlice] : mIOPtrs->clustersNative ? mIOPtrs->clustersNative->nClustersSector[iSlice] - : 0; - for (uint32_t i = 0; i < nCls; i++) { - int32_t cid; - if (mParam->par.earlyTpcTransform) { - const auto& cl = mIOPtrs->clusterData[iSlice][i]; - cid = cl.id; - row = cl.row; - } else { - cid = mIOPtrs->clustersNative->clusterOffset[iSlice][0] + i; - while (row < GPUCA_ROW_COUNT - 1 && mIOPtrs->clustersNative->clusterOffset[iSlice][row + 1] <= (uint32_t)cid) { - row++; + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, NSLICES, 1), float(0.f), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + for (int32_t iSlice = r.begin(); iSlice < r.end(); iSlice++) { + int32_t row = 0; + uint32_t nCls = mParam->par.earlyTpcTransform ? mIOPtrs->nClusterData[iSlice] : (mIOPtrs->clustersNative ? mIOPtrs->clustersNative->nClustersSector[iSlice] : 0); + for (uint32_t i = 0; i < nCls; i++) { + int32_t cid; + if (mParam->par.earlyTpcTransform) { + const auto& cl = mIOPtrs->clusterData[iSlice][i]; + cid = cl.id; + row = cl.row; + } else { + cid = mIOPtrs->clustersNative->clusterOffset[iSlice][0] + i; + while (row < GPUCA_ROW_COUNT - 1 && mIOPtrs->clustersNative->clusterOffset[iSlice][row + 1] <= (uint32_t)cid) { + row++; + } } - } - if (cid >= mNMaxClusters) { - throw std::runtime_error("Cluster Buffer Size exceeded"); - } - float4* ptr = &mGlobalPos[cid]; - if (mParam->par.earlyTpcTransform) { - const auto& cl = mIOPtrs->clusterData[iSlice][i]; - mParam->Slice2Global(iSlice, (mCfgH.clustersOnNominalRow ? mParam->tpcGeometry.Row2X(row) : cl.x) + mCfgH.xAdd, cl.y, cl.z, &ptr->x, &ptr->y, &ptr->z); - } else { - float x, y, z; - const auto& cln = mIOPtrs->clustersNative->clusters[iSlice][0][i]; - GPUTPCConvertImpl::convert(*mCalib->fastTransform, *mParam, iSlice, row, cln.getPad(), cln.getTime(), x, y, z); - if (mCfgH.clustersOnNominalRow) { - x = mParam->tpcGeometry.Row2X(row); + if (cid >= mNMaxClusters) { + throw std::runtime_error("Cluster Buffer Size exceeded"); + } + float4* ptr = &mGlobalPos[cid]; + if (mParam->par.earlyTpcTransform) { + const auto& cl = mIOPtrs->clusterData[iSlice][i]; + mParam->Slice2Global(iSlice, (mCfgH.clustersOnNominalRow ? mParam->tpcGeometry.Row2X(row) : cl.x) + mCfgH.xAdd, cl.y, cl.z, &ptr->x, &ptr->y, &ptr->z); + } else { + float x, y, z; + const auto& cln = mIOPtrs->clustersNative->clusters[iSlice][0][i]; + GPUTPCConvertImpl::convert(*mCalib->fastTransform, *mParam, iSlice, row, cln.getPad(), cln.getTime(), x, y, z); + if (mCfgH.clustersOnNominalRow) { + x = mParam->tpcGeometry.Row2X(row); + } + mParam->Slice2Global(iSlice, x + mCfgH.xAdd, y, z, &ptr->x, &ptr->y, &ptr->z); } - mParam->Slice2Global(iSlice, x + mCfgH.xAdd, y, z, &ptr->x, &ptr->y, &ptr->z); + + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->z += iSlice < 18 ? 
mCfgH.zAdd : -mCfgH.zAdd; + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tCLUSTER; + } + } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }, tbb::simple_partitioner()); // clang-format on - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, mCurrentSpacePointsTRD, 32), float(mMaxClusterZ), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + int32_t trdTriggerRecord = -1; + float trdZoffset = 0; + for (int32_t i = r.begin(); i < r.end(); i++) { + while (mParam->par.continuousTracking && trdTriggerRecord < (int32_t)mIOPtrs->nTRDTriggerRecords - 1 && mIOPtrs->trdTrackletIdxFirst[trdTriggerRecord + 1] <= i) { + trdTriggerRecord++; // Note: this requires going through the data in order + float trdTime = mIOPtrs->trdTriggerTimes[trdTriggerRecord] * 1e3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; + trdZoffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, trdTime, mParam->continuousMaxTimeBin)); + } + const auto& sp = mIOPtrs->trdSpacePoints[i]; + int32_t iSec = trdGeometry()->GetSector(mIOPtrs->trdTracklets[i].GetDetector()); + float4* ptr = &mGlobalPosTRD[i]; + mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd, sp.getY(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); + ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); } - ptr->z += iSlice < 18 ? mCfgH.zAdd : -mCfgH.zAdd; ptr->x *= GL_SCALE_FACTOR; ptr->y *= GL_SCALE_FACTOR; ptr->z *= GL_SCALE_FACTOR; - ptr->w = tCLUSTER; - } - } - - int32_t trdTriggerRecord = -1; - float trdZoffset = 0; - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ) firstprivate(trdTriggerRecord, trdZoffset)) - for (int32_t i = 0; i < mCurrentSpacePointsTRD; i++) { - while (mParam->par.continuousTracking && trdTriggerRecord < (int32_t)mIOPtrs->nTRDTriggerRecords - 1 && mIOPtrs->trdTrackletIdxFirst[trdTriggerRecord + 1] <= i) { - trdTriggerRecord++; - float trdTime = mIOPtrs->trdTriggerTimes[trdTriggerRecord] * 1e3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; - trdZoffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, trdTime, mParam->continuousMaxTimeBin)); - } - const auto& sp = mIOPtrs->trdSpacePoints[i]; - int32_t iSec = trdGeometry()->GetSector(mIOPtrs->trdTracklets[i].GetDetector()); - float4* ptr = &mGlobalPosTRD[i]; - mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd, sp.getY(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); - ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); - } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTRDCLUSTER; - ptr = &mGlobalPosTRD2[i]; - mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd + 4.5f, sp.getY() + 1.5f * sp.getDy(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); - ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + ptr->w = tTRDCLUSTER; + ptr = &mGlobalPosTRD2[i]; + mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd + 4.5f, sp.getY() + 1.5f * sp.getDy(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); + ptr->z += ptr->z > 0 ? 
trdZoffset : -trdZoffset; + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tTRDCLUSTER; + } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTRDCLUSTER; - } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }, tbb::static_partitioner()); // clang-format on - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ)) - for (int32_t i = 0; i < mCurrentClustersTOF; i++) { - float4* ptr = &mGlobalPosTOF[i]; - mParam->Slice2Global(mIOPtrs->tofClusters[i].getSector(), mIOPtrs->tofClusters[i].getX() + mCfgH.xAdd, mIOPtrs->tofClusters[i].getY(), mIOPtrs->tofClusters[i].getZ(), &ptr->x, &ptr->y, &ptr->z); - float ZOffset = 0; - if (mParam->par.continuousTracking) { - float tofTime = mIOPtrs->tofClusters[i].getTime() * 1e-3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; - ZOffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, tofTime, mParam->continuousMaxTimeBin)); - ptr->z += ptr->z > 0 ? ZOffset : -ZOffset; - } - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, mCurrentClustersTOF, 32), float(mMaxClusterZ), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + for (int32_t i = r.begin(); i < r.end(); i++) { + float4* ptr = &mGlobalPosTOF[i]; + mParam->Slice2Global(mIOPtrs->tofClusters[i].getSector(), mIOPtrs->tofClusters[i].getX() + mCfgH.xAdd, mIOPtrs->tofClusters[i].getY(), mIOPtrs->tofClusters[i].getZ(), &ptr->x, &ptr->y, &ptr->z); + float ZOffset = 0; + if (mParam->par.continuousTracking) { + float tofTime = mIOPtrs->tofClusters[i].getTime() * 1e-3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; + ZOffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, tofTime, mParam->continuousMaxTimeBin)); + ptr->z += ptr->z > 0 ? 
ZOffset : -ZOffset; + } + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tTOFCLUSTER; } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTOFCLUSTER; - } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }); // clang-format on if (mCurrentClustersITS) { float itsROFhalfLen = 0; diff --git a/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h b/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h index 63673505f4732..23d382466ba22 100644 --- a/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h +++ b/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h @@ -458,7 +458,7 @@ const float positionScale = 100.0f; void main() { vec3 position = gl_in[0].gl_Position.xyz; - for(uint32_t i = 0; i < field_config.StepCount; ++i) { + for(uint32_t i = 0; i < field_config.StepCount; i++) { gl_Position = um.ModelViewProj * vec4(position/positionScale, 1.0f); EmitVertex(); const vec3 b_vec = Field(position); diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 70a093c7f1de7..015159fee24d7 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -76,6 +76,8 @@ #include "utils/qconfig.h" #include "utils/timer.h" +#include + using namespace o2::gpu; #ifdef GPUCA_MERGER_BY_MC_LABEL @@ -919,49 +921,48 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } #endif } else { - auto acc = GPUTPCTrkLbl(GetClusterLabels(), 1.f - mConfig.recThreshold); -#if QA_DEBUG == 0 - GPUCA_OPENMP(parallel for firstprivate(acc)) -#endif - for (uint32_t i = 0; i < nReconstructedTracks; i++) { - acc.reset(); - int32_t nClusters = 0; - const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[i]; - std::vector labels; - for (uint32_t k = 0; k < track.NClusters(); k++) { - if (mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].state & GPUTPCGMMergedTrackHit::flagReject) { - continue; - } - nClusters++; - uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; - if (hitId >= GetNMCLabels()) { - GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); - throw std::runtime_error("qa error"); - } - acc.addLabel(hitId); - for (int32_t j = 0; j < GetMCLabelNID(hitId); j++) { - if (GetMCLabelID(hitId, j) >= (int32_t)GetNMCTracks(GetMCLabelCol(hitId, j))) { - GPUError("Invalid label %d > %d (hit %d, label %d, col %d)", GetMCLabelID(hitId, j), GetNMCTracks(GetMCLabelCol(hitId, j)), hitId, j, (int32_t)GetMCLabelCol(hitId, j)); + tbb::parallel_for(tbb::blocked_range(0, nReconstructedTracks, (QA_DEBUG == 0) ? 
32 : nReconstructedTracks), [&](const tbb::blocked_range& range) { + auto acc = GPUTPCTrkLbl(GetClusterLabels(), 1.f - mConfig.recThreshold); + for (auto i = range.begin(); i < range.end(); i++) { + acc.reset(); + int32_t nClusters = 0; + const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[i]; + std::vector labels; + for (uint32_t k = 0; k < track.NClusters(); k++) { + if (mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].state & GPUTPCGMMergedTrackHit::flagReject) { + continue; + } + nClusters++; + uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; + if (hitId >= GetNMCLabels()) { + GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); throw std::runtime_error("qa error"); } - if (GetMCLabelID(hitId, j) >= 0) { - if (QA_DEBUG >= 3 && track.OK()) { - GPUInfo("Track %d Cluster %u Label %d: %d (%f)", i, k, j, GetMCLabelID(hitId, j), GetMCLabelWeight(hitId, j)); + acc.addLabel(hitId); + for (int32_t j = 0; j < GetMCLabelNID(hitId); j++) { + if (GetMCLabelID(hitId, j) >= (int32_t)GetNMCTracks(GetMCLabelCol(hitId, j))) { + GPUError("Invalid label %d > %d (hit %d, label %d, col %d)", GetMCLabelID(hitId, j), GetNMCTracks(GetMCLabelCol(hitId, j)), hitId, j, (int32_t)GetMCLabelCol(hitId, j)); + throw std::runtime_error("qa error"); + } + if (GetMCLabelID(hitId, j) >= 0) { + if (QA_DEBUG >= 3 && track.OK()) { + GPUInfo("Track %d Cluster %u Label %d: %d (%f)", i, k, j, GetMCLabelID(hitId, j), GetMCLabelWeight(hitId, j)); + } } } } - } - float maxweight, sumweight; - int32_t maxcount; - auto maxLabel = acc.computeLabel(&maxweight, &sumweight, &maxcount); - mTrackMCLabels[i] = maxLabel; - if (QA_DEBUG && track.OK() && GetNMCTracks(maxLabel) > (uint32_t)maxLabel.getTrackID()) { - const mcInfo_t& mc = GetMCTrack(maxLabel); - GPUInfo("Track %d label %d (fake %d) weight %f clusters %d (fitted %d) (%f%% %f%%) Pt %f", i, maxLabel.getTrackID(), (int32_t)(maxLabel.isFake()), maxweight, nClusters, track.NClustersFitted(), 100.f * maxweight / sumweight, 100.f * (float)maxcount / (float)nClusters, - std::sqrt(mc.pX * mc.pX + mc.pY * mc.pY)); + float maxweight, sumweight; + int32_t maxcount; + auto maxLabel = acc.computeLabel(&maxweight, &sumweight, &maxcount); + mTrackMCLabels[i] = maxLabel; + if (QA_DEBUG && track.OK() && GetNMCTracks(maxLabel) > (uint32_t)maxLabel.getTrackID()) { + const mcInfo_t& mc = GetMCTrack(maxLabel); + GPUInfo("Track %d label %d (fake %d) weight %f clusters %d (fitted %d) (%f%% %f%%) Pt %f", i, maxLabel.getTrackID(), (int32_t)(maxLabel.isFake()), maxweight, nClusters, track.NClustersFitted(), 100.f * maxweight / sumweight, 100.f * (float)maxcount / (float)nClusters, + std::sqrt(mc.pX * mc.pX + mc.pY * mc.pY)); + } } - } + }); } if (QA_TIMING || (mTracking && mTracking->GetProcessingSettings().debugLevel >= 3)) { GPUInfo("QA Time: Assign Track Labels:\t\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); @@ -1135,8 +1136,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } // Compute MC Track Parameters for MC Tracks - GPUCA_OPENMP(parallel for) - for (uint32_t iCol = 0; iCol < GetNMCCollissions(); iCol++) { + tbb::parallel_for(0, GetNMCCollissions(), [&](auto iCol) { for (uint32_t i = 0; i < GetNMCTracks(iCol); i++) { const mcInfo_t& info = GetMCTrack(i, iCol); additionalMCParameters& mc2 = mMCParam[iCol][i]; @@ -1153,8 +1153,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx 
std::vector& effBuffer = mcEffBuffer[mNEvents - 1]; effBuffer[i] = mRecTracks[iCol][i] * 1000 + mFakeTracks[iCol][i]; } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (QA_TIMING || (mTracking && mTracking->GetProcessingSettings().debugLevel >= 3)) { GPUInfo("QA Time: Compute track mc parameters:\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); }
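Two of the recurring conversion patterns in this patch, distilled into self-contained form. First, the outer-loop replacement used throughout GPUChainTrackingClusterizer.cxx and GPUChainTrackingSliceTracker.cxx: a temporary tbb::task_arena caps how many lanes may run concurrently (the role of the value returned by SetAndGetNActiveThreadsOuterLoop), tbb::parallel_for replaces the former "#pragma omp parallel for num_threads(...)", and tbb::simple_partitioner forces the range to be split down to one iteration per task. The snippet below is an illustration only, not code from the patch; processLane and the two constants are made up:

#include <oneapi/tbb.h>
#include <cstdint>
#include <cstdio>

static void processLane(int32_t lane) { std::printf("processing lane %d\n", lane); }

int main()
{
  const int32_t maxLane = 6; // number of lanes to process, like the clusterizer lanes
  const int32_t nActive = 3; // concurrency cap, like SetAndGetNActiveThreadsOuterLoop's result
  tbb::task_arena(nActive).execute([&] {
    tbb::parallel_for(0, maxLane, [&](int32_t lane) {
      processLane(lane); // note: "continue" in the old for-loops becomes "return" in the lambda
    }, tbb::simple_partitioner());
  });
  return 0;
}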
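Second, the reduction replacement used in GPUDisplayImportEvent.cxx: an OpenMP "reduction(max : mMaxClusterZ)" loop becomes tbb::parallel_reduce, where each subrange folds a local running maximum and the join step takes the larger of two partial results. Again a stand-alone sketch with made-up data, not code from the patch:

#include <oneapi/tbb.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> z(1000);
  for (size_t i = 0; i < z.size(); i++) {
    z[i] = std::sin(0.1f * i) * (i % 37); // arbitrary test data
  }
  float maxZ = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, z.size(), 32), 0.f,
    [&](const tbb::blocked_range<size_t>& r, float m) {
      for (size_t i = r.begin(); i < r.end(); i++) {
        m = std::max(m, std::fabs(z[i])); // per-subrange running maximum
      }
      return m; // partial result for this subrange
    },
    [](float a, float b) { return std::max(a, b); }); // join two partial maxima
  std::printf("max |z| = %f\n", maxZ);
  return 0;
}

One subtlety visible in the TRD loop above: state that used to be firstprivate (trdTriggerRecord, trdZoffset) is now re-derived inside each subrange by scanning forward from its start, so the result appears not to depend on how TBB partitions the range; the in-order comment in that hunk refers to this.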