diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx index e3522d2d7242d..64d1770c07247 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.cxx +++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx @@ -23,12 +23,9 @@ #include #include -#ifdef WITH_OPENMP -#include -#endif - #include "GPUReconstruction.h" #include "GPUReconstructionIncludes.h" +#include "GPUReconstructionThreading.h" #include "GPUROOTDumpCore.h" #include "GPUConfigDump.h" #include "GPUChainTracking.h" @@ -121,17 +118,12 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr* tr } } -int32_t GPUReconstruction::SetNOMPThreads(int32_t n) +void GPUReconstruction::SetNOMPThreads(int32_t n) { -#ifdef WITH_OPENMP - omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads))); + mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)); if (mProcessingSettings.debugLevel >= 3) { - GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n); + GPUInfo("Set number of parallel threads to %d (%d requested)", mProcessingSettings.ompThreads, n); } - return n > mMaxOMPThreads; -#else - return 1; -#endif } int32_t GPUReconstruction::Init() @@ -299,23 +291,15 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice() mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers); } -#ifdef WITH_OPENMP if (mProcessingSettings.ompThreads <= 0) { - mProcessingSettings.ompThreads = omp_get_max_threads(); + mProcessingSettings.ompThreads = tbb::info::default_concurrency(); } else { mProcessingSettings.ompAutoNThreads = false; - omp_set_num_threads(mProcessingSettings.ompThreads); } - if (mProcessingSettings.ompKernels) { - if (omp_get_max_active_levels() < 2) { - omp_set_max_active_levels(2); - } - } -#else - mProcessingSettings.ompThreads = 1; -#endif mMaxOMPThreads = mProcessingSettings.ompThreads; - mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads); + mThreading->control = std::make_unique(tbb::global_control::max_allowed_parallelism, mMaxOMPThreads); + mThreading->allThreads = std::make_unique(mMaxOMPThreads); + mMaxThreads = std::max(mMaxThreads, mMaxOMPThreads); if (IsGPU()) { mNStreams = std::max(mProcessingSettings.nStreams, 3); } diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h index 6fd00e1fda207..62bf9e7d1d973 100644 --- a/GPU/GPUTracking/Base/GPUReconstruction.h +++ b/GPU/GPUTracking/Base/GPUReconstruction.h @@ -51,6 +51,7 @@ namespace gpu class GPUChain; struct GPUMemorySizeScalers; struct GPUReconstructionPipelineContext; +struct GPUReconstructionThreading; class GPUROOTDumpCore; namespace gpu_reconstruction_kernels @@ -207,7 +208,7 @@ class GPUReconstruction void SetInputControl(void* ptr, size_t size); GPUOutputControl& OutputControl() { return mOutputControl; } int32_t GetMaxThreads() const { return mMaxThreads; } - int32_t SetNOMPThreads(int32_t n); + void SetNOMPThreads(int32_t n); int32_t NStreams() const { return mNStreams; } const void* DeviceMemoryBase() const { return mDeviceMemoryBase; } @@ -234,6 +235,8 @@ class GPUReconstruction double GetStatKernelTime() { return mStatKernelTime; } double GetStatWallTime() { return mStatWallTime; } + std::unique_ptr mThreading; + protected: void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool); void FreeRegisteredMemory(GPUMemoryResource* res); diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx index 187792b3ba2e7..7d960629b1b41 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx @@ -14,6 +14,7 @@ #include "GPUReconstructionCPU.h" #include "GPUReconstructionIncludes.h" +#include "GPUReconstructionThreading.h" #include "GPUChain.h" #include "GPUTPCClusterData.h" @@ -40,13 +41,6 @@ #include #endif -#if defined(WITH_OPENMP) || defined(_OPENMP) -#include -#else -static inline int32_t omp_get_thread_num() { return 0; } -static inline int32_t omp_get_max_threads() { return 1; } -#endif - using namespace o2::gpu; using namespace o2::gpu::gpu_reconstruction_kernels; @@ -111,24 +105,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS template <> inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { -#ifdef WITH_OPENMP int32_t nOMPThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNOMPThreads())); if (nOMPThreads > 1) { - GPUCA_OPENMP(parallel num_threads(nOMPThreads)) - { - size_t threadSize = size / omp_get_num_threads(); + tbb::parallel_for(0, nOMPThreads, [&](int iThread) { + size_t threadSize = size / nOMPThreads; if (threadSize % 4096) { threadSize += 4096 - threadSize % 4096; } - size_t offset = threadSize * omp_get_thread_num(); + size_t offset = threadSize * iThread; size_t mySize = std::min(threadSize, size - offset); if (mySize) { memset((char*)ptr + offset, 0, mySize); - } - } - } else -#endif - { + } // clang-format off + }, tbb::static_partitioner()); // clang-format on + } else { memset(ptr, 0, size); } return 0; @@ -353,7 +343,7 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes() int32_t GPUReconstructionCPUBackend::getOMPThreadNum() { - return omp_get_thread_num(); + return tbb::this_task_arena::current_thread_index(); } int32_t GPUReconstructionCPUBackend::getOMPMaxThreads() diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index ca1c46766b9da..510b6809480ca 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -40,6 +40,8 @@ #include "TPCBase/CRU.h" #include "DetectorsRaw/RDHUtils.h" +#include + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -1306,6 +1308,17 @@ size_t zsEncoderRun::compare(std::vector* buffer, std::vector void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr* outBuffer, uint32_t* outSizes, o2::raw::RawFileWriter* raw, const o2::InteractionRecord* ir, const GPUParam& param, int32_t version, bool verify, float threshold, bool padding, std::function&)> digitsFilter) { @@ -1316,64 +1329,65 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS]; - uint32_t totalPages = 0; - size_t totalSize = 0; - size_t nErrors = 0; - size_t digitsInput = 0; - size_t digitsEncoded = 0; - // clang-format off - GPUCA_OPENMP(parallel for reduction(+ : totalPages, nErrors, totalSize, digitsInput, digitsEncoded)) - // clang-format on - for (uint32_t i = 0; i < NSLICES; i++) { - std::vector tmpBuffer; - digitsInput += ZSEncoderGetNDigits(in, i); - tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); - if (threshold > 0.f && !digitsFilter) { - auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); - tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); - } else { - std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); - } - - if (digitsFilter) { - digitsFilter(tmpBuffer); - if (threshold > 0.f) { - std::vector tmpBuffer2 = std::move(tmpBuffer); - tmpBuffer = std::vector(tmpBuffer2.size()); - auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + tbb::parallel_reduce(tbb::blocked_range(0, NSLICES), o2::gpu::internal::tmpReductionResult(), [&](const auto range, const auto red) { + for (uint32_t i = r.begin(); i < r.end(); ++i) { + std::vector tmpBuffer; + red.digitsInput += ZSEncoderGetNDigits(in, i); + tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); + if (threshold > 0.f && !digitsFilter) { + auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } else { + std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); } - } - digitsEncoded += tmpBuffer.size(); - - auto runZS = [&](auto& encoder) { - encoder.zsVersion = version; - encoder.init(); - totalPages += encoder.run(buffer[i], tmpBuffer, &totalSize); - if (verify) { - nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + + if (digitsFilter) { + digitsFilter(tmpBuffer); + if (threshold > 0.f) { + std::vector tmpBuffer2 = std::move(tmpBuffer); + tmpBuffer = std::vector(tmpBuffer2.size()); + auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } } - }; + red.digitsEncoded += tmpBuffer.size(); + + auto runZS = [&](auto& encoder) { + encoder.zsVersion = version; + encoder.init(); + red.totalPages += encoder.run(buffer[i], tmpBuffer, &red.totalSize); + if (verify) { + red.nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + } + }; - if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { - zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; - runZS(enc); - } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { -#ifdef GPUCA_O2_LIB - if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { + zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; runZS(enc); - } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; - runZS(enc); - } + } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { +#ifdef GPUCA_O2_LIB + if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } #else - throw std::runtime_error("Link based ZS encoding not supported in standalone build"); + throw std::runtime_error("Link based ZS encoding not supported in standalone build"); #endif - } else { - throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } else { + throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } } - } + return red; }, [&](const auto& red1, const auto& red2) { + auto red = red1; + red.totalPages += red2.totalPages; + red.totalSize += red2.totalSize; + red.nErrors += red2.nErrors; + red.digitsInput += red2.digitsInput; + red.digitsEncoded += red2.digitsEncoded; + return red; }); if (outBuffer) { outBuffer->reset(new uint64_t[totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]); diff --git a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx index 94b16ae5a6936..c47bd488d96ef 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionLibrary.cxx @@ -22,10 +22,6 @@ #include #endif -#ifdef WITH_OPENMP -#include -#endif - #include "GPUReconstruction.h" #include "GPUReconstructionAvailableBackends.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionThreading.h b/GPU/GPUTracking/Base/GPUReconstructionThreading.h new file mode 100644 index 0000000000000..2d04877ea6105 --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionThreading.h @@ -0,0 +1,59 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionThreading.h +/// \author David Rohr + +#if !defined(GPURECONSTRUCTIONTHREADING_H) +#define GPURECONSTRUCTIONTHREADING_H + +#if !defined(GPUCA_GPUCODE) +#include "GPUReconstruction.h" + +#include +#include + +namespace o2::gpu +{ + +struct GPUReconstructionThreading { + std::unique_ptr control; + std::unique_ptr allThreads; + std::unique_ptr outerThreads; +}; + +} // namespace o2::gpu + +#endif + +#define GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + for (vartype varname = get_global_id(0); varname < iEnd; varname += get_global_size(0)) { \ + code \ + } + +#ifdef GPUCA_GPUCODE +#define GPUCA_TBB_KERNEL_LOOP GPUCA_TBB_KERNEL_LOOP_HOST +#else +#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, code) \ + if (!rec.GetProcessingSettings().ompKernels) { \ + rec.mThreading->allThreads->execute([&] { \ + tbb::parallel_for(tbb::blocked_range(get_global_id(0), iEnd, get_global_size(0)), [&](const tbb::blocked_range& _r_internal) { \ + for (vartype varname = _r_internal.begin(); varname < _r_internal.end(); varname += get_global_size(0)) { \ + code \ + } \ + }); \ + }); \ + } else { \ + GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + } +#endif + +#endif diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index bab5ff912c575..5bc1e6e4e6783 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -136,12 +136,7 @@ set_target_cuda_arch(${targetName}) #target_link_options(${targetName} PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld") #set_target_properties(${targetName} PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp") -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_CUDA_COMPILE_MODE) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 62ad57ae3497a..3bd3afc0ffc23 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -13,9 +13,6 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#ifdef WITH_OPENMP -#include -#endif #include "GPUReconstructionCUDA.h" #include "GPUParamRTC.h" #include "GPUDefMacros.h" @@ -25,6 +22,7 @@ #include #include +#include using namespace o2::gpu; #include "utils/qGetLdBinarySymbols.h" @@ -153,10 +151,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } HighResTimer rtcTimer; rtcTimer.ResetStart(); -#ifdef WITH_OPENMP -#pragma omp parallel for schedule(dynamic, 1) -#endif - for (uint32_t i = 0; i < nCompile; i++) { + tbb::parallel_for(0, nCompile, [&](auto i) { if (mProcessingSettings.debugLevel >= 3) { printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); } @@ -190,8 +185,8 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) printf("Source code file: %s", filename.c_str()); } throw std::runtime_error("Error during CUDA compilation"); - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (mProcessingSettings.debugLevel >= 0) { GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime()); } diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index d34777f0bef3e..10fbfa8d21ddf 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -189,11 +189,7 @@ endif() target_link_libraries(${targetName} PRIVATE hip::host hip::device hip::hipcub roc::rocthrust) set_target_hip_arch(${targetName}) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${MODULE}_CXX PRIVATE WITH_OPENMP) - target_link_libraries(${MODULE}_CXX PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${MODULE}_CXX PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_HIP_COMPILE_MODE) diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index a5d335931af37..3e738fb6df5cb 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -105,6 +105,7 @@ set(HDRS_INSTALL Base/GPUParam.inc Base/GPUParamRTC.h Base/GPUReconstructionIncludes.h + Base/GPUReconstructionThreading.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h Base/GPUReconstructionKernels.h @@ -378,12 +379,7 @@ if(GPUCA_QA) target_compile_definitions(${targetName} PRIVATE GPUCA_BUILD_QA) endif() -if(OpenMP_CXX_FOUND) - message(STATUS "GPU: Using OpenMP: ${OpenMP_CXX_SPEC_DATE}") - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) target_compile_options(${targetName} PRIVATE -Wno-instantiation-after-specialization) diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index 22641774cd9ee..5649edf9e1aab 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -22,6 +22,8 @@ #include #include "TPCClusterDecompressionCore.inc" +#include + using namespace o2::gpu; using namespace o2::tpc; @@ -82,8 +84,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom } clustersNative.clustersLinear = clusterBuffer; clustersNative.setOffsetPtrs(); - GPUCA_OPENMP(parallel for) - for (uint32_t i = 0; i < NSLICES; i++) { + tbb::parallel_for(0, NSLICES, [&](auto i) { for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { ClusterNative* buffer = &clusterBuffer[clustersNative.clusterOffset[i][j]]; if (clusters[i][j].size()) { @@ -108,7 +109,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom if (deterministicRec) { std::sort(buffer, buffer + clustersNative.nClusters[i][j]); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on return 0; } diff --git a/GPU/GPUTracking/Definitions/GPUDefMacros.h b/GPU/GPUTracking/Definitions/GPUDefMacros.h index b47401c9f05aa..dfd31347890d4 100644 --- a/GPU/GPUTracking/Definitions/GPUDefMacros.h +++ b/GPU/GPUTracking/Definitions/GPUDefMacros.h @@ -50,11 +50,7 @@ #define GPUCA_UNROLL(...) #endif -#if !defined(WITH_OPENMP) || defined(GPUCA_GPUCODE_DEVICE) #define GPUCA_OPENMP(...) -#else -#define GPUCA_OPENMP(...) _Pragma(GPUCA_M_STR(omp __VA_ARGS__)) -#endif #endif // clang-format on diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx index b6f11375328d0..4f654c0fa7beb 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx @@ -14,9 +14,7 @@ #include "GPUTPCGMMergerGPU.h" #include "GPUCommonAlgorithm.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -24,20 +22,18 @@ template <> GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger, int32_t mode) { const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NOutputTracks(); - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (int32_t ii = get_global_id(0); ii < iEnd; ii += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), int32_t, ii, iEnd, { const int32_t i = mode == -1 ? merger.RetryRefitIds()[ii] : mode ? merger.TrackOrderProcess()[ii] : ii; GPUTPCGMTrackParam::RefitTrack(merger.OutputTracks()[i], i, &merger, mode == -1); - } + }); } template <> GPUdii() void GPUTPCGMMergerFollowLoopers::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (uint32_t i = get_global_id(0); i < merger.Memory()->nLoopData; i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), uint32_t, i, merger.Memory()->nLoopData, { GPUTPCGMTrackParam::RefitLoop(&merger, i); - } + }); } template <> diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index 1f11f0bacffac..32cdb246cf417 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -70,13 +70,8 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic -Wl,--no-undefined") -# Check and set settings for optional packages -if(CONFIG_OPENMP) - find_package(OpenMP REQUIRED) -else() - set(OpenMP_FOUND OFF) - set(OpenMP_CXX_FOUND OFF) -endif() +# Find mandatory packages +find_package(TBB REQUIRED) if(GPUCA_CONFIG_VC) find_package(Vc REQUIRED) @@ -252,14 +247,6 @@ if(GPUCA_CONFIG_ROOT) endif() target_link_libraries(standalone_support PUBLIC Microsoft.GSL::GSL TPCFastTransformation) -if(OpenMP_CXX_FOUND) - target_link_libraries(ca PUBLIC OpenMP::OpenMP_CXX) - if (CMAKE_CXX_COMPILER STREQUAL "clang++") - target_link_libraries(ca PUBLIC -fopenmp) - target_link_libraries(GPUTracking PUBLIC -fopenmp) - endif() -endif() - # Installation install(TARGETS ca TPCFastTransformation standalone_support) install(FILES "cmake/makefile" DESTINATION "${CMAKE_INSTALL_PREFIX}") diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 97091d833efd8..87716d700abc8 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -15,7 +15,6 @@ set(ENABLE_CUDA AUTO) set(ENABLE_HIP AUTO) set(ENABLE_OPENCL AUTO) -set(CONFIG_OPENMP 1) set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx index c44b4c09a3d7a..fb58b9ab07fde 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx @@ -30,9 +30,6 @@ class GPUTPCGMPolynomialField; #ifndef GPUCA_GPUCODE #include "GPUMemoryResource.h" #include "GPUReconstruction.h" -#ifdef WITH_OPENMP -#include -#endif // WITH_OPENMP #include #include diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx index eb9eecfe6e846..d18f04e554043 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx @@ -16,9 +16,8 @@ #include "GPUTRDGeometry.h" #include "GPUConstantMem.h" #include "GPUCommonTypeTraits.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif + +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -33,10 +32,9 @@ GPUdii() void GPUTRDTrackerKernels::Thread(int32_t nBlocks, int32_t nThreads, in } } #endif - GPUCA_OPENMP(parallel for if(!trdTracker->GetRec().GetProcessingSettings().ompKernels) num_threads(trdTracker->GetRec().GetProcessingSettings().ompThreads)) - for (int32_t i = get_global_id(0); i < trdTracker->NTracks(); i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(trdTracker->GetRec(), int32_t, i, trdTracker->NTracks(), { trdTracker->DoTrackingThread(i, get_global_id(0)); - } + }); } #if !defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_DEVICE) // FIXME: DR: WORKAROUND to avoid CUDA bug creating host symbols for device code. diff --git a/GPU/GPUTracking/display/CMakeLists.txt b/GPU/GPUTracking/display/CMakeLists.txt index 2c1814a1a26a0..68385d7916234 100644 --- a/GPU/GPUTracking/display/CMakeLists.txt +++ b/GPU/GPUTracking/display/CMakeLists.txt @@ -219,7 +219,4 @@ if(GPUCA_EVENT_DISPLAY_QT) target_link_libraries(${targetName} PRIVATE Qt5::Widgets) endif() -if(OpenMP_CXX_FOUND) - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx index 918011b85ea04..e42a4fa3e4bf1 100644 --- a/GPU/GPUTracking/display/GPUDisplay.cxx +++ b/GPU/GPUTracking/display/GPUDisplay.cxx @@ -24,9 +24,6 @@ #ifndef _WIN32 #include "../utils/linux_helpers.h" #endif -#ifdef WITH_OPENMP -#include -#endif #include "GPUChainTracking.h" #include "GPUQA.h" diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx index 764f659d07e64..757d54bc6d37f 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx @@ -14,13 +14,12 @@ #include "GPUDisplay.h" -#ifdef WITH_OPENMP -#include -#endif #ifndef _WIN32 #include "bitmapfile.h" #endif +#include "oneapi/tbb.h" + using namespace o2::gpu; int32_t GPUDisplay::getNumThreads() @@ -28,11 +27,7 @@ int32_t GPUDisplay::getNumThreads() if (mChain) { return mChain->GetProcessingSettings().ompThreads; } else { -#ifdef WITH_OPENMP - return omp_get_max_threads(); -#else - return 1; -#endif + return tbb::info::default_concurrency(); } } diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx index 5d4628cf0eb3f..a9a7650e62645 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx @@ -36,9 +36,7 @@ #include "SimulationDataFormat/ConstMCTruthContainer.h" #include "GPUTrackParamConvert.h" -#ifdef WITH_OPENMP -#include -#endif +#include using namespace o2::gpu; @@ -776,26 +774,18 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLFinal[iSlice].resize(mNCollissions); } } - GPUCA_OPENMP(parallel num_threads(getNumThreads())) - { -#ifdef WITH_OPENMP + tbb::task_arena(getNumThreads()).execute([&] { int32_t numThread = omp_get_thread_num(); int32_t numThreads = omp_get_num_threads(); -#else - int32_t numThread = 0, numThreads = 1; -#endif if (mChain && (mChain->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking)) { - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { GPUTPCTracker& tracker = (GPUTPCTracker&)sliceTracker(iSlice); tracker.SetPointersDataLinks(tracker.LinkTmpMemory()); mGlDLLines[iSlice][tINITLINK] = DrawLinks(tracker, tINITLINK, true); - tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); - } - GPUCA_OPENMP(barrier) + tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); mGlDLLines[iSlice][tLINK] = DrawLinks(tracker, tLINK); @@ -805,16 +795,13 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLGrid[iSlice] = DrawGrid(tracker); if (iSlice < NSLICES / 2) { mGlDLGridTRD[iSlice] = DrawGridTRD(iSlice); - } - } - GPUCA_OPENMP(barrier) + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); - mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); - } - GPUCA_OPENMP(barrier) + mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); // clang-format off + }, tbb::simple_partitioner()); // clang-format on } mThreadTracks[numThread].resize(mNCollissions); for (int32_t i = 0; i < mNCollissions; i++) { @@ -827,8 +814,7 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() if (mConfig.showTPCTracksFromO2Format) { #ifdef GPUCA_TPC_GEOMETRY_O2 uint32_t col = 0; - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nOutputTracksTPCO2; i++) { + tbb::parallel_for(0, mIOPtrs->nOutputTracksTPCO2, [&](uint32_t i) { uint8_t sector, row; if (mIOPtrs->clustersNative) { mIOPtrs->outputTracksTPCO2[i].getCluster(mIOPtrs->outputClusRefsTPCO2, 0, *mIOPtrs->clustersNative, sector, row); @@ -839,11 +825,10 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() col = mQA->GetMCLabelCol(mIOPtrs->outputTracksTPCO2MC[i]); } mThreadTracks[numThread][col][sector][0].emplace_back(i); - } + }); #endif } else { - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nMergedTracks; i++) { + tbb::parallel_for(0, mIOPtrs->nMergedTracks, [&](uint32_t i) { const GPUTPCGMMergedTrack* track = &mIOPtrs->mergedTracks[i]; if (track->NClusters() == 0) { continue; @@ -864,11 +849,10 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() #endif } mThreadTracks[numThread][col][slice][0].emplace_back(i); - } + }); } for (uint32_t col = 0; col < mIOPtrs->nMCInfosTPCCol; col++) { - GPUCA_OPENMP(for) - for (uint32_t i = mIOPtrs->mcInfosTPCCol[col].first; i < mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num; i++) { + tbb::parallel_for(mIOPtrs->mcInfosTPCCol[col].first, mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num, [&](uint32_t i) { const GPUTPCMCInfo& mc = mIOPtrs->mcInfosTPC[i]; if (mc.charge == 0.f) { continue; @@ -886,17 +870,15 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() slice += 18; } mThreadTracks[numThread][col][slice][1].emplace_back(i); - } + }); } - GPUCA_OPENMP(barrier) GPUTPCGMPropagator prop; prop.SetMaxSinPhi(.999); prop.SetMaterialTPC(); prop.SetPolynomialField(&mParam->polynomialField); - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mThreadBuffers[numThread].clear(); for (int32_t iSet = 0; iSet < numThreads; iSet++) { @@ -915,18 +897,16 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() } list[i] = vboList(startCount, mVertexBufferStart[iSlice].size() - startCount, iSlice); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(barrier) - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { for (int32_t i = 0; i < N_POINTS_TYPE_TPC; i++) { for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mGlDLPoints[iSlice][i][iCol] = DrawClusters(iSlice, i, iCol); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on } // End omp parallel diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 70a093c7f1de7..651a2152235c2 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -76,6 +76,8 @@ #include "utils/qconfig.h" #include "utils/timer.h" +#include + using namespace o2::gpu; #ifdef GPUCA_MERGER_BY_MC_LABEL @@ -1135,8 +1137,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } // Compute MC Track Parameters for MC Tracks - GPUCA_OPENMP(parallel for) - for (uint32_t iCol = 0; iCol < GetNMCCollissions(); iCol++) { + tbb::parallel_for(0, GetNMCCollissions(), [&](uint32_t iCol) { for (uint32_t i = 0; i < GetNMCTracks(iCol); i++) { const mcInfo_t& info = GetMCTrack(i, iCol); additionalMCParameters& mc2 = mMCParam[iCol][i]; @@ -1153,8 +1154,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx std::vector& effBuffer = mcEffBuffer[mNEvents - 1]; effBuffer[i] = mRecTracks[iCol][i] * 1000 + mFakeTracks[iCol][i]; } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (QA_TIMING || (mTracking && mTracking->GetProcessingSettings().debugLevel >= 3)) { GPUInfo("QA Time: Compute track mc parameters:\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); }