From b21fbd87899ba4a2571a462772e97647d210c736 Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Fri, 21 Feb 2025 15:06:02 +0100
Subject: [PATCH] GPU: Replace OpenMP parallelization with TBB

---
 GPU/GPUTracking/Base/GPUReconstruction.cxx    |  79 ++--
 GPU/GPUTracking/Base/GPUReconstruction.h      |  19 +-
 GPU/GPUTracking/Base/GPUReconstructionCPU.cxx |  93 ++---
 GPU/GPUTracking/Base/GPUReconstructionCPU.h   |  18 +-
 .../Base/GPUReconstructionConvert.cxx         | 122 +++---
 .../Base/GPUReconstructionLibrary.cxx         |   4 -
 .../Base/GPUReconstructionThreading.h         |  60 +++
 GPU/GPUTracking/Base/cuda/CMakeLists.txt      |   7 +-
 .../Base/cuda/GPUReconstructionCUDA.cu        |   4 +-
 .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx |  13 +-
 GPU/GPUTracking/Base/hip/CMakeLists.txt       |   6 +-
 .../Base/opencl/GPUReconstructionOCL.cxx      |   4 +-
 GPU/GPUTracking/CMakeLists.txt                |   8 +-
 .../GPUTPCClusterStatistics.cxx               |   4 +-
 .../TPCClusterDecompressor.cxx                |  34 +-
 ...andalone-cluster-dump-entropy-analysed.cxx |   4 +-
 GPU/GPUTracking/Definitions/GPUDefMacros.h    |   6 -
 GPU/GPUTracking/Definitions/GPUSettingsList.h |   7 +-
 GPU/GPUTracking/Global/GPUChainTracking.cxx   |  12 +-
 .../Global/GPUChainTrackingClusterizer.cxx    | 395 +++++++++---------
 .../Global/GPUChainTrackingCompression.cxx    |   2 +-
 .../Global/GPUChainTrackingSliceTracker.cxx   | 194 ++++-----
 GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx  |  14 +-
 GPU/GPUTracking/SliceTracker/GPUTPCDef.h      |   2 +-
 .../SliceTracker/GPUTPCSliceData.cxx          |   4 +-
 .../SliceTracker/GPUTPCTracker.cxx            |   2 +-
 .../Standalone/Benchmark/standalone.cxx       |   8 +-
 GPU/GPUTracking/Standalone/CMakeLists.txt     |  17 +-
 GPU/GPUTracking/Standalone/cmake/config.cmake |   1 -
 GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx |  11 +-
 GPU/GPUTracking/TRDTracking/GPUTRDTracker.h   |   2 +-
 .../TRDTracking/GPUTRDTrackerKernels.cxx      |  10 +-
 GPU/GPUTracking/display/CMakeLists.txt        |   5 +-
 GPU/GPUTracking/display/GPUDisplay.cxx        |   3 -
 GPU/GPUTracking/display/GPUDisplay.h          |   2 +-
 .../display/helpers/GPUDisplayHelpers.cxx     |  13 +-
 .../display/render/GPUDisplayDraw.cxx         | 110 ++---
 .../display/render/GPUDisplayImportEvent.cxx  | 178 ++++----
 .../display/shaders/GPUDisplayShaders.h       |   2 +-
 GPU/GPUTracking/qa/GPUQA.cxx                  |  80 ++--
 40 files changed, 789 insertions(+), 770 deletions(-)
 create mode 100644 GPU/GPUTracking/Base/GPUReconstructionThreading.h

diff --git a/GPU/GPUTracking/Base/GPUReconstruction.cxx b/GPU/GPUTracking/Base/GPUReconstruction.cxx
index e3522d2d7242d..481494f268494 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstruction.cxx
@@ -23,12 +23,9 @@
 #include 
 #include 
 
-#ifdef WITH_OPENMP
-#include <omp.h>
-#endif
-
 #include "GPUReconstruction.h"
 #include "GPUReconstructionIncludes.h"
+#include "GPUReconstructionThreading.h"
 #include "GPUROOTDumpCore.h"
 #include "GPUConfigDump.h"
 #include "GPUChainTracking.h"
@@ -121,17 +118,18 @@ void GPUReconstruction::GetITSTraits(std::unique_ptr* tr
   }
 }
 
-int32_t GPUReconstruction::SetNOMPThreads(int32_t n)
+void GPUReconstruction::SetNActiveThreads(int32_t n)
 {
-#ifdef WITH_OPENMP
-  omp_set_num_threads(mProcessingSettings.ompThreads = std::max(1, n < 0 ? mMaxOMPThreads : std::min(n, mMaxOMPThreads)));
+  mActiveHostKernelThreads = std::max(1, n < 0 ? mMaxHostThreads : std::min(n, mMaxHostThreads));
+  mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
   if (mProcessingSettings.debugLevel >= 3) {
-    GPUInfo("Set number of OpenMP threads to %d (%d requested)", mProcessingSettings.ompThreads, n);
+    GPUInfo("Set number of active parallel kernel threads on host to %d (%d requested)", mActiveHostKernelThreads, n);
   }
-  return n > mMaxOMPThreads;
-#else
-  return 1;
-#endif
+}
+
+int32_t GPUReconstruction::getHostThreadIndex()
+{
+  return std::max(0, tbb::this_task_arena::current_thread_index());
+}
 
 int32_t GPUReconstruction::Init()
@@ -197,6 +195,24 @@ int32_t GPUReconstruction::Init()
   return 0;
 }
 
+namespace o2::gpu::internal
+{
+static uint32_t getDefaultNThreads()
+{
+  const char* tbbEnv = getenv("TBB_NUM_THREADS");
+  uint32_t tbbNum = tbbEnv ? atoi(tbbEnv) : 0;
+  if (tbbNum) {
+    return tbbNum;
+  }
+  const char* ompEnv = getenv("OMP_NUM_THREADS");
+  uint32_t ompNum = ompEnv ? atoi(ompEnv) : 0;
+  if (ompNum) {
+    return ompNum;
+  }
+  return tbb::info::default_concurrency();
+}
+} // namespace o2::gpu::internal
+
 int32_t GPUReconstruction::InitPhaseBeforeDevice()
 {
   if (mProcessingSettings.printSettings) {
@@ -299,32 +315,37 @@ int32_t GPUReconstruction::InitPhaseBeforeDevice()
     mMemoryScalers->rescaleMaxMem(mProcessingSettings.forceMaxMemScalers);
   }
 
-#ifdef WITH_OPENMP
-  if (mProcessingSettings.ompThreads <= 0) {
-    mProcessingSettings.ompThreads = omp_get_max_threads();
-  } else {
-    mProcessingSettings.ompAutoNThreads = false;
-    omp_set_num_threads(mProcessingSettings.ompThreads);
+  if (mProcessingSettings.nHostThreads != -1 && mProcessingSettings.ompThreads != -1) {
+    GPUFatal("Must not use both nHostThreads and ompThreads at the same time!");
+  } else if (mProcessingSettings.ompThreads != -1) {
+    mProcessingSettings.nHostThreads = mProcessingSettings.ompThreads;
+    GPUWarning("You are using the deprecated ompThreads option, please switch to nHostThreads!");
   }
-  if (mProcessingSettings.ompKernels) {
-    if (omp_get_max_active_levels() < 2) {
-      omp_set_max_active_levels(2);
-    }
+
+  if (mProcessingSettings.nHostThreads <= 0) {
+    mProcessingSettings.nHostThreads = internal::getDefaultNThreads();
+  } else {
+    mProcessingSettings.autoAdjustHostThreads = false;
+  }
+  mMaxHostThreads = mActiveHostKernelThreads = mProcessingSettings.nHostThreads;
+  if (mMaster == nullptr) {
+    mThreading = std::make_shared<GPUReconstructionThreading>();
+    mThreading->control = std::make_unique<tbb::global_control>(tbb::global_control::max_allowed_parallelism, mMaxHostThreads);
+    mThreading->allThreads = std::make_unique<tbb::task_arena>(mMaxHostThreads);
+    mThreading->activeThreads = std::make_unique<tbb::task_arena>(mActiveHostKernelThreads);
+  } else {
+    mThreading = mMaster->mThreading;
   }
-#else
-  mProcessingSettings.ompThreads = 1;
-#endif
-  mMaxOMPThreads = mProcessingSettings.ompThreads;
-  mMaxThreads = std::max(mMaxThreads, mProcessingSettings.ompThreads);
+  mMaxBackendThreads = std::max(mMaxBackendThreads, mMaxHostThreads);
   if (IsGPU()) {
     mNStreams = std::max(mProcessingSettings.nStreams, 3);
   }
 
   if (mProcessingSettings.nTPCClustererLanes == -1) {
-    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSLICES, mProcessingSettings.ompKernels ? (mProcessingSettings.ompThreads >= 4 ? std::min(mProcessingSettings.ompThreads / 2, mProcessingSettings.ompThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mProcessingSettings.ompThreads));
+    mProcessingSettings.nTPCClustererLanes = (GetRecoStepsGPU() & RecoStep::TPCClusterFinding) ? 3 : std::max(1, std::min(GPUCA_NSLICES, mProcessingSettings.inKernelParallel ? (mMaxHostThreads >= 4 ? std::min(mMaxHostThreads / 2, mMaxHostThreads >= 32 ? GPUCA_NSLICES : 4) : 1) : mMaxHostThreads));
   }
   if (mProcessingSettings.overrideClusterizerFragmentLen == -1) {
-    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mProcessingSettings.ompThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
+    mProcessingSettings.overrideClusterizerFragmentLen = ((GetRecoStepsGPU() & RecoStep::TPCClusterFinding) || (mMaxHostThreads / mProcessingSettings.nTPCClustererLanes >= 3)) ? TPC_MAX_FRAGMENT_LEN_GPU : TPC_MAX_FRAGMENT_LEN_HOST;
   }
   if (mProcessingSettings.nTPCClustererLanes > GPUCA_NSLICES) {
    GPUError("Invalid value for nTPCClustererLanes: %d", mProcessingSettings.nTPCClustererLanes);
diff --git a/GPU/GPUTracking/Base/GPUReconstruction.h b/GPU/GPUTracking/Base/GPUReconstruction.h
index 6fd00e1fda207..1fdfabb11211a 100644
--- a/GPU/GPUTracking/Base/GPUReconstruction.h
+++ b/GPU/GPUTracking/Base/GPUReconstruction.h
@@ -51,6 +51,7 @@ namespace gpu
 class GPUChain;
 struct GPUMemorySizeScalers;
 struct GPUReconstructionPipelineContext;
+struct GPUReconstructionThreading;
 class GPUROOTDumpCore;
 
 namespace gpu_reconstruction_kernels
@@ -206,8 +207,8 @@ class GPUReconstruction
   void SetOutputControl(void* ptr, size_t size);
   void SetInputControl(void* ptr, size_t size);
   GPUOutputControl& OutputControl() { return mOutputControl; }
-  int32_t GetMaxThreads() const { return mMaxThreads; }
-  int32_t SetNOMPThreads(int32_t n);
+  int32_t GetMaxBackendThreads() const { return mMaxBackendThreads; }
+  void SetNActiveThreads(int32_t n);
   int32_t NStreams() const { return mNStreams; }
 
   const void* DeviceMemoryBase() const { return mDeviceMemoryBase; }
@@ -234,6 +235,9 @@ class GPUReconstruction
   double GetStatKernelTime() { return mStatKernelTime; }
   double GetStatWallTime() { return mStatWallTime; }
 
+  std::shared_ptr<GPUReconstructionThreading> mThreading;
+  static int32_t getHostThreadIndex();
+
 protected:
   void AllocateRegisteredMemoryInternal(GPUMemoryResource* res, GPUOutputControl* control, GPUReconstruction* recPool);
   void FreeRegisteredMemory(GPUMemoryResource* res);
@@ -343,11 +347,12 @@ class GPUReconstruction
   std::shared_ptr mROOTDump;
   std::vector>* mOutputErrorCodes = nullptr;
 
-  int32_t mMaxThreads = 0;    // Maximum number of threads that may be running, on CPU or GPU
-  int32_t mThreadId = -1;     // Thread ID that is valid for the local CUDA context
-  int32_t mGPUStuck = 0;      // Marks that the GPU is stuck, skip future events
-  int32_t mNStreams = 1;      // Number of parallel GPU streams
-  int32_t mMaxOMPThreads = 0; // Maximum number of OMP threads
+  int32_t mMaxBackendThreads = 0;       // Maximum number of threads that may be running, on CPU or GPU
+  int32_t mThreadId = -1;               // Thread ID that is valid for the local CUDA context
+  int32_t mGPUStuck = 0;                // Marks that the GPU is stuck, skip future events
+  int32_t mNStreams = 1;                // Number of parallel GPU streams
+  int32_t mMaxHostThreads = 0;          // Maximum number of host threads
+  int32_t mActiveHostKernelThreads = 0; // Number of currently active threads on the host for kernels
 
   // Management for GPUProcessors
   struct ProcessorData {
diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
index 187792b3ba2e7..b5f9d591fd9a6 100644
--- a/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
+++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.cxx
@@ -14,6 +14,7 @@
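For reference, the threading model set up above replaces omp_set_num_threads() with three TBB objects: a tbb::global_control capping the total concurrency of the process, a task_arena spanning all host threads, and a smaller task_arena for the currently active kernel threads. The following minimal standalone sketch shows this pattern; it is not part of the patch, and maxThreads/activeThreads merely stand in for mMaxHostThreads/mActiveHostKernelThreads:

#include <tbb/global_control.h>
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>
#include <cstdio>

int main()
{
  const int maxThreads = 8;    // placeholder for mMaxHostThreads
  const int activeThreads = 4; // placeholder for mActiveHostKernelThreads
  // Process-wide cap: no TBB algorithm may use more than maxThreads threads.
  tbb::global_control control(tbb::global_control::max_allowed_parallelism, maxThreads);
  // Arena restricting one particular parallel region to the active subset.
  tbb::task_arena activeArena(activeThreads);
  activeArena.execute([&] {
    tbb::parallel_for(0, 16, [](int i) { std::printf("item %d\n", i); });
  });
  return 0;
}

Unlike omp_set_num_threads(), which mutates global state, shrinking or growing the active set here only requires recreating the activeThreads arena, while the global cap stays fixed.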
#include "GPUReconstructionCPU.h" #include "GPUReconstructionIncludes.h" +#include "GPUReconstructionThreading.h" #include "GPUChain.h" #include "GPUTPCClusterData.h" @@ -40,13 +41,6 @@ #include #endif -#if defined(WITH_OPENMP) || defined(_OPENMP) -#include -#else -static inline int32_t omp_get_thread_num() { return 0; } -static inline int32_t omp_get_max_threads() { return 1; } -#endif - using namespace o2::gpu; using namespace o2::gpu::gpu_reconstruction_kernels; @@ -60,19 +54,21 @@ GPUReconstructionCPU::~GPUReconstructionCPU() Exit(); // Needs to be identical to GPU backend bahavior in order to avoid calling abstract methods later in the destructor } -int32_t GPUReconstructionCPUBackend::getNOMPThreads() +int32_t GPUReconstructionCPUBackend::getNKernelHostThreads(bool splitCores) { - int32_t ompThreads = 0; - if (mProcessingSettings.ompKernels == 2) { - ompThreads = mProcessingSettings.ompThreads / mNestedLoopOmpFactor; - if ((uint32_t)getOMPThreadNum() < mProcessingSettings.ompThreads % mNestedLoopOmpFactor) { - ompThreads++; + int32_t nThreads = 0; + if (mProcessingSettings.inKernelParallel == 2 && mNActiveThreadsOuterLoop) { + if (splitCores) { + nThreads = mMaxHostThreads / mNActiveThreadsOuterLoop; + nThreads += (uint32_t)getHostThreadIndex() < mMaxHostThreads % mNActiveThreadsOuterLoop; + } else { + nThreads = mMaxHostThreads; } - ompThreads = std::max(1, ompThreads); + nThreads = std::max(1, nThreads); } else { - ompThreads = mProcessingSettings.ompKernels ? mProcessingSettings.ompThreads : 1; + nThreads = mProcessingSettings.inKernelParallel ? mMaxHostThreads : 1; } - return ompThreads; + return nThreads; } template @@ -88,16 +84,19 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS } uint32_t num = y.num == 0 || y.num == -1 ? 
1 : y.num; for (uint32_t k = 0; k < num; k++) { - int32_t ompThreads = getNOMPThreads(); - if (ompThreads > 1) { + int32_t nThreads = getNKernelHostThreads(false); + if (nThreads > 1) { if (mProcessingSettings.debugLevel >= 5) { - printf("Running %d ompThreads\n", ompThreads); - } - GPUCA_OPENMP(parallel for num_threads(ompThreads)) - for (uint32_t iB = 0; iB < x.nBlocks; iB++) { - typename T::GPUSharedMemory smem; - T::template Thread(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...); + printf("Running %d Threads\n", nThreads); } + mThreading->activeThreads->execute([&] { + tbb::parallel_for(tbb::blocked_range(0, x.nBlocks, 1), [&](const tbb::blocked_range& r) { + typename T::GPUSharedMemory smem; + for (uint32_t iB = r.begin(); iB < r.end(); iB++) { + T::template Thread(x.nBlocks, 1, iB, 0, smem, T::Processor(*mHostConstantMem)[y.start + k], args...); + } + }); + }); } else { for (uint32_t iB = 0; iB < x.nBlocks; iB++) { typename T::GPUSharedMemory smem; @@ -111,24 +110,20 @@ inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlS template <> inline int32_t GPUReconstructionCPUBackend::runKernelBackendInternal(const krnlSetupTime& _xyz, void* const& ptr, uint64_t const& size) { -#ifdef WITH_OPENMP - int32_t nOMPThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNOMPThreads())); - if (nOMPThreads > 1) { - GPUCA_OPENMP(parallel num_threads(nOMPThreads)) - { - size_t threadSize = size / omp_get_num_threads(); + int32_t nnThreads = std::max(1, std::min(size / (16 * 1024 * 1024), getNKernelHostThreads(true))); + if (nnThreads > 1) { + tbb::parallel_for(0, nnThreads, [&](int iThread) { + size_t threadSize = size / nnThreads; if (threadSize % 4096) { threadSize += 4096 - threadSize % 4096; } - size_t offset = threadSize * omp_get_thread_num(); + size_t offset = threadSize * iThread; size_t mySize = std::min(threadSize, size - offset); if (mySize) { memset((char*)ptr + offset, 0, mySize); - } - } - } else -#endif - { + } // clang-format off + }, tbb::static_partitioner()); // clang-format on + } else { memset(ptr, 0, size); } return 0; @@ -213,8 +208,8 @@ int32_t GPUReconstructionCPU::InitDevice() mHostMemoryPermanent = mHostMemoryBase; ClearAllocatedMemory(); } - if (mProcessingSettings.ompKernels) { - mBlockCount = getOMPMaxThreads(); + if (mProcessingSettings.inKernelParallel) { + mBlockCount = mMaxHostThreads; } mThreadId = GetThread(); mProcShadow.mProcessorsProc = processors(); @@ -351,16 +346,6 @@ void GPUReconstructionCPU::ResetDeviceProcessorTypes() } } -int32_t GPUReconstructionCPUBackend::getOMPThreadNum() -{ - return omp_get_thread_num(); -} - -int32_t GPUReconstructionCPUBackend::getOMPMaxThreads() -{ - return omp_get_max_threads(); -} - static std::atomic_flag timerFlag = ATOMIC_FLAG_INIT; // TODO: Should be a class member not global, but cannot be moved to header due to ROOT limitation GPUReconstructionCPU::timerMeta* GPUReconstructionCPU::insertTimer(uint32_t id, std::string&& name, int32_t J, int32_t num, int32_t type, RecoStep step) @@ -402,17 +387,17 @@ uint32_t GPUReconstructionCPU::getNextTimerId() return id.fetch_add(1); } -uint32_t GPUReconstructionCPU::SetAndGetNestedLoopOmpFactor(bool condition, uint32_t max) +uint32_t GPUReconstructionCPU::SetAndGetNActiveThreadsOuterLoop(bool condition, uint32_t max) { - if (condition && mProcessingSettings.ompKernels != 1) { - mNestedLoopOmpFactor = mProcessingSettings.ompKernels == 2 ? 
std::min(max, mProcessingSettings.ompThreads) : mProcessingSettings.ompThreads; + if (condition && mProcessingSettings.inKernelParallel != 1) { + mNActiveThreadsOuterLoop = mProcessingSettings.inKernelParallel == 2 ? std::min(max, mMaxHostThreads) : mMaxHostThreads; } else { - mNestedLoopOmpFactor = 1; + mNActiveThreadsOuterLoop = 1; } if (mProcessingSettings.debugLevel >= 5) { - printf("Running %d OMP threads in outer loop\n", mNestedLoopOmpFactor); + printf("Running %d threads in outer loop\n", mNActiveThreadsOuterLoop); } - return mNestedLoopOmpFactor; + return mNActiveThreadsOuterLoop; } void GPUReconstructionCPU::UpdateParamOccupancyMap(const uint32_t* mapHost, const uint32_t* mapGPU, uint32_t occupancyTotal, int32_t stream) diff --git a/GPU/GPUTracking/Base/GPUReconstructionCPU.h b/GPU/GPUTracking/Base/GPUReconstructionCPU.h index 7903be44907df..f82f481df6a63 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionCPU.h +++ b/GPU/GPUTracking/Base/GPUReconstructionCPU.h @@ -43,10 +43,8 @@ class GPUReconstructionCPUBackend : public GPUReconstruction int32_t runKernelBackendInternal(const gpu_reconstruction_kernels::krnlSetupTime& _xyz, const Args&... args); template gpu_reconstruction_kernels::krnlProperties getKernelPropertiesBackend(); - uint32_t mNestedLoopOmpFactor = 1; - static int32_t getOMPThreadNum(); - static int32_t getOMPMaxThreads(); - int32_t getNOMPThreads(); + uint32_t mNActiveThreadsOuterLoop = 1; + int32_t getNKernelHostThreads(bool splitCores); }; class GPUReconstructionCPU : public GPUReconstructionKernels @@ -81,8 +79,8 @@ class GPUReconstructionCPU : public GPUReconstructionKernels= 1) { - t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? getOMPThreadNum() : stream); - if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNestedLoopOmpFactor < 2 || getOMPThreadNum() == 0)) { + t = &getKernelTimer(myStep, !IsGPU() || cpuFallback ? 
getHostThreadIndex() : stream); + if ((!mProcessingSettings.deviceTimers || !IsGPU() || cpuFallback) && (mNActiveThreadsOuterLoop < 2 || getHostThreadIndex() == 0)) { t->Start(); } } @@ -287,11 +285,11 @@ HighResTimer& GPUReconstructionCPU::getTimer(const char* name, int32_t num) static int32_t id = getNextTimerId(); timerMeta* timer = getTimerById(id); if (timer == nullptr) { - int32_t max = std::max({getOMPMaxThreads(), mProcessingSettings.nStreams}); + int32_t max = std::max({mMaxHostThreads, mProcessingSettings.nStreams}); timer = insertTimer(id, name, J, max, 1, RecoStep::NoRecoStep); } if (num == -1) { - num = getOMPThreadNum(); + num = getHostThreadIndex(); } if (num < 0 || num >= timer->num) { throw std::runtime_error("Invalid timer requested"); diff --git a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx index ca1c46766b9da..629d23075d9bc 100644 --- a/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx +++ b/GPU/GPUTracking/Base/GPUReconstructionConvert.cxx @@ -40,6 +40,8 @@ #include "TPCBase/CRU.h" #include "DetectorsRaw/RDHUtils.h" +#include + using namespace o2::gpu; using namespace o2::tpc; using namespace o2::tpc::constants; @@ -1306,6 +1308,17 @@ size_t zsEncoderRun::compare(std::vector* buffer, std::vector void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr* outBuffer, uint32_t* outSizes, o2::raw::RawFileWriter* raw, const o2::InteractionRecord* ir, const GPUParam& param, int32_t version, bool verify, float threshold, bool padding, std::function&)> digitsFilter) { @@ -1316,67 +1329,68 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr buffer[NSLICES][GPUTrackingInOutZS::NENDPOINTS]; - uint32_t totalPages = 0; - size_t totalSize = 0; - size_t nErrors = 0; - size_t digitsInput = 0; - size_t digitsEncoded = 0; - // clang-format off - GPUCA_OPENMP(parallel for reduction(+ : totalPages, nErrors, totalSize, digitsInput, digitsEncoded)) - // clang-format on - for (uint32_t i = 0; i < NSLICES; i++) { - std::vector tmpBuffer; - digitsInput += ZSEncoderGetNDigits(in, i); - tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); - if (threshold > 0.f && !digitsFilter) { - auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); - tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); - } else { - std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); - } - - if (digitsFilter) { - digitsFilter(tmpBuffer); - if (threshold > 0.f) { - std::vector tmpBuffer2 = std::move(tmpBuffer); - tmpBuffer = std::vector(tmpBuffer2.size()); - auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + auto reduced = tbb::parallel_reduce(tbb::blocked_range(0, NSLICES), o2::gpu::internal::tmpReductionResult(), [&](const auto range, auto red) { + for (uint32_t i = range.begin(); i < range.end(); i++) { + std::vector tmpBuffer; + red.digitsInput += ZSEncoderGetNDigits(in, i); + tmpBuffer.resize(ZSEncoderGetNDigits(in, i)); + if (threshold > 0.f && !digitsFilter) { + auto it = std::copy_if(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } else { + 
std::copy(ZSEncoderGetDigits(in, i), ZSEncoderGetDigits(in, i) + ZSEncoderGetNDigits(in, i), tmpBuffer.begin()); } - } - digitsEncoded += tmpBuffer.size(); - - auto runZS = [&](auto& encoder) { - encoder.zsVersion = version; - encoder.init(); - totalPages += encoder.run(buffer[i], tmpBuffer, &totalSize); - if (verify) { - nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + + if (digitsFilter) { + digitsFilter(tmpBuffer); + if (threshold > 0.f) { + std::vector tmpBuffer2 = std::move(tmpBuffer); + tmpBuffer = std::vector(tmpBuffer2.size()); + auto it = std::copy_if(tmpBuffer2.begin(), tmpBuffer2.end(), tmpBuffer.begin(), [threshold](auto& v) { return v.getChargeFloat() >= threshold; }); + tmpBuffer.resize(std::distance(tmpBuffer.begin(), it)); + } } - }; + red.digitsEncoded += tmpBuffer.size(); + + auto runZS = [&](auto& encoder) { + encoder.zsVersion = version; + encoder.init(); + red.totalPages += encoder.run(buffer[i], tmpBuffer, &red.totalSize); + if (verify) { + red.nErrors += encoder.compare(buffer[i], tmpBuffer); // Verification + } + }; - if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { - zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; - runZS(enc); - } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { -#ifdef GPUCA_O2_LIB - if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + if (version >= ZSVersion::ZSVersionRowBased10BitADC && version <= ZSVersion::ZSVersionRowBased12BitADC) { + zsEncoderRun enc{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}; runZS(enc); - } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { - zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; - runZS(enc); - } + } else if (version >= ZSVersion::ZSVersionLinkBasedWithMeta && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { +#ifdef GPUCA_O2_LIB + if (version == ZSVersion::ZSVersionLinkBasedWithMeta) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } else if (version >= ZSVersion::ZSVersionDenseLinkBased && version <= ZSVersion::ZSVersionDenseLinkBasedV2) { + zsEncoderRun enc{{{{.iSector = i, .raw = raw, .ir = ir, .param = ¶m, .padding = padding}}}}; + runZS(enc); + } #else - throw std::runtime_error("Link based ZS encoding not supported in standalone build"); + throw std::runtime_error("Link based ZS encoding not supported in standalone build"); #endif - } else { - throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } else { + throw std::runtime_error("Invalid ZS version "s + std::to_string(version) + ", cannot decode"s); + } } - } + return red; }, [&](const auto& red1, const auto& red2) { + auto red = red1; + red.totalPages += red2.totalPages; + red.totalSize += red2.totalSize; + red.nErrors += red2.nErrors; + red.digitsInput += red2.digitsInput; + red.digitsEncoded += red2.digitsEncoded; + return red; }); if (outBuffer) { - outBuffer->reset(new uint64_t[totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]); + outBuffer->reset(new uint64_t[reduced.totalPages * TPCZSHDR::TPC_ZS_PAGE_SIZE / sizeof(uint64_t)]); uint64_t offset = 0; for (uint32_t i = 0; i < NSLICES; i++) { for (uint32_t j = 0; j < 
GPUTrackingInOutZS::NENDPOINTS; j++) { @@ -1386,12 +1400,12 @@ void GPUReconstructionConvert::RunZSEncoder(const S& in, std::unique_ptr #endif -#ifdef WITH_OPENMP -#include -#endif - #include "GPUReconstruction.h" #include "GPUReconstructionAvailableBackends.h" diff --git a/GPU/GPUTracking/Base/GPUReconstructionThreading.h b/GPU/GPUTracking/Base/GPUReconstructionThreading.h new file mode 100644 index 0000000000000..374c7545e65da --- /dev/null +++ b/GPU/GPUTracking/Base/GPUReconstructionThreading.h @@ -0,0 +1,60 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUReconstructionThreading.h +/// \author David Rohr + +#if !defined(GPURECONSTRUCTIONTHREADING_H) +#define GPURECONSTRUCTIONTHREADING_H + +#if !defined(GPUCA_GPUCODE) +#include "GPUReconstruction.h" + +#include +#include + +namespace o2::gpu +{ + +struct GPUReconstructionThreading { + std::unique_ptr control; + std::unique_ptr allThreads; + std::unique_ptr activeThreads; + std::unique_ptr outerThreads; +}; + +} // namespace o2::gpu + +#endif + +#define GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + for (vartype varname = get_global_id(0); varname < iEnd; varname += get_global_size(0)) { \ + code \ + } + +#ifdef GPUCA_GPUCODE +#define GPUCA_TBB_KERNEL_LOOP GPUCA_TBB_KERNEL_LOOP_HOST +#else +#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, code) \ + if (!rec.GetProcessingSettings().inKernelParallel) { \ + rec.mThreading->activeThreads->execute([&] { \ + tbb::parallel_for(tbb::blocked_range(get_global_id(0), iEnd, get_global_size(0)), [&](const tbb::blocked_range& _r_internal) { \ + for (vartype varname = _r_internal.begin(); varname < _r_internal.end(); varname += get_global_size(0)) { \ + code \ + } \ + }); \ + }); \ + } else { \ + GPUCA_TBB_KERNEL_LOOP_HOST(rec, vartype, varname, iEnd, code) \ + } +#endif + +#endif diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index bab5ff912c575..5bc1e6e4e6783 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -136,12 +136,7 @@ set_target_cuda_arch(${targetName}) #target_link_options(${targetName} PRIVATE "LINKER:--version-script=${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld") #set_target_properties(${targetName} PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/version_script.ld) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp") -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_CUDA_COMPILE_MODE) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu index 26cbc282b6fc2..20ce23b578d84 100644 --- 
a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDA.cu @@ -265,7 +265,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() throw std::runtime_error("Invalid warp size on GPU"); } mBlockCount = deviceProp.multiProcessorCount; - mMaxThreads = std::max(mMaxThreads, deviceProp.maxThreadsPerBlock * mBlockCount); + mMaxBackendThreads = std::max(mMaxBackendThreads, deviceProp.maxThreadsPerBlock * mBlockCount); #ifndef __HIPCC__ // CUDA mWarpSize = 32; #else // HIP @@ -409,7 +409,7 @@ int32_t GPUReconstructionCUDA::InitDevice_Runtime() mDeviceId = master->mDeviceId; mBlockCount = master->mBlockCount; mWarpSize = master->mWarpSize; - mMaxThreads = master->mMaxThreads; + mMaxBackendThreads = master->mMaxBackendThreads; mDeviceName = master->mDeviceName; mDeviceConstantMem = master->mDeviceConstantMem; mDeviceConstantMemList.resize(master->mDeviceConstantMemList.size()); diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 62ad57ae3497a..3bd3afc0ffc23 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -13,9 +13,6 @@ /// \author David Rohr #define GPUCA_GPUCODE_HOSTONLY -#ifdef WITH_OPENMP -#include -#endif #include "GPUReconstructionCUDA.h" #include "GPUParamRTC.h" #include "GPUDefMacros.h" @@ -25,6 +22,7 @@ #include #include +#include using namespace o2::gpu; #include "utils/qGetLdBinarySymbols.h" @@ -153,10 +151,7 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) } HighResTimer rtcTimer; rtcTimer.ResetStart(); -#ifdef WITH_OPENMP -#pragma omp parallel for schedule(dynamic, 1) -#endif - for (uint32_t i = 0; i < nCompile; i++) { + tbb::parallel_for(0, nCompile, [&](auto i) { if (mProcessingSettings.debugLevel >= 3) { printf("Compiling %s\n", (filename + "_" + std::to_string(i) + mRtcSrcExtension).c_str()); } @@ -190,8 +185,8 @@ int32_t GPUReconstructionCUDA::genRTC(std::string& filename, uint32_t& nCompile) printf("Source code file: %s", filename.c_str()); } throw std::runtime_error("Error during CUDA compilation"); - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (mProcessingSettings.debugLevel >= 0) { GPUInfo("RTC Compilation finished (%f seconds)", rtcTimer.GetCurrentElapsedTime()); } diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index d34777f0bef3e..10fbfa8d21ddf 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -189,11 +189,7 @@ endif() target_link_libraries(${targetName} PRIVATE hip::host hip::device hip::hipcub roc::rocthrust) set_target_hip_arch(${targetName}) -if(OpenMP_CXX_FOUND) - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${MODULE}_CXX PRIVATE WITH_OPENMP) - target_link_libraries(${MODULE}_CXX PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${MODULE}_CXX PRIVATE TBB::tbb) # Special handling of GPU kernels in case of per-kernel compilation / RDC if(NOT DEFINED GPUCA_HIP_COMPILE_MODE) diff --git a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx index ed985e31ab1b0..30a8fc193774b 100644 --- a/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx +++ b/GPU/GPUTracking/Base/opencl/GPUReconstructionOCL.cxx @@ -267,7 +267,7 @@ int32_t 
GPUReconstructionOCLBackend::InitDevice_Runtime() mDeviceName += " (OpenCL)"; mBlockCount = shaders; mWarpSize = 32; - mMaxThreads = std::max(mMaxThreads, maxWorkGroup * mBlockCount); + mMaxBackendThreads = std::max(mMaxBackendThreads, maxWorkGroup * mBlockCount); mInternals->context = clCreateContext(nullptr, ContextForAllPlatforms() ? count : 1, ContextForAllPlatforms() ? mInternals->devices.get() : &mInternals->device, nullptr, nullptr, &ocl_error); if (GPUFailedMsgI(ocl_error)) { @@ -380,7 +380,7 @@ int32_t GPUReconstructionOCLBackend::InitDevice_Runtime() GPUReconstructionOCL* master = dynamic_cast(mMaster); mBlockCount = master->mBlockCount; mWarpSize = master->mWarpSize; - mMaxThreads = master->mMaxThreads; + mMaxBackendThreads = master->mMaxBackendThreads; mDeviceName = master->mDeviceName; mDeviceConstantMem = master->mDeviceConstantMem; mInternals = master->mInternals; diff --git a/GPU/GPUTracking/CMakeLists.txt b/GPU/GPUTracking/CMakeLists.txt index a5d335931af37..3e738fb6df5cb 100644 --- a/GPU/GPUTracking/CMakeLists.txt +++ b/GPU/GPUTracking/CMakeLists.txt @@ -105,6 +105,7 @@ set(HDRS_INSTALL Base/GPUParam.inc Base/GPUParamRTC.h Base/GPUReconstructionIncludes.h + Base/GPUReconstructionThreading.h Base/GPUReconstructionIncludesITS.h Base/GPUReconstructionKernelMacros.h Base/GPUReconstructionKernels.h @@ -378,12 +379,7 @@ if(GPUCA_QA) target_compile_definitions(${targetName} PRIVATE GPUCA_BUILD_QA) endif() -if(OpenMP_CXX_FOUND) - message(STATUS "GPU: Using OpenMP: ${OpenMP_CXX_SPEC_DATE}") - # Must be private, depending libraries might be compiled by compiler not understanding -fopenmp - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) target_compile_options(${targetName} PRIVATE -Wno-instantiation-after-specialization) diff --git a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx index e8f8de7658b28..794f4cb485f14 100644 --- a/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx +++ b/GPU/GPUTracking/DataCompression/GPUTPCClusterStatistics.cxx @@ -69,7 +69,7 @@ INode* BuildTree(const double* frequencies, uint32_t UniqueSymbols) { std::priority_queue, NodeCmp> trees; - for (uint32_t i = 0; i < UniqueSymbols; ++i) { + for (uint32_t i = 0; i < UniqueSymbols; i++) { if (frequencies[i] != 0) { trees.push(new LeafNode(frequencies[i], i)); } @@ -256,7 +256,7 @@ float GPUTPCClusterStatistics::Analyze(std::vector& p, const char* name GenerateCodes(root, HuffCode(), codes); delete root; - for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); ++it) { + for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); it++) { huffmanSize += it->second.size() * prob[it->first]; } diff --git a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx index 22641774cd9ee..e3b8965c3e27b 100644 --- a/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx +++ b/GPU/GPUTracking/DataCompression/TPCClusterDecompressor.cxx @@ -22,6 +22,8 @@ #include #include "TPCClusterDecompressionCore.inc" +#include + using namespace o2::gpu; using namespace o2::tpc; @@ -51,23 +53,24 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom for (uint32_t i = 0; i < NSLICES * GPUCA_ROW_COUNT; i++) { (&locks[0][0])[i].clear(); } - uint32_t offset = 0, lasti = 0; const uint32_t maxTime 
= param.continuousMaxTimeBin > 0 ? ((param.continuousMaxTimeBin + 1) * ClusterNative::scaleTimePacked - 1) : TPC_MAX_TIME_BIN_TRIGGERED; - GPUCA_OPENMP(parallel for firstprivate(offset, lasti)) - for (uint32_t i = 0; i < clustersCompressed->nTracks; i++) { - if (i < lasti) { - offset = lasti = 0; // dynamic OMP scheduling, need to reinitialize offset - } - while (lasti < i) { - offset += clustersCompressed->nTrackClusters[lasti++]; + tbb::parallel_for(tbb::blocked_range(0, clustersCompressed->nTracks), [&](const tbb::blocked_range& range) { + uint32_t offset = 0, lasti = 0; + for (uint32_t i = range.begin(); i < range.end(); i++) { + if (i < lasti) { + offset = lasti = 0; // dynamic scheduling order, need to reinitialize offset + } + while (lasti < i) { + offset += clustersCompressed->nTrackClusters[lasti++]; + } + lasti++; + TPCClusterDecompressionCore::decompressTrack(*clustersCompressed, param, maxTime, i, offset, clusters, locks); } - lasti++; - TPCClusterDecompressionCore::decompressTrack(*clustersCompressed, param, maxTime, i, offset, clusters, locks); - } + }); size_t nTotalClusters = clustersCompressed->nAttachedClusters + clustersCompressed->nUnattachedClusters; ClusterNative* clusterBuffer = allocator(nTotalClusters); uint32_t offsets[NSLICES][GPUCA_ROW_COUNT]; - offset = 0; + uint32_t offset = 0; uint32_t decodedAttachedClusters = 0; for (uint32_t i = 0; i < NSLICES; i++) { for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { @@ -82,8 +85,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom } clustersNative.clustersLinear = clusterBuffer; clustersNative.setOffsetPtrs(); - GPUCA_OPENMP(parallel for) - for (uint32_t i = 0; i < NSLICES; i++) { + tbb::parallel_for(0, NSLICES, [&](auto i) { for (uint32_t j = 0; j < GPUCA_ROW_COUNT; j++) { ClusterNative* buffer = &clusterBuffer[clustersNative.clusterOffset[i][j]]; if (clusters[i][j].size()) { @@ -108,7 +110,7 @@ int32_t TPCClusterDecompressor::decompress(const CompressedClusters* clustersCom if (deterministicRec) { std::sort(buffer, buffer + clustersNative.nClusters[i][j]); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on return 0; } diff --git a/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx b/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx index 0d7ca5c6209a4..9cb49bf4c7ef5 100644 --- a/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx +++ b/GPU/GPUTracking/DataCompression/standalone-cluster-dump-entropy-analysed.cxx @@ -166,7 +166,7 @@ INode* BuildTree(const double* frequencies, uint32_t UniqueSymbols) { std::priority_queue, NodeCmp> trees; - for (int32_t i = 0; i < UniqueSymbols; ++i) { + for (int32_t i = 0; i < UniqueSymbols; i++) { if (frequencies[i] != 0) { trees.push(new LeafNode(frequencies[i], i)); } @@ -621,7 +621,7 @@ int32_t main(int argc, char** argv) GenerateCodes(root, HuffCode(), codes); delete root; - for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); ++it) { + for (HuffCodeMap::const_iterator it = codes.begin(); it != codes.end(); it++) { huffmanSize += it->second.size() * probabilities[i][it->first]; } } diff --git a/GPU/GPUTracking/Definitions/GPUDefMacros.h b/GPU/GPUTracking/Definitions/GPUDefMacros.h index b47401c9f05aa..caf2d1670f84e 100644 --- a/GPU/GPUTracking/Definitions/GPUDefMacros.h +++ b/GPU/GPUTracking/Definitions/GPUDefMacros.h @@ -50,11 +50,5 @@ #define GPUCA_UNROLL(...) 
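A note on the partitioners used in this patch: tbb::parallel_for defaults to auto_partitioner, whereas the RTC compilation loop above passes tbb::simple_partitioner, which splits the range down to single iterations so that work stealing can balance very uneven items, and the memset helper passes tbb::static_partitioner, which hands each thread one contiguous slab up front and avoids stealing overhead for uniform work. A standalone illustration of the two choices (a sketch only, not code from the patch):

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <cmath>

void demo(float* data, int n)
{
  // Uneven per-item cost (like per-kernel RTC compilation): fine grains + work stealing.
  tbb::parallel_for(0, n, [&](int i) { data[i] = std::sqrt((float)i); }, tbb::simple_partitioner());

  // Uniform cost (like splitting a large memset): one contiguous chunk per thread.
  tbb::parallel_for(tbb::blocked_range<int>(0, n), [&](const tbb::blocked_range<int>& r) {
    for (int i = r.begin(); i < r.end(); i++) {
      data[i] += 1.0f;
    }
  }, tbb::static_partitioner());
}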
 #endif
 
-#if !defined(WITH_OPENMP) || defined(GPUCA_GPUCODE_DEVICE)
-#define GPUCA_OPENMP(...)
-#else
-#define GPUCA_OPENMP(...) _Pragma(GPUCA_M_STR(omp __VA_ARGS__))
-#endif
-
 #endif
 // clang-format on
diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h
index 10bb4797a1c15..905622de26ba9 100644
--- a/GPU/GPUTracking/Definitions/GPUSettingsList.h
+++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h
@@ -250,9 +250,10 @@ AddOption(conservativeMemoryEstimate, bool, false, "", 0, "Use some more conserv
 AddOption(tpcInputWithClusterRejection, uint8_t, 0, "", 0, "Indicate whether the TPC input is CTF data with cluster rejection, to tune buffer estimations")
 AddOption(forceMaxMemScalers, uint64_t, 0, "", 0, "Force using the maximum values for all buffers, Set a value n > 1 to rescale all maximums to a memory size of n")
 AddOption(registerStandaloneInputMemory, bool, false, "registerInputMemory", 0, "Automatically register input memory buffers for the GPU")
-AddOption(ompThreads, int32_t, -1, "omp", 't', "Number of OMP threads to run (-1: all)", min(-1), message("Using %s OMP threads"))
-AddOption(ompKernels, uint8_t, 2, "", 0, "Parallelize with OMP inside kernels instead of over slices, 2 for nested parallelization over TPC sectors and inside kernels")
-AddOption(ompAutoNThreads, bool, true, "", 0, "Auto-adjust number of OMP threads, decreasing the number for small input data")
+AddOption(nHostThreads, int32_t, -1, "nThreads", 't', "Number of host threads to run (-1: all)", min(-1), message("Using %s CPU threads"))
+AddOption(ompThreads, int32_t, -1, "", 0, "Deprecated synonym for nHostThreads")
+AddOption(inKernelParallel, uint8_t, 2, "", 0, "Parallelize with multi-threading inside kernels on the host instead of over TPC sectors, 2 for nested parallelization over TPC sectors and inside kernels")
+AddOption(autoAdjustHostThreads, bool, true, "", 0, "Auto-adjust number of host threads, decreasing the number for small input data")
 AddOption(nStreams, int8_t, 8, "", 0, "Number of GPU streams / command queues")
 AddOption(nTPCClustererLanes, int8_t, -1, "", 0, "Number of TPC clusterers that can run in parallel (-1 = autoset)")
 AddOption(overrideClusterizerFragmentLen, int32_t, -1, "", 0, "Force the cluster max fragment len to a certain value (-1 = autodetect)")
diff --git a/GPU/GPUTracking/Global/GPUChainTracking.cxx b/GPU/GPUTracking/Global/GPUChainTracking.cxx
index 889e12c258cb4..66f37e1122832 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTracking.cxx
@@ -673,8 +673,8 @@ int32_t GPUChainTracking::RunChain()
   if ((((GetRecoSteps() & RecoStep::TRDTracking) && !GetProcessingSettings().trdTrackModelO2 && !GetProcessingSettings().willProvideO2PropagatorLate) || ((GetRecoSteps() & RecoStep::Refit) && !param().rec.trackingRefitGPUModel)) && processors()->calibObjects.o2Propagator == nullptr) {
     GPUFatal("Cannot run TRD tracking or refit with o2 track model without o2 propagator"); // This check must happen during run, since o2::Propagator cannot be available during init
   }
-  if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) {
-    mRec->SetNOMPThreads(-1);
+  if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU()) {
+    mRec->SetNActiveThreads(-1);
   }
   const auto threadContext = GetThreadContext();
   if (GetProcessingSettings().runCompressionStatistics && mCompressionStatistics == nullptr) {
@@ -717,8 +717,8 @@ int32_t GPUChainTracking::RunChain()
     }
   }
 
-  if 
(GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU() && mIOPtrs.clustersNative) { - mRec->SetNOMPThreads(mIOPtrs.clustersNative->nClustersTotal / 5000); + if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU() && mIOPtrs.clustersNative) { + mRec->SetNActiveThreads(mIOPtrs.clustersNative->nClustersTotal / 5000); } if (mIOPtrs.clustersNative && runRecoStep(RecoStep::TPCConversion, &GPUChainTracking::ConvertNativeToClusterData)) { @@ -768,8 +768,8 @@ int32_t GPUChainTracking::RunChain() SynchronizeStream(OutputStream()); } - if (GetProcessingSettings().ompAutoNThreads && !mRec->IsGPU()) { - mRec->SetNOMPThreads(-1); + if (GetProcessingSettings().autoAdjustHostThreads && !mRec->IsGPU()) { + mRec->SetNActiveThreads(-1); } int32_t retVal = 0; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx index bec61d6b76f1e..6ca645808c5bd 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx @@ -35,6 +35,8 @@ #include "utils/strtag.h" +#include + #ifndef GPUCA_NO_VC #include #endif @@ -576,8 +578,8 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (RunTPCClusterizer_prepare(mPipelineNotifyCtx && GetProcessingSettings().doublePipelineClusterizer)) { return 1; } - if (GetProcessingSettings().ompAutoNThreads && !doGPU) { - mRec->SetNOMPThreads(mRec->MemoryScalers()->nTPCdigits / 20000); + if (GetProcessingSettings().autoAdjustHostThreads && !doGPU) { + mRec->SetNActiveThreads(mRec->MemoryScalers()->nTPCdigits / 20000); } mRec->MemoryScalers()->nTPCHits = mRec->MemoryScalers()->NTPCClusters(mRec->MemoryScalers()->nTPCdigits); @@ -674,229 +676,232 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput) if (GetProcessingSettings().debugLevel >= 3) { GPUInfo("Processing time bins [%d, %d) for sectors %d to %d", fragment.start, fragment.last(), iSliceBase, iSliceBase + GetProcessingSettings().nTPCClustererLanes - 1); } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - if (doGPU && fragment.index != 0) { - SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished - } - - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; - clusterer.mPmemory->fragment = fragment; - - if (mIOPtrs.tpcPackedDigits) { - bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; - bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; - auto* inDigits = mIOPtrs.tpcPackedDigits; - size_t numDigits = inDigits->nTPCDigits[iSlice]; - if (setDigitsOnGPU) { - GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSlice], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); - } - if (setDigitsOnHost) { - clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSlice]); // TODO: Needs fixing (invalid const cast) + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + if (doGPU && fragment.index != 0) { + SynchronizeStream(lane); // Don't overwrite charge map from previous iteration until cluster computation is finished } - clusterer.mPmemory->counters.nDigits = numDigits; - } - if (mIOPtrs.tpcZS) { - if (mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { - clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSlice].first; - clusterer.mPmemory->counters.nPagesSubslice = mCFContext->nextPos[iSlice].second; - } else { - clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubslice = 0; + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + clusterer.mPmemory->counters.nPeaks = clusterer.mPmemory->counters.nClusters = 0; + clusterer.mPmemory->fragment = fragment; + + if (mIOPtrs.tpcPackedDigits) { + bool setDigitsOnGPU = doGPU && not mIOPtrs.tpcZS; + bool setDigitsOnHost = (not doGPU && not mIOPtrs.tpcZS) || propagateMCLabels; + auto* inDigits = mIOPtrs.tpcPackedDigits; + size_t numDigits = inDigits->nTPCDigits[iSlice]; + if (setDigitsOnGPU) { + GPUMemCpy(RecoStep::TPCClusterFinding, clustererShadow.mPdigits, inDigits->tpcDigits[iSlice], sizeof(clustererShadow.mPdigits[0]) * numDigits, lane, true); + } + if (setDigitsOnHost) { + clusterer.mPdigits = const_cast(inDigits->tpcDigits[iSlice]); // TODO: Needs fixing (invalid const cast) + } + clusterer.mPmemory->counters.nDigits = numDigits; } - } - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - - using ChargeMapType = decltype(*clustererShadow.mPchargeMap); - using PeakMapType = decltype(*clustererShadow.mPpeakMap); - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! 
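The clusterizer hunks in this file all apply one mechanical translation: each OpenMP lane loop becomes a tbb::parallel_for executed inside a tbb::task_arena sized by SetAndGetNActiveThreadsOuterLoop(), and each former "continue" becomes a "return" from the lambda, which skips only the current iteration. A condensed standalone sketch of that shape, with illustrative names (laneEmpty/processLane are not functions from the patch):

#include <tbb/parallel_for.h>
#include <tbb/partitioner.h>
#include <tbb/task_arena.h>

static bool laneEmpty(int lane) { return lane % 2 == 0; } // stand-in for e.g. nPositions == 0
static void processLane(int lane) { (void)lane; }         // stand-in for the kernel calls

void runLanes(int nOuterThreads, int maxLane)
{
  tbb::task_arena(nOuterThreads).execute([&] {
    tbb::parallel_for(0, maxLane, [&](int lane) {
      if (laneEmpty(lane)) {
        return; // was: continue
      }
      processLane(lane);
    }, tbb::simple_partitioner());
  });
}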
- runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); - if (fragment.index == 0) { - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); - } - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (doGPU) { - if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); - SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + if (mIOPtrs.tpcZS) { + if (mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { + clusterer.mPmemory->counters.nPositions = mCFContext->nextPos[iSlice].first; + clusterer.mPmemory->counters.nPagesSubslice = mCFContext->nextPos[iSlice].second; + } else { + clusterer.mPmemory->counters.nPositions = clusterer.mPmemory->counters.nPagesSubslice = 0; + } } - SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory - } + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + + using ChargeMapType = decltype(*clustererShadow.mPchargeMap); + using PeakMapType = decltype(*clustererShadow.mPpeakMap); + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPchargeMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(ChargeMapType)); // TODO: Not working in OpenCL2!!! + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpeakMap, TPCMapMemoryLayout::items(GetProcessingSettings().overrideClusterizerFragmentLen) * sizeof(PeakMapType)); + if (fragment.index == 0) { + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding)}, clustererShadow.mPpadIsNoisy, TPC_PADS_IN_SECTOR * sizeof(*clustererShadow.mPpadIsNoisy)); + } + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Zeroed Charges"); - if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSlice] || mCFContext->zsVersion == -1)) { - clusterer.mPmemory->counters.nPositions = 0; - continue; - } - if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice] == 0) { - clusterer.mPmemory->counters.nPositions = 0; - continue; - } + if (doGPU) { + if (mIOPtrs.tpcZS && mCFContext->nPagesSector[iSlice] && mCFContext->zsVersion != -1) { + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, mInputsHost->mResourceZS, lane); + SynchronizeStream(GetProcessingSettings().nTPCClustererLanes + lane); + } + SynchronizeStream(mRec->NStreams() - 1); // Wait for copying to constant memory + } - if (propagateMCLabels && fragment.index == 0) { - clusterer.PrepareMC(); - clusterer.mPinputLabels = digitsMC->v[iSlice]; - if (clusterer.mPinputLabels == nullptr) { - GPUFatal("MC label container missing, sector %d", iSlice); + if (mIOPtrs.tpcZS && (mCFContext->abandonTimeframe || !mCFContext->nPagesSector[iSlice] || mCFContext->zsVersion == -1)) { + clusterer.mPmemory->counters.nPositions = 0; + return; } - if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice]) { - GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", 
(int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + if (!mIOPtrs.tpcZS && mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice] == 0) { + clusterer.mPmemory->counters.nPositions = 0; + return; } - } - if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSlice) { - if (not mIOPtrs.tpcZS) { - runKernel({GetGrid(1, lane), {iSlice}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } else if (propagateMCLabels) { - runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, mIOPtrs.tpcZS == nullptr); - TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + if (propagateMCLabels && fragment.index == 0) { + clusterer.PrepareMC(); + clusterer.mPinputLabels = digitsMC->v[iSlice]; + if (clusterer.mPinputLabels == nullptr) { + GPUFatal("MC label container missing, sector %d", iSlice); + } + if (clusterer.mPinputLabels->getIndexedSize() != mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice]) { + GPUFatal("MC label container has incorrect number of entries: %d expected, has %d\n", (int32_t)mIOPtrs.tpcPackedDigits->nTPCDigits[iSlice], (int32_t)clusterer.mPinputLabels->getIndexedSize()); + } } - } - if (mIOPtrs.tpcZS) { - int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : (mIOPtrs.tpcZS->slice[iSlice].count[0] && mIOPtrs.tpcZS->slice[iSlice].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->slice[iSlice].zsPtr[0][0]) - : 0; - uint32_t nBlocks = doGPU ? clusterer.mPmemory->counters.nPagesSubslice : GPUTrackingInOutZS::NENDPOINTS; + if (GetProcessingSettings().tpcSingleSector == -1 || GetProcessingSettings().tpcSingleSector == (int32_t)iSlice) { + if (not mIOPtrs.tpcZS) { + runKernel({GetGrid(1, lane), {iSlice}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } else if (propagateMCLabels) { + runKernel({GetGrid(1, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, mIOPtrs.tpcZS == nullptr); + TransferMemoryResourceLinkToGPU(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } + } - (void)tpcTimeBinCut; // TODO: To be used in decoding kernels - switch (mCFContext->zsVersion) { - default: - GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); - break; - case ZSVersionRowBased10BitADC: - case ZSVersionRowBased12BitADC: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; - case ZSVersionLinkBasedWithMeta: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; - case ZSVersionDenseLinkBased: - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); - break; + if (mIOPtrs.tpcZS) { + int32_t firstHBF = (mIOPtrs.settingsTF && mIOPtrs.settingsTF->hasTfStartOrbit) ? mIOPtrs.settingsTF->tfStartOrbit : ((mIOPtrs.tpcZS->slice[iSlice].count[0] && mIOPtrs.tpcZS->slice[iSlice].nZSPtr[0][0]) ? o2::raw::RDHUtils::getHeartBeatOrbit(*(const o2::header::RAWDataHeader*)mIOPtrs.tpcZS->slice[iSlice].zsPtr[0][0]) : 0); + uint32_t nBlocks = doGPU ? 
clusterer.mPmemory->counters.nPagesSubslice : GPUTrackingInOutZS::NENDPOINTS; + + (void)tpcTimeBinCut; // TODO: To be used in decoding kernels + switch (mCFContext->zsVersion) { + default: + GPUFatal("Data with invalid TPC ZS mode (%d) received", mCFContext->zsVersion); + break; + case ZSVersionRowBased10BitADC: + case ZSVersionRowBased12BitADC: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + case ZSVersionLinkBasedWithMeta: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + case ZSVersionDenseLinkBased: + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}, firstHBF); + break; + } + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + if (doGPU) { + SynchronizeStream(lane); } - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - } - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - if (doGPU) { - SynchronizeStream(lane); - } - if (mIOPtrs.tpcZS) { - CfFragment f = fragment.next(); - int32_t nextSlice = iSlice; - if (f.isEnd()) { - nextSlice += GetProcessingSettings().nTPCClustererLanes; - f = mCFContext->fragmentFirst; + if (mIOPtrs.tpcZS) { + CfFragment f = fragment.next(); + int32_t nextSlice = iSlice; + if (f.isEnd()) { + nextSlice += GetProcessingSettings().nTPCClustererLanes; + f = mCFContext->fragmentFirst; + } + if (nextSlice < NSLICES && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSlice] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { + mCFContext->nextPos[nextSlice] = RunTPCClusterizer_transferZS(nextSlice, f, GetProcessingSettings().nTPCClustererLanes + lane); + } } - if (nextSlice < NSLICES && mIOPtrs.tpcZS && mCFContext->nPagesSector[nextSlice] && mCFContext->zsVersion != -1 && !mCFContext->abandonTimeframe) { - mCFContext->nextPos[nextSlice] = RunTPCClusterizer_transferZS(nextSlice, f, GetProcessingSettings().nTPCClustererLanes + lane); + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (clusterer.mPmemory->counters.nPositions == 0) { + return; + } + if (!mIOPtrs.tpcZS) { + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + } + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { + clusterer.DumpChargeMap(*mDebugFile, "Charges"); } - } - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (clusterer.mPmemory->counters.nPositions == 0) { - continue; - } - if (!mIOPtrs.tpcZS) { - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - } - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 1, clusterer, &GPUTPCClusterFinder::DumpDigits, *mDebugFile)) { - clusterer.DumpChargeMap(*mDebugFile, "Charges"); - } - if (propagateMCLabels) { - runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}); - } + if (propagateMCLabels) { + runKernel({GetGrid(clusterer.mPmemory->counters.nDigitsInFragment, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}); + } - bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); - checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); - checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; + bool checkForNoisyPads = (rec()->GetParam().rec.tpc.maxTimeBinAboveThresholdIn1000Bin > 0) || (rec()->GetParam().rec.tpc.maxConsecTimeBinAboveThreshold > 0); + checkForNoisyPads &= (rec()->GetParam().rec.tpc.noisyPadsQuickCheck ? fragment.index == 0 : true); + checkForNoisyPads &= !GetProcessingSettings().disableTPCNoisyPadFilter; - if (checkForNoisyPads) { - int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline; + if (checkForNoisyPads) { + int32_t nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline; - runKernel({GetGridBlk(nBlocks, lane), {iSlice}}); - } + runKernel({GetGridBlk(nBlocks, lane), {iSlice}}); + } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { - clusterer.DumpPeakMap(*mDebugFile, "Peaks"); - } + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaks, *mDebugFile)) { + clusterer.DumpPeakMap(*mDebugFile, "Peaks"); + } - RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (doGPU) { - SynchronizeStream(lane); - } - if (clusterer.mPmemory->counters.nPeaks == 0) { - continue; - } - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); - if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) { - clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks"); - } + RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 0, doGPU, lane); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 2, clusterer, &GPUTPCClusterFinder::DumpPeaksCompacted, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (doGPU) { + SynchronizeStream(lane); + } + if (clusterer.mPmemory->counters.nPeaks == 0) { + return; + } + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + runKernel({GetGrid(clusterer.mPmemory->counters.nPeaks, lane), {iSlice}}); + if (DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaks, *mDebugFile)) { + clusterer.DumpPeakMap(*mDebugFile, "Suppressed Peaks"); + } - RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane); - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); - } - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, GetProcessingSettings().nTPCClustererLanes))) - for (int32_t lane = 0; lane < maxLane; lane++) { - uint32_t iSlice = iSliceBase + lane; - GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; - GPUTPCClusterFinder& clustererShadow = doGPU ? processorsShadow()->tpcClusterer[iSlice] : clusterer; - if (doGPU) { - SynchronizeStream(lane); - } + RunTPCClusterizer_compactPeaks(clusterer, clustererShadow, 1, doGPU, lane); + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mMemoryId, lane); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 3, clusterer, &GPUTPCClusterFinder::DumpSuppressedPeaksCompacted, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, maxLane)).execute([&] { + tbb::parallel_for(0, maxLane, [&](auto lane) { + uint32_t iSlice = iSliceBase + lane; + GPUTPCClusterFinder& clusterer = processors()->tpcClusterer[iSlice]; + GPUTPCClusterFinder& clustererShadow = doGPU ? 
processorsShadow()->tpcClusterer[iSlice] : clusterer; + if (doGPU) { + SynchronizeStream(lane); + } - if (fragment.index == 0) { - deviceEvent* waitEvent = nullptr; - if (transferRunning[lane] == 1) { - waitEvent = &mEvents->stream[lane]; - transferRunning[lane] = 2; + if (fragment.index == 0) { + deviceEvent* waitEvent = nullptr; + if (transferRunning[lane] == 1) { + waitEvent = &mEvents->stream[lane]; + transferRunning[lane] = 2; + } + runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); } - runKernel({GetGridAutoStep(lane, RecoStep::TPCClusterFinding), krnlRunRangeNone, {nullptr, waitEvent}}, clustererShadow.mPclusterInRow, GPUCA_ROW_COUNT * sizeof(*clustererShadow.mPclusterInRow)); - } - if (clusterer.mPmemory->counters.nClusters == 0) { - continue; - } + if (clusterer.mPmemory->counters.nClusters == 0) { + return; + } - runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); - DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); + runKernel({GetGrid(clusterer.mPmemory->counters.nPositions, lane), {iSlice}}); + DoDebugAndDump(RecoStep::TPCClusterFinding, 262144 << 4, clusterer, &GPUTPCClusterFinder::DumpChargeMap, *mDebugFile, "Split Charges"); - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); - if (doGPU && propagateMCLabels) { - TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); - if (doGPU) { - SynchronizeStream(lane); + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane), {iSlice}}, 0); + if (doGPU && propagateMCLabels) { + TransferMemoryResourceLinkToHost(RecoStep::TPCClusterFinding, clusterer.mScratchId, lane); + if (doGPU) { + SynchronizeStream(lane); + } + runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); + } + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); } - runKernel({GetGrid(clusterer.mPmemory->counters.nClusters, lane, GPUReconstruction::krnlDeviceType::CPU), {iSlice}}, 1); - } - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Sector %02d Fragment %02d Lane %d: Found clusters: digits %u peaks %u clusters %u", iSlice, fragment.index, lane, (int32_t)clusterer.mPmemory->counters.nPositions, (int32_t)clusterer.mPmemory->counters.nPeaks, (int32_t)clusterer.mPmemory->counters.nClusters); - } - TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane); - laneHasData[lane] = true; - // Include clusters in default debug mask, exclude other debug output by default - DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); - } - mRec->SetNestedLoopOmpFactor(1); + TransferMemoryResourcesToHost(RecoStep::TPCClusterFinding, &clusterer, lane); + laneHasData[lane] = true; + // Include clusters in default debug mask, exclude other debug output by default + DoDebugAndDump(RecoStep::TPCClusterFinding, 131072, clusterer, &GPUTPCClusterFinder::DumpClusters, *mDebugFile); // clang-format off + }, tbb::simple_partitioner()); // clang-format on + 
}); + mRec->SetNActiveThreadsOuterLoop(1); } size_t nClsFirst = nClsTotal; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index f3f3627573339..4ea7094416d5e 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -268,7 +268,7 @@ int32_t GPUChainTracking::RunTPCDecompression() int32_t nStreams = doGPU ? mRec->NStreams() - 1 : 1; if (cmprClsHost.nAttachedClusters != 0) { std::exclusive_scan(cmprClsHost.nTrackClusters, cmprClsHost.nTrackClusters + cmprClsHost.nTracks, Decompressor.mAttachedClustersOffsets, 0u); // computing clusters offsets for first kernel - for (int32_t iStream = 0; iStream < nStreams; ++iStream) { + for (int32_t iStream = 0; iStream < nStreams; iStream++) { uint32_t startTrack = cmprClsHost.nTracks / nStreams * iStream; uint32_t endTrack = cmprClsHost.nTracks / nStreams * (iStream + 1) + (iStream < nStreams - 1 ? 0 : cmprClsHost.nTracks % nStreams); // index of last track (excluded from computation) uint32_t numTracks = endTrack - startTrack; diff --git a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx index b68f0797f425f..cab025b03e8b6 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingSliceTracker.cxx @@ -22,6 +22,8 @@ #include "utils/strtag.h" #include +#include + using namespace o2::gpu; int32_t GPUChainTracking::ExtrapolationTracking(uint32_t iSlice, int32_t threadId, bool synchronizeOutput) @@ -154,110 +156,110 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() int32_t streamMap[NSLICES]; bool error = false; - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES))) - for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - GPUTPCTracker& trk = processors()->tpcTrackers[iSlice]; - GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSlice] : trk; - int32_t useStream = (iSlice % mRec->NStreams()); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, NSLICES)).execute([&] { + tbb::parallel_for(0, NSLICES, [&](auto iSlice) { + GPUTPCTracker& trk = processors()->tpcTrackers[iSlice]; + GPUTPCTracker& trkShadow = doGPU ? processorsShadow()->tpcTrackers[iSlice] : trk; + int32_t useStream = (iSlice % mRec->NStreams()); - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Creating Slice Data (Slice %d)", iSlice); - } - if (doGPU) { - TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? nullptr : &mEvents->init}}); - streamInit[useStream] = true; - } else { - if (ReadEvent(iSlice, 0)) { - GPUError("Error reading event"); - error = 1; - continue; + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Creating Slice Data (Slice %d)", iSlice); + } + if (doGPU) { + TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); + streamInit[useStream] = true; + } else { + if (ReadEvent(iSlice, 0)) { + GPUError("Error reading event"); + error = 1; + return; + } + } + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); + } + if (!doGPU && trk.CheckEmptySlice() && GetProcessingSettings().debugLevel == 0) { + return; } - } - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}}); - } - if (!doGPU && trk.CheckEmptySlice() && GetProcessingSettings().debugLevel == 0) { - continue; - } - if (GetProcessingSettings().debugLevel >= 6) { - *mDebugFile << "\n\nReconstruction: Slice " << iSlice << "/" << NSLICES << std::endl; - if (GetProcessingSettings().debugMask & 1) { - if (doGPU) { - TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + if (GetProcessingSettings().debugLevel >= 6) { + *mDebugFile << "\n\nReconstruction: Slice " << iSlice << "/" << NSLICES << std::endl; + if (GetProcessingSettings().debugMask & 1) { + if (doGPU) { + TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + } + trk.DumpSliceData(*mDebugFile); } - trk.DumpSliceData(*mDebugFile); } - } - // Initialize temporary memory where needed - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Copying Slice Data to GPU and initializing temporary memory"); - } - runKernel(GetGridAutoStep(useStream, RecoStep::TPCSliceTracking), trkShadow.Data().HitWeights(), trkShadow.Data().NumberOfHitsPlusAlign() * sizeof(*trkShadow.Data().HitWeights())); + // Initialize temporary memory where needed + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Copying Slice Data to GPU and initializing temporary memory"); + } + runKernel(GetGridAutoStep(useStream, RecoStep::TPCSliceTracking), trkShadow.Data().HitWeights(), trkShadow.Data().NumberOfHitsPlusAlign() * sizeof(*trkShadow.Data().HitWeights())); - if (!doGPU) { - TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); // Copy Data to GPU Global Memory - } - if (GPUDebug("Initialization (3)", useStream)) { - throw std::runtime_error("memcpy failure"); - } + if (!doGPU) { + TransferMemoryResourcesToGPU(RecoStep::TPCSliceTracking, &trk, useStream); // Copy Data to GPU Global Memory + } + if (GPUDebug("Initialization (3)", useStream)) { + throw std::runtime_error("memcpy failure"); + } - runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? nullptr : &mEvents->init}}); - streamInit[useStream] = true; + runKernel({GetGridBlk(GPUCA_ROW_COUNT, useStream), {iSlice}, {nullptr, streamInit[useStream] ? 
nullptr : &mEvents->init}}); + streamInit[useStream] = true; - if (GetProcessingSettings().keepDisplayMemory) { - TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); - memcpy(trk.LinkTmpMemory(), mRec->Res(trk.MemoryResLinks()).Ptr(), mRec->Res(trk.MemoryResLinks()).Size()); - if (GetProcessingSettings().debugMask & 2) { - trk.DumpLinks(*mDebugFile, 0); + if (GetProcessingSettings().keepDisplayMemory) { + TransferMemoryResourcesToHost(RecoStep::TPCSliceTracking, &trk, -1, true); + memcpy(trk.LinkTmpMemory(), mRec->Res(trk.MemoryResLinks()).Ptr(), mRec->Res(trk.MemoryResLinks()).Size()); + if (GetProcessingSettings().debugMask & 2) { + trk.DumpLinks(*mDebugFile, 0); + } } - } - runKernel({GetGridBlk(GPUCA_ROW_COUNT - 2, useStream), {iSlice}}); - DoDebugAndDump(RecoStep::TPCSliceTracking, 4, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1); + runKernel({GetGridBlk(GPUCA_ROW_COUNT - 2, useStream), {iSlice}}); + DoDebugAndDump(RecoStep::TPCSliceTracking, 4, trk, &GPUTPCTracker::DumpLinks, *mDebugFile, 1); - runKernel({GetGridBlk(GPUCA_ROW_COUNT - 6, useStream), {iSlice}}); + runKernel({GetGridBlk(GPUCA_ROW_COUNT - 6, useStream), {iSlice}}); #ifdef GPUCA_SORT_STARTHITS_GPU - if (doGPU) { - runKernel({GetGridAuto(useStream), {iSlice}}); - } + if (doGPU) { + runKernel({GetGridAuto(useStream), {iSlice}}); + } #endif - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGrid(1, 1, useStream), {iSlice}}); - } - DoDebugAndDump(RecoStep::TPCSliceTracking, 32, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile); - - if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { - trk.UpdateMaxData(); - AllocateRegisteredMemory(trk.MemoryResTracklets()); - AllocateRegisteredMemory(trk.MemoryResOutput()); - } - - if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletConstructorInPipeline) { - runKernel({GetGridAuto(useStream), {iSlice}}); - DoDebugAndDump(RecoStep::TPCSliceTracking, 128, trk, &GPUTPCTracker::DumpTrackletHits, *mDebugFile); - if (GetProcessingSettings().debugMask & 256 && GetProcessingSettings().deterministicGPUReconstruction < 2) { - trk.DumpHitWeights(*mDebugFile); + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGrid(1, 1, useStream), {iSlice}}); } - } + DoDebugAndDump(RecoStep::TPCSliceTracking, 32, trk, &GPUTPCTracker::DumpStartHits, *mDebugFile); - if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletSelectorInPipeline) { - runKernel({GetGridAuto(useStream), {iSlice}}); - runKernel({{1, -ThreadCount(), useStream}, {iSlice}}, 1); - if (GetProcessingSettings().deterministicGPUReconstruction) { - runKernel({GetGrid(1, 1, useStream), {iSlice}}); + if (GetProcessingSettings().memoryAllocationStrategy == GPUMemoryResource::ALLOCATION_INDIVIDUAL) { + trk.UpdateMaxData(); + AllocateRegisteredMemory(trk.MemoryResTracklets()); + AllocateRegisteredMemory(trk.MemoryResOutput()); } - TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, trk.MemoryResCommon(), useStream, &mEvents->slice[iSlice]); - streamMap[iSlice] = useStream; - if (GetProcessingSettings().debugLevel >= 3) { - GPUInfo("Slice %u, Number of tracks: %d", iSlice, *trk.NTracks()); + + if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletConstructorInPipeline) { + runKernel({GetGridAuto(useStream), {iSlice}}); + DoDebugAndDump(RecoStep::TPCSliceTracking, 128, trk, &GPUTPCTracker::DumpTrackletHits, 
*mDebugFile); + if (GetProcessingSettings().debugMask & 256 && GetProcessingSettings().deterministicGPUReconstruction < 2) { + trk.DumpHitWeights(*mDebugFile); + } } - DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile); - } - } - mRec->SetNestedLoopOmpFactor(1); + + if (!(doGPU || GetProcessingSettings().debugLevel >= 1) || GetProcessingSettings().trackletSelectorInPipeline) { + runKernel({GetGridAuto(useStream), {iSlice}}); + runKernel({{1, -ThreadCount(), useStream}, {iSlice}}, 1); + if (GetProcessingSettings().deterministicGPUReconstruction) { + runKernel({GetGrid(1, 1, useStream), {iSlice}}); + } + TransferMemoryResourceLinkToHost(RecoStep::TPCSliceTracking, trk.MemoryResCommon(), useStream, &mEvents->slice[iSlice]); + streamMap[iSlice] = useStream; + if (GetProcessingSettings().debugLevel >= 3) { + GPUInfo("Slice %u, Number of tracks: %d", iSlice, *trk.NTracks()); + } + DoDebugAndDump(RecoStep::TPCSliceTracking, 512, trk, &GPUTPCTracker::DumpTrackHits, *mDebugFile); + } }, tbb::simple_partitioner()); + }); + mRec->SetNActiveThreadsOuterLoop(1); if (error) { return (3); } @@ -419,16 +421,16 @@ int32_t GPUChainTracking::RunTPCTrackingSlices_internal() } } else { mSliceSelectorReady = NSLICES; - GPUCA_OPENMP(parallel for if(!doGPU && GetProcessingSettings().ompKernels != 1) num_threads(mRec->SetAndGetNestedLoopOmpFactor(!doGPU, NSLICES))) - for (uint32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - if (param().rec.tpc.extrapolationTracking) { - ExtrapolationTracking(iSlice, 0); - } - if (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) { - WriteOutput(iSlice, 0); - } - } - mRec->SetNestedLoopOmpFactor(1); + tbb::task_arena(mRec->SetAndGetNActiveThreadsOuterLoop(!doGPU, NSLICES)).execute([&] { + tbb::parallel_for(0, NSLICES, [&](auto iSlice) { + if (param().rec.tpc.extrapolationTracking) { + ExtrapolationTracking(iSlice, 0); + } + if (GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCSectorTracks) { + WriteOutput(iSlice, 0); + } }, tbb::simple_partitioner()); + }); + mRec->SetNActiveThreadsOuterLoop(1); } if (param().rec.tpc.extrapolationTracking && GetProcessingSettings().debugLevel >= 3) { diff --git a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx index b6f11375328d0..4f654c0fa7beb 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMMergerGPU.cxx @@ -14,9 +14,7 @@ #include "GPUTPCGMMergerGPU.h" #include "GPUCommonAlgorithm.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -24,20 +22,18 @@ template <> GPUdii() void GPUTPCGMMergerTrackFit::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger, int32_t mode) { const int32_t iEnd = mode == -1 ? merger.Memory()->nRetryRefit : merger.NOutputTracks(); - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (int32_t ii = get_global_id(0); ii < iEnd; ii += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), int32_t, ii, iEnd, { const int32_t i = mode == -1 ? merger.RetryRefitIds()[ii] : mode ? 
merger.TrackOrderProcess()[ii] : ii; GPUTPCGMTrackParam::RefitTrack(merger.OutputTracks()[i], i, &merger, mode == -1); - } + }); } template <> GPUdii() void GPUTPCGMMergerFollowLoopers::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUsharedref() GPUSharedMemory& smem, processorType& GPUrestrict() merger) { - GPUCA_OPENMP(parallel for if(!merger.GetRec().GetProcessingSettings().ompKernels) num_threads(merger.GetRec().GetProcessingSettings().ompThreads)) - for (uint32_t i = get_global_id(0); i < merger.Memory()->nLoopData; i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(merger.GetRec(), uint32_t, i, merger.Memory()->nLoopData, { GPUTPCGMTrackParam::RefitLoop(&merger, i); - } + }); } template <> diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCDef.h b/GPU/GPUTracking/SliceTracker/GPUTPCDef.h index 4b4f130faed65..3b53c3e66875a 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCDef.h +++ b/GPU/GPUTracking/SliceTracker/GPUTPCDef.h @@ -47,4 +47,4 @@ struct cahit2 { cahit x, y; }; #endif #endif //GPUDTPCEF_H -// clang format on +// clang-format on diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx index 8a727dc2da930..3cc3e3805dce8 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx @@ -35,10 +35,10 @@ using namespace o2::gpu; void GPUTPCSliceData::InitializeRows(const GPUParam& p) { // initialisation of rows - for (int32_t i = 0; i < GPUCA_ROW_COUNT + 1; ++i) { + for (int32_t i = 0; i < GPUCA_ROW_COUNT + 1; i++) { new (&mRows[i]) GPUTPCRow; } - for (int32_t i = 0; i < GPUCA_ROW_COUNT; ++i) { + for (int32_t i = 0; i < GPUCA_ROW_COUNT; i++) { mRows[i].mX = p.tpcGeometry.Row2X(i); mRows[i].mMaxY = CAMath::Tan(p.par.dAlpha / 2.f) * mRows[i].mX; } diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx index df0c7813fa0db..cece49073f11b 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCTracker.cxx @@ -98,7 +98,7 @@ void* GPUTPCTracker::SetPointersCommon(void* mem) void GPUTPCTracker::RegisterMemoryAllocation() { AllocateAndInitializeLate(); - bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().ompKernels == 1 || mRec->GetProcessingSettings().ompThreads == 1); + bool reuseCondition = !mRec->GetProcessingSettings().keepDisplayMemory && mRec->GetProcessingSettings().trackletSelectorInPipeline && ((mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || mRec->GetProcessingSettings().inKernelParallel == 1 || mRec->GetProcessingSettings().nHostThreads == 1); GPUMemoryReuse reLinks{reuseCondition, GPUMemoryReuse::REUSE_1TO1, GPUMemoryReuse::TrackerDataLinks, (uint16_t)(mISlice % mRec->GetProcessingSettings().nStreams)}; mMemoryResLinks = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataLinks, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK, "TPCSliceLinks", reLinks); mMemoryResSliceScratch = mRec->RegisterMemoryAllocation(this, &GPUTPCTracker::SetPointersDataScratch, GPUMemoryResource::MEMORY_SCRATCH | GPUMemoryResource::MEMORY_STACK | GPUMemoryResource::MEMORY_CUSTOM, "TPCSliceScratch"); diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 
e6017788144e0..53ed77fe62d8c 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -165,7 +165,7 @@ int32_t ReadConfiguration(int argc, char** argv) #endif #ifndef GPUCA_TPC_GEOMETRY_O2 #error Why was configStandalone.rec.tpc.mergerReadFromTrackerDirectly = 0 needed? - configStandalone.proc.ompKernels = false; + configStandalone.proc.inKernelParallel = false; configStandalone.proc.createO2Output = 0; if (configStandalone.rundEdx == -1) { configStandalone.rundEdx = 0; @@ -216,10 +216,10 @@ int32_t ReadConfiguration(int argc, char** argv) configStandalone.noprompt = 1; } if (configStandalone.proc.debugLevel >= 4) { - if (configStandalone.proc.ompKernels) { - configStandalone.proc.ompKernels = 1; + if (configStandalone.proc.inKernelParallel) { + configStandalone.proc.inKernelParallel = 1; } else { - configStandalone.proc.ompThreads = 1; + configStandalone.proc.nHostThreads = 1; } } if (configStandalone.setO2Settings) { diff --git a/GPU/GPUTracking/Standalone/CMakeLists.txt b/GPU/GPUTracking/Standalone/CMakeLists.txt index 1f11f0bacffac..32cdb246cf417 100644 --- a/GPU/GPUTracking/Standalone/CMakeLists.txt +++ b/GPU/GPUTracking/Standalone/CMakeLists.txt @@ -70,13 +70,8 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error -Wall -Wextra -Wshadow -Wno-unused-function -Wno-unused-parameter -Wno-unused-local-typedefs -Wno-unknown-pragmas -Wno-write-strings") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -rdynamic -Wl,--no-undefined") -# Check and set settings for optional packages -if(CONFIG_OPENMP) - find_package(OpenMP REQUIRED) -else() - set(OpenMP_FOUND OFF) - set(OpenMP_CXX_FOUND OFF) -endif() +# Find mandatory packages +find_package(TBB REQUIRED) if(GPUCA_CONFIG_VC) find_package(Vc REQUIRED) @@ -252,14 +247,6 @@ if(GPUCA_CONFIG_ROOT) endif() target_link_libraries(standalone_support PUBLIC Microsoft.GSL::GSL TPCFastTransformation) -if(OpenMP_CXX_FOUND) - target_link_libraries(ca PUBLIC OpenMP::OpenMP_CXX) - if (CMAKE_CXX_COMPILER STREQUAL "clang++") - target_link_libraries(ca PUBLIC -fopenmp) - target_link_libraries(GPUTracking PUBLIC -fopenmp) - endif() -endif() - # Installation install(TARGETS ca TPCFastTransformation standalone_support) install(FILES "cmake/makefile" DESTINATION "${CMAKE_INSTALL_PREFIX}") diff --git a/GPU/GPUTracking/Standalone/cmake/config.cmake b/GPU/GPUTracking/Standalone/cmake/config.cmake index 97091d833efd8..87716d700abc8 100644 --- a/GPU/GPUTracking/Standalone/cmake/config.cmake +++ b/GPU/GPUTracking/Standalone/cmake/config.cmake @@ -15,7 +15,6 @@ set(ENABLE_CUDA AUTO) set(ENABLE_HIP AUTO) set(ENABLE_OPENCL AUTO) -set(CONFIG_OPENMP 1) set(GPUCA_CONFIG_VC 1) set(GPUCA_CONFIG_FMT 1) set(GPUCA_CONFIG_ROOT 1) diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx index c44b4c09a3d7a..4e8fcd13e0801 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.cxx @@ -30,9 +30,6 @@ class GPUTPCGMPolynomialField; #ifndef GPUCA_GPUCODE #include "GPUMemoryResource.h" #include "GPUReconstruction.h" -#ifdef WITH_OPENMP -#include -#endif // WITH_OPENMP #include #include @@ -61,10 +58,10 @@ void* GPUTRDTracker_t::SetPointersBase(void* base) //-------------------------------------------------------------------- // Allocate memory for fixed size objects (needs to be done only once) //-------------------------------------------------------------------- - mMaxThreads = mRec->GetMaxThreads(); + 
mMaxBackendThreads = mRec->GetMaxBackendThreads(); computePointerWithAlignment(base, mR, kNChambers); - computePointerWithAlignment(base, mHypothesis, mNCandidates * mMaxThreads); - computePointerWithAlignment(base, mCandidates, mNCandidates * 2 * mMaxThreads); + computePointerWithAlignment(base, mHypothesis, mNCandidates * mMaxBackendThreads); + computePointerWithAlignment(base, mCandidates, mNCandidates * 2 * mMaxBackendThreads); return base; } @@ -94,7 +91,7 @@ void* GPUTRDTracker_t::SetPointersTracks(void* base) } template -GPUTRDTracker_t::GPUTRDTracker_t() : mR(nullptr), mIsInitialized(false), mGenerateSpacePoints(false), mProcessPerTimeFrame(false), mNAngleHistogramBins(25), mAngleHistogramRange(50), mMemoryPermanent(-1), mMemoryTracklets(-1), mMemoryTracks(-1), mNMaxCollisions(0), mNMaxTracks(0), mNMaxSpacePoints(0), mTracks(nullptr), mTrackAttribs(nullptr), mNCandidates(1), mNTracks(0), mNEvents(0), mMaxThreads(100), mTrackletIndexArray(nullptr), mHypothesis(nullptr), mCandidates(nullptr), mSpacePoints(nullptr), mGeo(nullptr), mRPhiA2(0), mRPhiB(0), mRPhiC2(0), mDyA2(0), mDyB(0), mDyC2(0), mAngleToDyA(0), mAngleToDyB(0), mAngleToDyC(0), mDebugOutput(false), mMaxEta(0.84f), mRoadZ(18.f), mZCorrCoefNRC(1.4f), mTPCVdrift(2.58f), mTPCTDriftOffset(0.f), mDebug(new GPUTRDTrackerDebug()) +GPUTRDTracker_t::GPUTRDTracker_t() : mR(nullptr), mIsInitialized(false), mGenerateSpacePoints(false), mProcessPerTimeFrame(false), mNAngleHistogramBins(25), mAngleHistogramRange(50), mMemoryPermanent(-1), mMemoryTracklets(-1), mMemoryTracks(-1), mNMaxCollisions(0), mNMaxTracks(0), mNMaxSpacePoints(0), mTracks(nullptr), mTrackAttribs(nullptr), mNCandidates(1), mNTracks(0), mNEvents(0), mMaxBackendThreads(100), mTrackletIndexArray(nullptr), mHypothesis(nullptr), mCandidates(nullptr), mSpacePoints(nullptr), mGeo(nullptr), mRPhiA2(0), mRPhiB(0), mRPhiC2(0), mDyA2(0), mDyB(0), mDyC2(0), mAngleToDyA(0), mAngleToDyB(0), mAngleToDyC(0), mDebugOutput(false), mMaxEta(0.84f), mRoadZ(18.f), mZCorrCoefNRC(1.4f), mTPCVdrift(2.58f), mTPCTDriftOffset(0.f), mDebug(new GPUTRDTrackerDebug()) { //-------------------------------------------------------------------- // Default constructor diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h index 59e753e239cf9..274dfd6668eaf 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTracker.h @@ -167,7 +167,7 @@ class GPUTRDTracker_t : public GPUProcessor int32_t mNCandidates; // max. 
track hypothesis per layer int32_t mNTracks; // number of TPC tracks to be matched int32_t mNEvents; // number of processed events - int32_t mMaxThreads; // maximum number of supported threads + int32_t mMaxBackendThreads; // maximum number of supported threads // index of first tracklet for each chamber within tracklets array, last entry is total number of tracklets for given collision // the array has (kNChambers + 1) * numberOfCollisions entries // note, that for collision iColl one has to add an offset corresponding to the index of the first tracklet of iColl to the index stored in mTrackletIndexArray diff --git a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx index eb9eecfe6e846..d18f04e554043 100644 --- a/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx +++ b/GPU/GPUTracking/TRDTracking/GPUTRDTrackerKernels.cxx @@ -16,9 +16,8 @@ #include "GPUTRDGeometry.h" #include "GPUConstantMem.h" #include "GPUCommonTypeTraits.h" -#if defined(WITH_OPENMP) && !defined(GPUCA_GPUCODE) -#include "GPUReconstruction.h" -#endif + +#include "GPUReconstructionThreading.h" using namespace o2::gpu; @@ -33,10 +32,9 @@ GPUdii() void GPUTRDTrackerKernels::Thread(int32_t nBlocks, int32_t nThreads, in } } #endif - GPUCA_OPENMP(parallel for if(!trdTracker->GetRec().GetProcessingSettings().ompKernels) num_threads(trdTracker->GetRec().GetProcessingSettings().ompThreads)) - for (int32_t i = get_global_id(0); i < trdTracker->NTracks(); i += get_global_size(0)) { + GPUCA_TBB_KERNEL_LOOP(trdTracker->GetRec(), int32_t, i, trdTracker->NTracks(), { trdTracker->DoTrackingThread(i, get_global_id(0)); - } + }); } #if !defined(GPUCA_GPUCODE) || defined(GPUCA_GPUCODE_DEVICE) // FIXME: DR: WORKAROUND to avoid CUDA bug creating host symbols for device code. 
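For reference: the GPUCA_TBB_KERNEL_LOOP macro used in the merger and TRD tracker hunks above is provided by the new GPUReconstructionThreading.h header, whose contents are not part of this excerpt. A minimal sketch of what such a macro can expand to, hypothetical and simplified (the real header may additionally consult the inKernelParallel / nHostThreads settings through the rec argument, which this sketch ignores):

#if !defined(GPUCA_GPUCODE)
// Host build: run the kernel body as a TBB parallel_for over the full index range.
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, ...) \
  tbb::parallel_for<vartype>(0, (iEnd), [&](vartype varname) __VA_ARGS__)
#else
// Device build: keep the classic grid-stride loop over the GPU threads.
#define GPUCA_TBB_KERNEL_LOOP(rec, vartype, varname, iEnd, ...) \
  for (vartype varname = get_global_id(0); varname < (iEnd); varname += get_global_size(0)) __VA_ARGS__
#endif

The kernel body is taken as __VA_ARGS__ because braces do not protect commas during macro argument splitting; with this shape, the call sites above compile to an ordinary tbb::parallel_for on the host while remaining a grid-stride loop in device code.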
diff --git a/GPU/GPUTracking/display/CMakeLists.txt b/GPU/GPUTracking/display/CMakeLists.txt index 2c1814a1a26a0..68385d7916234 100644 --- a/GPU/GPUTracking/display/CMakeLists.txt +++ b/GPU/GPUTracking/display/CMakeLists.txt @@ -219,7 +219,4 @@ if(GPUCA_EVENT_DISPLAY_QT) target_link_libraries(${targetName} PRIVATE Qt5::Widgets) endif() -if(OpenMP_CXX_FOUND) - target_compile_definitions(${targetName} PRIVATE WITH_OPENMP) - target_link_libraries(${targetName} PRIVATE OpenMP::OpenMP_CXX) -endif() +target_link_libraries(${targetName} PRIVATE TBB::tbb) diff --git a/GPU/GPUTracking/display/GPUDisplay.cxx b/GPU/GPUTracking/display/GPUDisplay.cxx index 918011b85ea04..e42a4fa3e4bf1 100644 --- a/GPU/GPUTracking/display/GPUDisplay.cxx +++ b/GPU/GPUTracking/display/GPUDisplay.cxx @@ -24,9 +24,6 @@ #ifndef _WIN32 #include "../utils/linux_helpers.h" #endif -#ifdef WITH_OPENMP -#include -#endif #include "GPUChainTracking.h" #include "GPUQA.h" diff --git a/GPU/GPUTracking/display/GPUDisplay.h b/GPU/GPUTracking/display/GPUDisplay.h index d6a65f212ecf3..1c4b751bbf85b 100644 --- a/GPU/GPUTracking/display/GPUDisplay.h +++ b/GPU/GPUTracking/display/GPUDisplay.h @@ -207,7 +207,7 @@ class GPUDisplay : public GPUDisplayInterface void DrawTrackITS(int32_t trackId, int32_t iSlice); GPUDisplay::vboList DrawFinalITS(); template - void DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer); + void DrawFinal(int32_t iSlice, int32_t /*iCol*/, const GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer); vboList DrawGrid(const GPUTPCTracker& tracker); vboList DrawGridTRD(int32_t sector); void DoScreenshot(const char* filename, std::vector& pixels, float animateTime = -1.f); diff --git a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx index 764f659d07e64..ca9fd6be01703 100644 --- a/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx +++ b/GPU/GPUTracking/display/helpers/GPUDisplayHelpers.cxx @@ -14,25 +14,20 @@ #include "GPUDisplay.h" -#ifdef WITH_OPENMP -#include -#endif #ifndef _WIN32 #include "bitmapfile.h" #endif +#include "oneapi/tbb.h" + using namespace o2::gpu; int32_t GPUDisplay::getNumThreads() { if (mChain) { - return mChain->GetProcessingSettings().ompThreads; + return mChain->GetProcessingSettings().nHostThreads; } else { -#ifdef WITH_OPENMP - return omp_get_max_threads(); -#else - return 1; -#endif + return tbb::info::default_concurrency(); } } diff --git a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx index 5d4628cf0eb3f..8c42cfa46abb9 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayDraw.cxx @@ -36,9 +36,7 @@ #include "SimulationDataFormat/ConstMCTruthContainer.h" #include "GPUTrackParamConvert.h" -#ifdef WITH_OPENMP -#include -#endif +#include using namespace o2::gpu; @@ -325,7 +323,7 @@ GPUDisplay::vboList GPUDisplay::DrawFinalITS() } template -void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer) +void GPUDisplay::DrawFinal(int32_t iSlice, int32_t /*iCol*/, const GPUTPCGMPropagator* prop, std::array, 2>& trackList, threadVertexBuffer& threadBuffer) { auto& vBuf = threadBuffer.vBuf; auto& buffer = threadBuffer.buffer; @@ -698,15 +696,15 @@ GPUDisplay::vboList GPUDisplay::DrawGridTRD(int32_t sector) if (trdsector >= 9) { alpha -= 
2 * CAMath::Pi(); } - for (int32_t iLy = 0; iLy < GPUTRDTracker::EGPUTRDTracker::kNLayers; ++iLy) { - for (int32_t iStack = 0; iStack < GPUTRDTracker::EGPUTRDTracker::kNStacks; ++iStack) { + for (int32_t iLy = 0; iLy < GPUTRDTracker::EGPUTRDTracker::kNLayers; iLy++) { + for (int32_t iStack = 0; iStack < GPUTRDTracker::EGPUTRDTracker::kNStacks; iStack++) { int32_t iDet = geo->GetDetector(iLy, iStack, trdsector); auto matrix = geo->GetClusterMatrix(iDet); if (!matrix) { continue; } auto pp = geo->GetPadPlane(iDet); - for (int32_t i = 0; i < pp->GetNrows(); ++i) { + for (int32_t i = 0; i < pp->GetNrows(); i++) { float xyzLoc1[3]; float xyzLoc2[3]; float xyzGlb1[3]; @@ -776,26 +774,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLFinal[iSlice].resize(mNCollissions); } } - GPUCA_OPENMP(parallel num_threads(getNumThreads())) - { -#ifdef WITH_OPENMP - int32_t numThread = omp_get_thread_num(); - int32_t numThreads = omp_get_num_threads(); -#else - int32_t numThread = 0, numThreads = 1; -#endif + int32_t numThreads = getNumThreads(); + tbb::task_arena(numThreads).execute([&] { if (mChain && (mChain->GetRecoSteps() & GPUDataTypes::RecoStep::TPCSliceTracking)) { - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { GPUTPCTracker& tracker = (GPUTPCTracker&)sliceTracker(iSlice); tracker.SetPointersDataLinks(tracker.LinkTmpMemory()); mGlDLLines[iSlice][tINITLINK] = DrawLinks(tracker, tINITLINK, true); - tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); - } - GPUCA_OPENMP(barrier) + tracker.SetPointersDataLinks(mChain->rec()->Res(tracker.MemoryResLinks()).Ptr()); // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); mGlDLLines[iSlice][tLINK] = DrawLinks(tracker, tLINK); @@ -805,30 +794,28 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() mGlDLGrid[iSlice] = DrawGrid(tracker); if (iSlice < NSLICES / 2) { mGlDLGridTRD[iSlice] = DrawGridTRD(iSlice); - } - } - GPUCA_OPENMP(barrier) + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { const GPUTPCTracker& tracker = sliceTracker(iSlice); - mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); - } - GPUCA_OPENMP(barrier) + mGlDLLines[iSlice][tEXTRAPOLATEDTRACK] = DrawTracks(tracker, 1); // clang-format off + }, tbb::simple_partitioner()); // clang-format on } - mThreadTracks[numThread].resize(mNCollissions); - for (int32_t i = 0; i < mNCollissions; i++) { - for (int32_t j = 0; j < NSLICES; j++) { - for (int32_t k = 0; k < 2; k++) { - mThreadTracks[numThread][i][j][k].clear(); + tbb::parallel_for(0, numThreads, [&](int32_t iThread) { + mThreadTracks[iThread].resize(mNCollissions); + for (int32_t i = 0; i < mNCollissions; i++) { + for (int32_t j = 0; j < NSLICES; j++) { + for (int32_t k = 0; k < 2; k++) { + mThreadTracks[iThread][i][j][k].clear(); + } } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (mConfig.showTPCTracksFromO2Format) { #ifdef GPUCA_TPC_GEOMETRY_O2 uint32_t col = 0; - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nOutputTracksTPCO2; i++) { + tbb::parallel_for(0, mIOPtrs->nOutputTracksTPCO2, [&](auto i) { uint8_t 
sector, row; if (mIOPtrs->clustersNative) { mIOPtrs->outputTracksTPCO2[i].getCluster(mIOPtrs->outputClusRefsTPCO2, 0, *mIOPtrs->clustersNative, sector, row); @@ -838,18 +825,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() if (mQA && mIOPtrs->outputTracksTPCO2MC) { col = mQA->GetMCLabelCol(mIOPtrs->outputTracksTPCO2MC[i]); } - mThreadTracks[numThread][col][sector][0].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][sector][0].emplace_back(i); + }); #endif } else { - GPUCA_OPENMP(for) - for (uint32_t i = 0; i < mIOPtrs->nMergedTracks; i++) { + tbb::parallel_for(0, mIOPtrs->nMergedTracks, [&](auto i) { const GPUTPCGMMergedTrack* track = &mIOPtrs->mergedTracks[i]; if (track->NClusters() == 0) { - continue; + return; } if (mCfgH.hideRejectedTracks && !track->OK()) { - continue; + return; } int32_t slice = mIOPtrs->mergedTrackHits[track->FirstClusterRef() + track->NClusters() - 1].slice; uint32_t col = 0; @@ -863,18 +849,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() } #endif } - mThreadTracks[numThread][col][slice][0].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][slice][0].emplace_back(i); + }); } for (uint32_t col = 0; col < mIOPtrs->nMCInfosTPCCol; col++) { - GPUCA_OPENMP(for) - for (uint32_t i = mIOPtrs->mcInfosTPCCol[col].first; i < mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num; i++) { + tbb::parallel_for(mIOPtrs->mcInfosTPCCol[col].first, mIOPtrs->mcInfosTPCCol[col].first + mIOPtrs->mcInfosTPCCol[col].num, [&](uint32_t i) { const GPUTPCMCInfo& mc = mIOPtrs->mcInfosTPC[i]; if (mc.charge == 0.f) { - continue; + return; } if (mc.pid < 0) { - continue; + return; } float alpha = atan2f(mc.y, mc.x); @@ -885,18 +870,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() if (mc.z < 0) { slice += 18; } - mThreadTracks[numThread][col][slice][1].emplace_back(i); - } + mThreadTracks[GPUReconstruction::getHostThreadIndex()][col][slice][1].emplace_back(i); + }); } - GPUCA_OPENMP(barrier) GPUTPCGMPropagator prop; prop.SetMaxSinPhi(.999); prop.SetMaterialTPC(); prop.SetPolynomialField(&mParam->polynomialField); - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { + int32_t numThread = GPUReconstruction::getHostThreadIndex(); for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mThreadBuffers[numThread].clear(); for (int32_t iSet = 0; iSet < numThreads; iSet++) { @@ -915,19 +899,17 @@ size_t GPUDisplay::DrawGLScene_updateVertexList() } list[i] = vboList(startCount, mVertexBufferStart[iSlice].size() - startCount, iSlice); } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on - GPUCA_OPENMP(barrier) - GPUCA_OPENMP(for) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { + tbb::parallel_for(0, NSLICES, [&](int32_t iSlice) { for (int32_t i = 0; i < N_POINTS_TYPE_TPC; i++) { for (int32_t iCol = 0; iCol < mNCollissions; iCol++) { mGlDLPoints[iSlice][i][iCol] = DrawClusters(iSlice, i, iCol); } - } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on + }); // End of parallel section mGlDLFinalITS = DrawFinalITS(); diff --git a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx index aaa03b8a24d18..f53fa185029f8 100644 --- a/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx +++ b/GPU/GPUTracking/display/render/GPUDisplayImportEvent.cxx @@ -33,6 +33,8 @@ #include "ITSMFTBase/DPLAlpideParam.h" 
#endif +#include <oneapi/tbb.h> + using namespace o2::gpu; void GPUDisplay::DrawGLScene_updateEventData() { @@ -126,103 +128,107 @@ void GPUDisplay::DrawGLScene_updateEventData() } mUpdateTrackFilter = false; - mMaxClusterZ = 0; - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ)) - for (int32_t iSlice = 0; iSlice < NSLICES; iSlice++) { - int32_t row = 0; - uint32_t nCls = mParam->par.earlyTpcTransform ? mIOPtrs->nClusterData[iSlice] : mIOPtrs->clustersNative ? mIOPtrs->clustersNative->nClustersSector[iSlice] - : 0; - for (uint32_t i = 0; i < nCls; i++) { - int32_t cid; - if (mParam->par.earlyTpcTransform) { - const auto& cl = mIOPtrs->clusterData[iSlice][i]; - cid = cl.id; - row = cl.row; - } else { - cid = mIOPtrs->clustersNative->clusterOffset[iSlice][0] + i; - while (row < GPUCA_ROW_COUNT - 1 && mIOPtrs->clustersNative->clusterOffset[iSlice][row + 1] <= (uint32_t)cid) { - row++; + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, NSLICES, 1), float(0.f), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + for (int32_t iSlice = r.begin(); iSlice < r.end(); iSlice++) { + int32_t row = 0; + uint32_t nCls = mParam->par.earlyTpcTransform ? mIOPtrs->nClusterData[iSlice] : (mIOPtrs->clustersNative ? mIOPtrs->clustersNative->nClustersSector[iSlice] : 0); + for (uint32_t i = 0; i < nCls; i++) { + int32_t cid; + if (mParam->par.earlyTpcTransform) { + const auto& cl = mIOPtrs->clusterData[iSlice][i]; + cid = cl.id; + row = cl.row; + } else { + cid = mIOPtrs->clustersNative->clusterOffset[iSlice][0] + i; + while (row < GPUCA_ROW_COUNT - 1 && mIOPtrs->clustersNative->clusterOffset[iSlice][row + 1] <= (uint32_t)cid) { + row++; + } } - } - if (cid >= mNMaxClusters) { - throw std::runtime_error("Cluster Buffer Size exceeded"); - } - float4* ptr = &mGlobalPos[cid]; - if (mParam->par.earlyTpcTransform) { - const auto& cl = mIOPtrs->clusterData[iSlice][i]; - mParam->Slice2Global(iSlice, (mCfgH.clustersOnNominalRow ? mParam->tpcGeometry.Row2X(row) : cl.x) + mCfgH.xAdd, cl.y, cl.z, &ptr->x, &ptr->y, &ptr->z); - } else { - float x, y, z; - const auto& cln = mIOPtrs->clustersNative->clusters[iSlice][0][i]; - GPUTPCConvertImpl::convert(*mCalib->fastTransform, *mParam, iSlice, row, cln.getPad(), cln.getTime(), x, y, z); - if (mCfgH.clustersOnNominalRow) { - x = mParam->tpcGeometry.Row2X(row); + if (cid >= mNMaxClusters) { + throw std::runtime_error("Cluster Buffer Size exceeded"); + } + float4* ptr = &mGlobalPos[cid]; + if (mParam->par.earlyTpcTransform) { + const auto& cl = mIOPtrs->clusterData[iSlice][i]; + mParam->Slice2Global(iSlice, (mCfgH.clustersOnNominalRow ? mParam->tpcGeometry.Row2X(row) : cl.x) + mCfgH.xAdd, cl.y, cl.z, &ptr->x, &ptr->y, &ptr->z); + } else { + float x, y, z; + const auto& cln = mIOPtrs->clustersNative->clusters[iSlice][0][i]; + GPUTPCConvertImpl::convert(*mCalib->fastTransform, *mParam, iSlice, row, cln.getPad(), cln.getTime(), x, y, z); + if (mCfgH.clustersOnNominalRow) { + x = mParam->tpcGeometry.Row2X(row); + } + mParam->Slice2Global(iSlice, x + mCfgH.xAdd, y, z, &ptr->x, &ptr->y, &ptr->z); } - mParam->Slice2Global(iSlice, x + mCfgH.xAdd, y, z, &ptr->x, &ptr->y, &ptr->z); + + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->z += iSlice < 18 ? 
mCfgH.zAdd : -mCfgH.zAdd; + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tCLUSTER; + } + } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }, tbb::simple_partitioner()); // clang-format on - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, mCurrentSpacePointsTRD, 32), float(mMaxClusterZ), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + int32_t trdTriggerRecord = -1; + float trdZoffset = 0; + for (int32_t i = r.begin(); i < r.end(); i++) { + while (mParam->par.continuousTracking && trdTriggerRecord < (int32_t)mIOPtrs->nTRDTriggerRecords - 1 && mIOPtrs->trdTrackletIdxFirst[trdTriggerRecord + 1] <= i) { + trdTriggerRecord++; // Note: this requires going through the data in order + float trdTime = mIOPtrs->trdTriggerTimes[trdTriggerRecord] * 1e3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; + trdZoffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, trdTime, mParam->continuousMaxTimeBin)); + } + const auto& sp = mIOPtrs->trdSpacePoints[i]; + int32_t iSec = trdGeometry()->GetSector(mIOPtrs->trdTracklets[i].GetDetector()); + float4* ptr = &mGlobalPosTRD[i]; + mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd, sp.getY(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); + ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); } - ptr->z += iSlice < 18 ? mCfgH.zAdd : -mCfgH.zAdd; ptr->x *= GL_SCALE_FACTOR; ptr->y *= GL_SCALE_FACTOR; ptr->z *= GL_SCALE_FACTOR; - ptr->w = tCLUSTER; - } - } - - int32_t trdTriggerRecord = -1; - float trdZoffset = 0; - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ) firstprivate(trdTriggerRecord, trdZoffset)) - for (int32_t i = 0; i < mCurrentSpacePointsTRD; i++) { - while (mParam->par.continuousTracking && trdTriggerRecord < (int32_t)mIOPtrs->nTRDTriggerRecords - 1 && mIOPtrs->trdTrackletIdxFirst[trdTriggerRecord + 1] <= i) { - trdTriggerRecord++; - float trdTime = mIOPtrs->trdTriggerTimes[trdTriggerRecord] * 1e3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; - trdZoffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, trdTime, mParam->continuousMaxTimeBin)); - } - const auto& sp = mIOPtrs->trdSpacePoints[i]; - int32_t iSec = trdGeometry()->GetSector(mIOPtrs->trdTracklets[i].GetDetector()); - float4* ptr = &mGlobalPosTRD[i]; - mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd, sp.getY(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); - ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); - } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTRDCLUSTER; - ptr = &mGlobalPosTRD2[i]; - mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd + 4.5f, sp.getY() + 1.5f * sp.getDy(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); - ptr->z += ptr->z > 0 ? trdZoffset : -trdZoffset; - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + ptr->w = tTRDCLUSTER; + ptr = &mGlobalPosTRD2[i]; + mParam->Slice2Global(iSec, sp.getX() + mCfgH.xAdd + 4.5f, sp.getY() + 1.5f * sp.getDy(), sp.getZ(), &ptr->x, &ptr->y, &ptr->z); + ptr->z += ptr->z > 0 ? 
trdZoffset : -trdZoffset; + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tTRDCLUSTER; + } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTRDCLUSTER; - } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }, tbb::static_partitioner()); // clang-format on - GPUCA_OPENMP(parallel for num_threads(getNumThreads()) reduction(max : mMaxClusterZ)) - for (int32_t i = 0; i < mCurrentClustersTOF; i++) { - float4* ptr = &mGlobalPosTOF[i]; - mParam->Slice2Global(mIOPtrs->tofClusters[i].getSector(), mIOPtrs->tofClusters[i].getX() + mCfgH.xAdd, mIOPtrs->tofClusters[i].getY(), mIOPtrs->tofClusters[i].getZ(), &ptr->x, &ptr->y, &ptr->z); - float ZOffset = 0; - if (mParam->par.continuousTracking) { - float tofTime = mIOPtrs->tofClusters[i].getTime() * 1e-3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; - ZOffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, tofTime, mParam->continuousMaxTimeBin)); - ptr->z += ptr->z > 0 ? ZOffset : -ZOffset; - } - if (fabsf(ptr->z) > mMaxClusterZ) { - mMaxClusterZ = fabsf(ptr->z); + mMaxClusterZ = tbb::parallel_reduce(tbb::blocked_range<int32_t>(0, mCurrentClustersTOF, 32), float(mMaxClusterZ), [&](const tbb::blocked_range<int32_t>& r, float maxClusterZ) { + for (int32_t i = r.begin(); i < r.end(); i++) { + float4* ptr = &mGlobalPosTOF[i]; + mParam->Slice2Global(mIOPtrs->tofClusters[i].getSector(), mIOPtrs->tofClusters[i].getX() + mCfgH.xAdd, mIOPtrs->tofClusters[i].getY(), mIOPtrs->tofClusters[i].getZ(), &ptr->x, &ptr->y, &ptr->z); + float ZOffset = 0; + if (mParam->par.continuousTracking) { + float tofTime = mIOPtrs->tofClusters[i].getTime() * 1e-3 / o2::constants::lhc::LHCBunchSpacingNS / o2::tpc::constants::LHCBCPERTIMEBIN; + ZOffset = fabsf(mCalib->fastTransformHelper->getCorrMap()->convVertexTimeToZOffset(0, tofTime, mParam->continuousMaxTimeBin)); + ptr->z += ptr->z > 0 ? 
ZOffset : -ZOffset; + } + if (fabsf(ptr->z) > maxClusterZ) { + maxClusterZ = fabsf(ptr->z); + } + ptr->x *= GL_SCALE_FACTOR; + ptr->y *= GL_SCALE_FACTOR; + ptr->z *= GL_SCALE_FACTOR; + ptr->w = tTOFCLUSTER; } - ptr->x *= GL_SCALE_FACTOR; - ptr->y *= GL_SCALE_FACTOR; - ptr->z *= GL_SCALE_FACTOR; - ptr->w = tTOFCLUSTER; - } + return maxClusterZ; // clang-format off + }, [](const float a, const float b) { return std::max(a, b); }); // clang-format on if (mCurrentClustersITS) { float itsROFhalfLen = 0; diff --git a/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h b/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h index 63673505f4732..23d382466ba22 100644 --- a/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h +++ b/GPU/GPUTracking/display/shaders/GPUDisplayShaders.h @@ -458,7 +458,7 @@ const float positionScale = 100.0f; void main() { vec3 position = gl_in[0].gl_Position.xyz; - for(uint32_t i = 0; i < field_config.StepCount; ++i) { + for(uint32_t i = 0; i < field_config.StepCount; i++) { gl_Position = um.ModelViewProj * vec4(position/positionScale, 1.0f); EmitVertex(); const vec3 b_vec = Field(position); diff --git a/GPU/GPUTracking/qa/GPUQA.cxx b/GPU/GPUTracking/qa/GPUQA.cxx index 70a093c7f1de7..015159fee24d7 100644 --- a/GPU/GPUTracking/qa/GPUQA.cxx +++ b/GPU/GPUTracking/qa/GPUQA.cxx @@ -76,6 +76,8 @@ #include "utils/qconfig.h" #include "utils/timer.h" +#include + using namespace o2::gpu; #ifdef GPUCA_MERGER_BY_MC_LABEL @@ -919,49 +921,48 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } #endif } else { - auto acc = GPUTPCTrkLbl(GetClusterLabels(), 1.f - mConfig.recThreshold); -#if QA_DEBUG == 0 - GPUCA_OPENMP(parallel for firstprivate(acc)) -#endif - for (uint32_t i = 0; i < nReconstructedTracks; i++) { - acc.reset(); - int32_t nClusters = 0; - const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[i]; - std::vector labels; - for (uint32_t k = 0; k < track.NClusters(); k++) { - if (mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].state & GPUTPCGMMergedTrackHit::flagReject) { - continue; - } - nClusters++; - uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; - if (hitId >= GetNMCLabels()) { - GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); - throw std::runtime_error("qa error"); - } - acc.addLabel(hitId); - for (int32_t j = 0; j < GetMCLabelNID(hitId); j++) { - if (GetMCLabelID(hitId, j) >= (int32_t)GetNMCTracks(GetMCLabelCol(hitId, j))) { - GPUError("Invalid label %d > %d (hit %d, label %d, col %d)", GetMCLabelID(hitId, j), GetNMCTracks(GetMCLabelCol(hitId, j)), hitId, j, (int32_t)GetMCLabelCol(hitId, j)); + tbb::parallel_for(tbb::blocked_range(0, nReconstructedTracks, (QA_DEBUG == 0) ? 
32 : nReconstructedTracks), [&](const tbb::blocked_range& range) { + auto acc = GPUTPCTrkLbl(GetClusterLabels(), 1.f - mConfig.recThreshold); + for (auto i = range.begin(); i < range.end(); i++) { + acc.reset(); + int32_t nClusters = 0; + const GPUTPCGMMergedTrack& track = mTracking->mIOPtrs.mergedTracks[i]; + std::vector labels; + for (uint32_t k = 0; k < track.NClusters(); k++) { + if (mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].state & GPUTPCGMMergedTrackHit::flagReject) { + continue; + } + nClusters++; + uint32_t hitId = mTracking->mIOPtrs.mergedTrackHits[track.FirstClusterRef() + k].num; + if (hitId >= GetNMCLabels()) { + GPUError("Invalid hit id %u > %d (nClusters %d)", hitId, GetNMCLabels(), mTracking->mIOPtrs.clustersNative ? mTracking->mIOPtrs.clustersNative->nClustersTotal : 0); throw std::runtime_error("qa error"); } - if (GetMCLabelID(hitId, j) >= 0) { - if (QA_DEBUG >= 3 && track.OK()) { - GPUInfo("Track %d Cluster %u Label %d: %d (%f)", i, k, j, GetMCLabelID(hitId, j), GetMCLabelWeight(hitId, j)); + acc.addLabel(hitId); + for (int32_t j = 0; j < GetMCLabelNID(hitId); j++) { + if (GetMCLabelID(hitId, j) >= (int32_t)GetNMCTracks(GetMCLabelCol(hitId, j))) { + GPUError("Invalid label %d > %d (hit %d, label %d, col %d)", GetMCLabelID(hitId, j), GetNMCTracks(GetMCLabelCol(hitId, j)), hitId, j, (int32_t)GetMCLabelCol(hitId, j)); + throw std::runtime_error("qa error"); + } + if (GetMCLabelID(hitId, j) >= 0) { + if (QA_DEBUG >= 3 && track.OK()) { + GPUInfo("Track %d Cluster %u Label %d: %d (%f)", i, k, j, GetMCLabelID(hitId, j), GetMCLabelWeight(hitId, j)); + } } } } - } - float maxweight, sumweight; - int32_t maxcount; - auto maxLabel = acc.computeLabel(&maxweight, &sumweight, &maxcount); - mTrackMCLabels[i] = maxLabel; - if (QA_DEBUG && track.OK() && GetNMCTracks(maxLabel) > (uint32_t)maxLabel.getTrackID()) { - const mcInfo_t& mc = GetMCTrack(maxLabel); - GPUInfo("Track %d label %d (fake %d) weight %f clusters %d (fitted %d) (%f%% %f%%) Pt %f", i, maxLabel.getTrackID(), (int32_t)(maxLabel.isFake()), maxweight, nClusters, track.NClustersFitted(), 100.f * maxweight / sumweight, 100.f * (float)maxcount / (float)nClusters, - std::sqrt(mc.pX * mc.pX + mc.pY * mc.pY)); + float maxweight, sumweight; + int32_t maxcount; + auto maxLabel = acc.computeLabel(&maxweight, &sumweight, &maxcount); + mTrackMCLabels[i] = maxLabel; + if (QA_DEBUG && track.OK() && GetNMCTracks(maxLabel) > (uint32_t)maxLabel.getTrackID()) { + const mcInfo_t& mc = GetMCTrack(maxLabel); + GPUInfo("Track %d label %d (fake %d) weight %f clusters %d (fitted %d) (%f%% %f%%) Pt %f", i, maxLabel.getTrackID(), (int32_t)(maxLabel.isFake()), maxweight, nClusters, track.NClustersFitted(), 100.f * maxweight / sumweight, 100.f * (float)maxcount / (float)nClusters, + std::sqrt(mc.pX * mc.pX + mc.pY * mc.pY)); + } } - } + }); } if (QA_TIMING || (mTracking && mTracking->GetProcessingSettings().debugLevel >= 3)) { GPUInfo("QA Time: Assign Track Labels:\t\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); @@ -1135,8 +1136,7 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx } // Compute MC Track Parameters for MC Tracks - GPUCA_OPENMP(parallel for) - for (uint32_t iCol = 0; iCol < GetNMCCollissions(); iCol++) { + tbb::parallel_for(0, GetNMCCollissions(), [&](auto iCol) { for (uint32_t i = 0; i < GetNMCTracks(iCol); i++) { const mcInfo_t& info = GetMCTrack(i, iCol); additionalMCParameters& mc2 = mMCParam[iCol][i]; @@ -1153,8 +1153,8 @@ void GPUQA::RunQA(bool matchOnly, const std::vector* tracksEx 
std::vector& effBuffer = mcEffBuffer[mNEvents - 1]; effBuffer[i] = mRecTracks[iCol][i] * 1000 + mFakeTracks[iCol][i]; } - } - } + } // clang-format off + }, tbb::simple_partitioner()); // clang-format on if (QA_TIMING || (mTracking && mTracking->GetProcessingSettings().debugLevel >= 3)) { GPUInfo("QA Time: Compute track mc parameters:\t%6.0f us", timer.GetCurrentElapsedTime(true) * 1e6); }
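Two of the recurring conversion patterns in this patch, distilled into self-contained form. First, the outer-loop replacement used throughout GPUChainTrackingClusterizer.cxx and GPUChainTrackingSliceTracker.cxx: a temporary tbb::task_arena caps how many lanes may run concurrently (the role of the value returned by SetAndGetNActiveThreadsOuterLoop), tbb::parallel_for replaces the former "#pragma omp parallel for num_threads(...)", and tbb::simple_partitioner forces the range to be split down to one iteration per task. The snippet below is an illustration only, not code from the patch; processLane and the two constants are made up:

#include <oneapi/tbb.h>
#include <cstdint>
#include <cstdio>

static void processLane(int32_t lane) { std::printf("processing lane %d\n", lane); }

int main()
{
  const int32_t maxLane = 6; // number of lanes to process, like the clusterizer lanes
  const int32_t nActive = 3; // concurrency cap, like SetAndGetNActiveThreadsOuterLoop's result
  tbb::task_arena(nActive).execute([&] {
    tbb::parallel_for(0, maxLane, [&](int32_t lane) {
      processLane(lane); // note: "continue" in the old for-loops becomes "return" in the lambda
    }, tbb::simple_partitioner());
  });
  return 0;
}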
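Second, the reduction replacement used in GPUDisplayImportEvent.cxx: an OpenMP "reduction(max : mMaxClusterZ)" loop becomes tbb::parallel_reduce, where each subrange folds a local running maximum and the join step takes the larger of two partial results. Again a stand-alone sketch with made-up data, not code from the patch:

#include <oneapi/tbb.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> z(1000);
  for (size_t i = 0; i < z.size(); i++) {
    z[i] = std::sin(0.1f * i) * (i % 37); // arbitrary test data
  }
  float maxZ = tbb::parallel_reduce(
    tbb::blocked_range<size_t>(0, z.size(), 32), 0.f,
    [&](const tbb::blocked_range<size_t>& r, float m) {
      for (size_t i = r.begin(); i < r.end(); i++) {
        m = std::max(m, std::fabs(z[i])); // per-subrange running maximum
      }
      return m; // partial result for this subrange
    },
    [](float a, float b) { return std::max(a, b); }); // join two partial maxima
  std::printf("max |z| = %f\n", maxZ);
  return 0;
}

One subtlety visible in the TRD loop above: state that used to be firstprivate (trdTriggerRecord, trdZoffset) is now re-derived inside each subrange by scanning forward from its start, so the result appears not to depend on how TBB partitions the range; the in-order comment in that hunk refers to this.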